提交 dfce491c 编写于 作者: T TomorrowIsAnOtherDay

Merge branch 'develop' into CN_docs

from __future__ import absolute_import from __future__ import absolute_import
from __future__ import print_function from __future__ import print_function
from __future__ import unicode_literals
import argparse import argparse
import io, re import io, re
......
...@@ -18,3 +18,7 @@ ...@@ -18,3 +18,7 @@
FROM parl/parl-test:cuda9.0-cudnn7-v2 FROM parl/parl-test:cuda9.0-cudnn7-v2
COPY ./requirements.txt /root/ COPY ./requirements.txt /root/
RUN apt-get install -y libgflags-dev libgoogle-glog-dev libomp-dev unzip
RUN apt-get install -y libgtest-dev && cd /usr/src/gtest && mkdir build \
&& cd build && cmake .. && make && cp libgtest*.a /usr/local/lib
...@@ -69,7 +69,7 @@ function run_test_with_gpu() { ...@@ -69,7 +69,7 @@ function run_test_with_gpu() {
Running unit tests with GPU... Running unit tests with GPU...
======================================== ========================================
EOF EOF
ctest --output-on-failure -j10 ctest --output-on-failure -j20 --verbose
cd ${REPO_ROOT} cd ${REPO_ROOT}
rm -rf ${REPO_ROOT}/build rm -rf ${REPO_ROOT}/build
} }
...@@ -90,7 +90,7 @@ function run_test_with_cpu() { ...@@ -90,7 +90,7 @@ function run_test_with_cpu() {
===================================================== =====================================================
EOF EOF
if [ $# -eq 1 ];then if [ $# -eq 1 ];then
ctest --output-on-failure -j10 ctest --output-on-failure -j20 --verbose
else else
ctest --output-on-failure ctest --output-on-failure
fi fi
...@@ -145,7 +145,8 @@ function main() { ...@@ -145,7 +145,8 @@ function main() {
;; ;;
test) test)
# test code compability in environments with various python versions # test code compability in environments with various python versions
declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37") #declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37")
declare -a envs=("py27" "py36")
for env in "${envs[@]}";do for env in "${envs[@]}";do
cd /work cd /work
source ~/.bashrc source ~/.bashrc
...@@ -169,6 +170,10 @@ function main() { ...@@ -169,6 +170,10 @@ function main() {
pip install -r .teamcity/requirements_torch.txt pip install -r .teamcity/requirements_torch.txt
run_test_with_cpu $env "DIS_TESTING_TORCH" run_test_with_cpu $env "DIS_TESTING_TORCH"
fi fi
# clean env
export LC_ALL=C.UTF-8
export LANG=C.UTF-8
xparl stop
done done
run_test_with_gpu run_test_with_gpu
......
...@@ -3,4 +3,3 @@ paddlepaddle-gpu==1.6.1.post97 ...@@ -3,4 +3,3 @@ paddlepaddle-gpu==1.6.1.post97
gym gym
details details
parameterized parameterized
timeout_decorator
...@@ -2,4 +2,3 @@ ...@@ -2,4 +2,3 @@
gym gym
details details
parameterized parameterized
timeout_decorator
...@@ -37,7 +37,8 @@ if __name__ == '__main__': ...@@ -37,7 +37,8 @@ if __name__ == '__main__':
exclude_examples = [ exclude_examples = [
'NeurIPS2019-Learn-to-Move-Challenge', 'NeurIPS2019-Learn-to-Move-Challenge',
'NeurIPS2018-AI-for-Prosthetics-Challenge', 'EagerMode' 'NeurIPS2018-AI-for-Prosthetics-Challenge', 'LiftSim_baseline',
'EagerMode'
] ]
for example in os.listdir('../examples/'): for example in os.listdir('../examples/'):
if example not in exclude_examples: if example not in exclude_examples:
......
#!/usr/bin/env bash
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: You need to install mingw-cmake before running this script.
# Set up the globals used by the rest of the script: ANSI escape sequences
# (stored as literal backslash text because of the single quotes) and the
# repository root, taken to be the directory the script is started from.
function init() {
    RED='\033[0;31m'
    BLUE='\033[0;34m'
    BOLD='\033[1m'
    NONE='\033[0m'

    REPO_ROOT=$(pwd)
}
# Print the code-style failure notice on stderr and terminate with status 1.
function abort(){
    {
        echo "Your change doesn't follow PaddlePaddle's code style."
        echo "Please use pre-commit to check what is wrong."
    } >&2
    exit 1
}
# Configure a fresh build directory with CMake (MinGW generator), run the unit
# tests through ctest with CUDA devices hidden, then delete the build directory.
#
# Arguments:
#   $1  label of the environment, only used in the banner that is printed
#   $2  (optional) name of a CMake switch; it is passed to cmake as "-$2=ON"
#       and, when given, ctest is run without the -j10 parallelism.
#
# Requires REPO_ROOT to be set (see init).
function run_test_with_cpu() {
    # Refuse to continue with an empty REPO_ROOT: otherwise the paths below
    # would collapse to "/build" and the final rm -rf would hit the wrong place.
    : "${REPO_ROOT:?REPO_ROOT is not set; call init first}"

    export CUDA_VISIBLE_DEVICES="-1"

    # Quoted so that a repository path containing spaces still works.
    mkdir -p "${REPO_ROOT}/build"
    cd "${REPO_ROOT}/build"
    if [ $# -eq 1 ];then
        cmake -G "MinGW Makefiles" ..
    else
        cmake -G "MinGW Makefiles" .. "-$2=ON"
    fi
    cat <<EOF
=====================================================
Running unit tests with CPU in the environment: $1
=====================================================
EOF
    if [ $# -eq 1 ];then
        ctest --output-on-failure -j10
    else
        ctest --output-on-failure
    fi
    cd "${REPO_ROOT}"
    rm -rf "${REPO_ROOT:?}/build"
}
# Entry point for running the unit tests on a local Windows machine.
#
# Installs the package from the current directory, then runs the test suite
# three times via run_test_with_cpu: once plainly, once with the
# DIS_TESTING_SERIALLY switch, and once with DIS_TESTING_TORCH after swapping
# paddlepaddle for the CPU builds of torch/torchvision.
# The script stops at the first failing command (set -e).
function main() {
    set -e
    # First CLI argument; not used by the steps below.
    local CMD="${1:-}"

    init
    # Placeholder label: run_test_with_cpu only prints it in its banner.
    env="unused_variable"
    # run unittest in windows (used in local machine)
    pip install -i https://pypi.tuna.tsinghua.edu.cn/simple .
    pip uninstall -y torch torchvision
    pip install -i https://pypi.tuna.tsinghua.edu.cn/simple paddlepaddle==1.6.1 gym details parameterized
    run_test_with_cpu "$env"
    run_test_with_cpu "$env" "DIS_TESTING_SERIALLY"
    pip uninstall -y paddlepaddle
    pip install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
    run_test_with_cpu "$env" "DIS_TESTING_TORCH"
}

# "$@" (quoted) forwards every argument unchanged, even ones containing spaces.
main "$@"
...@@ -33,6 +33,7 @@ function(py_test TARGET_NAME) ...@@ -33,6 +33,7 @@ function(py_test TARGET_NAME)
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND python -u ${py_test_SRCS} ${py_test_ARGS} COMMAND python -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 300)
endfunction() endfunction()
function(import_test TARGET_NAME) function(import_test TARGET_NAME)
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
</p> </p>
[English](./README.md) | 简体中文 [English](./README.md) | 简体中文
[**文档**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md) [**文档**](https://parl.readthedocs.io/en/stable/index.html)
> PARL 是一个高性能、灵活的强化学习框架。 > PARL 是一个高性能、灵活的强化学习框架。
# 特点 # 特点
...@@ -61,7 +61,7 @@ ans = agent.sum(1,5) # run remotely and not comsume any local computation resour ...@@ -61,7 +61,7 @@ ans = agent.sum(1,5) # run remotely and not comsume any local computation resour
# 安装: # 安装:
### 依赖 ### 依赖
- Python 2.7 or 3.5+. - Python 2.7 or 3.5+. (**Windows系统**目前仅支持python3.6+以上的环境)
- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**非必须的**,如果你只用并行部分的接口不需要安装paddle) - [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**非必须的**,如果你只用并行部分的接口不需要安装paddle)
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
</p> </p>
English | [简体中文](./README.cn.md) English | [简体中文](./README.cn.md)
[**Documentation**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md) [**Documentation**](https://parl.readthedocs.io/en/stable/index.html)
> PARL is a flexible and high-efficient reinforcement learning framework. > PARL is a flexible and high-efficient reinforcement learning framework.
...@@ -64,7 +64,7 @@ For users, they can write code in a simple way, just like writing multi-thread c ...@@ -64,7 +64,7 @@ For users, they can write code in a simple way, just like writing multi-thread c
# Install: # Install:
### Dependencies ### Dependencies
- Python 2.7 or 3.5+. - Python 2.7 or 3.5+ (On **Windows**, PARL only supports environments with Python 3.6+).
- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**Optional**, if you only want to use APIs related to parallelization alone) - [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**Optional**, if you only want to use APIs related to parallelization alone)
......
# Third party code
#
# The following code are copied or modified from:
# https://github.com/suragnair/alpha-zero-general
from tqdm import tqdm
from parl.utils import logger
class Arena():
    """
    An Arena class where any 2 agents can be pit against each other.
    """

    def __init__(self, player1, player2, game, display=None):
        """
        Input:
            player 1,2: two functions that takes board as input, return action
            game: Game object
            display: a function that takes board as input and prints it (e.g.
                     display in othello/OthelloGame). Is necessary for verbose
                     mode.

        see othello/OthelloPlayers.py for an example. See pit.py for pitting
        human players/other baselines with each other.
        """
        self.player1 = player1
        self.player2 = player2
        self.game = game
        self.display = display

    def playGame(self, verbose=False):
        """
        Executes one episode of a game; player1 always moves first.

        Args:
            verbose: when true, print every turn and show the board through
                     self.display (which must then be set).

        Returns:
            either
                winner: player who won the game (1 if player1, -1 if player2)
            or
                draw result returned from the game that is neither 1, -1, nor 0.

        Raises:
            AssertionError: if a player returns an action that the game marks
                            as invalid, or verbose is set without a display.
        """
        # curPlayer is +1 or -1, so curPlayer + 1 indexes this list at 2 or 0.
        players = [self.player2, None, self.player1]
        curPlayer = 1
        board = self.game.getInitBoard()
        it = 0
        while self.game.getGameEnded(board, curPlayer) == 0:
            it += 1
            if verbose:
                assert self.display
                print("Turn ", str(it), "Player ", str(curPlayer))
                self.display(board)
            action = players[curPlayer + 1](self.game.getCanonicalForm(
                board, curPlayer))

            # The canonical board is requested again rather than reused, so
            # the validity check does not depend on what the player did with
            # the board it was handed.
            valids = self.game.getValidMoves(
                self.game.getCanonicalForm(board, curPlayer), 1)

            if valids[action] == 0:
                logger.error('Action {} is not valid!'.format(action))
                logger.debug('valids = {}'.format(valids))
                assert valids[action] > 0
            board, curPlayer = self.game.getNextState(board, curPlayer, action)
        if verbose:
            assert self.display
            print("Game over: Turn ", str(it), "Result ",
                  str(self.game.getGameEnded(board, 1)))
            self.display(board)
        return curPlayer * self.game.getGameEnded(board, curPlayer)

    def playGames(self, num, verbose=False):
        """
        Plays num games in which player1 starts num/2 games and player2 starts
        num/2 games (an odd num is rounded down to the nearest even number).

        Returns:
            oneWon: games won by player1
            twoWon: games won by player2
            draws:  games won by nobody
        """
        num = int(num / 2)
        oneWon = 0
        twoWon = 0
        draws = 0
        for _ in tqdm(range(num), desc="Arena.playGames (1)"):
            gameResult = self.playGame(verbose=verbose)
            if gameResult == 1:
                oneWon += 1
            elif gameResult == -1:
                twoWon += 1
            else:
                draws += 1

        # Second half: swap seats so that the original player2 moves first.
        self.player1, self.player2 = self.player2, self.player1
        try:
            for _ in tqdm(range(num), desc="Arena.playGames (2)"):
                gameResult = self.playGame(verbose=verbose)
                # Seats are swapped, so a result of -1 is a win for the
                # original player1.
                if gameResult == -1:
                    oneWon += 1
                elif gameResult == 1:
                    twoWon += 1
                else:
                    draws += 1
        finally:
            # Put the players back. Without this, a later playGames() call on
            # the same Arena would credit every win to the wrong player.
            self.player1, self.player2 = self.player2, self.player1

        return oneWon, twoWon, draws
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import threading
import queue
import pickle
from pickle import Pickler, Unpickler
from random import shuffle
from parl.utils import tensorboard
import numpy as np
from tqdm import tqdm
import parl
from parl.utils import logger
from actor import Actor
from utils import split_group, get_test_dataset
from alphazero_agent import create_agent
class Coach():
    """
    This class executes the self-play, learning and evaluating.

    Work is farmed out to `args.actors_num` Actor objects. Each one is driven
    by its own local thread, which receives task signals through a private
    queue and pushes results into one shared return queue.
    """

    def __init__(self, game, args):
        self.game = game
        self.args = args

        # neural network of current generation
        self.current_agent = create_agent(self.game)
        # neural network of previous generation
        self.previous_agent = create_agent(self.game)

        # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.trainExamplesHistory = []

        # one signal queue per actor thread, plus a single shared result queue
        self.remote_actors_signal_queues = []
        self.remote_actors_return_queue = queue.Queue()

        self.test_dataset = get_test_dataset()

    def _run_remote_tasks(self, signal_queue):
        """Thread body: create one Actor and serve tasks from `signal_queue` forever.

        Every result is put on `self.remote_actors_return_queue` as a
        one-entry dict keyed by the task name.
        """
        # The remote actor will actually run on the local machine or other machines of xparl cluster
        remote_actor = Actor(self.game, self.args)

        while True:
            # receive running task signal
            # signal: specify task type and task input data (optional)
            signal = signal_queue.get()

            if signal["task"] == "self-play":
                episode_num_each_actor = self.args.numEps // self.args.actors_num
                result = remote_actor.self_play(
                    self.current_agent.get_weights(), episode_num_each_actor)
                self.remote_actors_return_queue.put({"self-play": result})

            elif signal["task"] == "pitting":
                games_num_each_actor = self.args.arenaCompare // self.args.actors_num
                result = remote_actor.pitting(
                    self.previous_agent.get_weights(),
                    self.current_agent.get_weights(), games_num_each_actor)
                self.remote_actors_return_queue.put({"pitting": result})

            elif signal["task"] == "evaluate_test_dataset":
                test_dataset = signal["test_dataset"]
                result = remote_actor.evaluate_test_dataset(
                    self.current_agent.get_weights(), test_dataset)
                self.remote_actors_return_queue.put({
                    "evaluate_test_dataset":
                    result
                })
            else:
                raise NotImplementedError

    def _create_remote_actors(self):
        """Connect to the xparl cluster and start one worker thread per actor."""
        # connect to xparl cluster to submit jobs
        parl.connect(self.args.master_address)

        for _ in range(self.args.actors_num):
            signal_queue = queue.Queue()
            self.remote_actors_signal_queues.append(signal_queue)

            # daemon thread: the endless task loop must not keep the process alive
            remote_thread = threading.Thread(
                target=self._run_remote_tasks,
                args=(signal_queue, ),
                daemon=True)
            remote_thread.start()

    def learn(self):
        """Each iteration:
        1. Performs numEps episodes of self-play.
        2. Retrains neural network with examples in trainExamplesHistory
           (which has a maximum length of numItersForTrainExamplesHistory).
        3. Evaluates the new neural network with the test dataset.
        4. Pits the new neural network against the old one and accepts it
           only if it wins >= updateThreshold fraction of games.
        """

        # create remote actors to run tasks (self-play/pitting/evaluate_test_dataset) in parallel.
        self._create_remote_actors()

        for iteration in range(1, self.args.numIters + 1):
            logger.info('Starting Iter #{} ...'.format(iteration))

            ####################
            logger.info('Step1: self-play in parallel...')
            iterationTrainExamples = []
            # update weights of remote actors to the latest weights, and ask them to run self-play task
            for signal_queue in self.remote_actors_signal_queues:
                signal_queue.put({"task": "self-play"})
            # wait for all remote actors (a total of self.args.actors_num) to return the self-play results
            for _ in range(self.args.actors_num):
                result = self.remote_actors_return_queue.get()
                iterationTrainExamples.extend(result["self-play"])

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)
            if len(self.trainExamplesHistory
                   ) > self.args.numItersForTrainExamplesHistory:
                logger.warning("Removing the oldest entry in trainExamples.")
                self.trainExamplesHistory.pop(0)
            self.saveTrainExamples(iteration)  # backup history to a file

            ####################
            logger.info('Step2: train neural network...')
            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.current_agent.save(
                os.path.join(self.args.checkpoint, 'temp.pth.tar'))
            self.previous_agent.restore(
                os.path.join(self.args.checkpoint, 'temp.pth.tar'))

            self.current_agent.learn(trainExamples)

            ####################
            logger.info('Step3: evaluate test dataset in parallel...')
            cnt = 0
            # update weights of remote actors to the latest weights, and ask them to evaluate assigned test dataset
            # NOTE(review): this assumes split_group yields at most actors_num
            # groups (it is defined in utils and not visible here) - confirm
            # for dataset sizes that are not a multiple of actors_num.
            for i, data in enumerate(
                    split_group(
                        self.test_dataset,
                        len(self.test_dataset) // self.args.actors_num)):
                self.remote_actors_signal_queues[i].put({
                    "task":
                    "evaluate_test_dataset",
                    "test_dataset":
                    data
                })
                cnt += len(data)
            perfect_moves_cnt, good_moves_cnt = 0, 0
            # wait for all remote actors (a total of self.args.actors_num) to return the evaluating results
            for _ in range(self.args.actors_num):
                (perfect_moves,
                 good_moves) = self.remote_actors_return_queue.get(
                 )["evaluate_test_dataset"]
                perfect_moves_cnt += perfect_moves
                good_moves_cnt += good_moves
            logger.info('perfect moves rate: {}, good moves rate: {}'.format(
                perfect_moves_cnt / cnt, good_moves_cnt / cnt))
            tensorboard.add_scalar('perfect_moves_rate',
                                   perfect_moves_cnt / cnt, iteration)
            tensorboard.add_scalar('good_moves_rate', good_moves_cnt / cnt,
                                   iteration)

            ####################
            logger.info(
                'Step4: pitting against previous generation in parallel...')
            # transfer weights of previous generation and current generation to the remote actors, and ask them to pit.
            for signal_queue in self.remote_actors_signal_queues:
                signal_queue.put({"task": "pitting"})
            previous_wins, current_wins, draws = 0, 0, 0
            for _ in range(self.args.actors_num):
                (pwins_, cwins_,
                 draws_) = self.remote_actors_return_queue.get()["pitting"]
                previous_wins += pwins_
                current_wins += cwins_
                draws += draws_

            logger.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
                        (current_wins, previous_wins, draws))
            # Reject when every game was drawn (no decisive result) or when the
            # new network's share of decisive games is below the threshold.
            if previous_wins + current_wins == 0 or float(current_wins) / (
                    previous_wins + current_wins) < self.args.updateThreshold:
                logger.info('REJECTING NEW MODEL')
                self.current_agent.restore(
                    os.path.join(self.args.checkpoint, 'temp.pth.tar'))
            else:
                logger.info('ACCEPTING NEW MODEL')
                self.current_agent.save(
                    os.path.join(self.args.checkpoint, 'best.pth.tar'))
            self.current_agent.save(
                os.path.join(self.args.checkpoint,
                             self.getCheckpointFile(iteration)))

    def getCheckpointFile(self, iteration):
        """Return the checkpoint file name used for `iteration`."""
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        """Pickle the whole example history to `<checkpoint file>.examples`.

        The checkpoint folder (args.checkpoint) is created when missing.
        """
        folder = self.args.checkpoint
        # exist_ok avoids the check-then-create race of a separate exists() test
        os.makedirs(folder, exist_ok=True)
        filename = os.path.join(
            folder,
            self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)

    def loadModel(self):
        """Restore the current agent from args.load_folder_file (folder, file name)."""
        self.current_agent.restore(
            os.path.join(self.args.load_folder_file[0],
                         self.args.load_folder_file[1]))

    def loadTrainExamples(self):
        """Load the example history stored next to the model in args.load_folder_file.

        When the ".examples" file is missing the user is asked whether to
        continue; any answer other than "y" exits the process.
        """
        modelFile = os.path.join(self.args.load_folder_file[0],
                                 self.args.load_folder_file[1])
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            logger.warning(
                "File {} with trainExamples not found!".format(examplesFile))
            r = input("Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            logger.info("File with trainExamples found. Loading it...")
            # SECURITY: unpickling executes arbitrary code embedded in the
            # file - only load example files that you produced yourself.
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            logger.info('Loading done!')
# Third party code
#
# The following code are copied or modified from:
# https://github.com/suragnair/alpha-zero-general
import math
import time
import numpy as np
EPS = 1e-8
class MCTS():
    """
    This class handles the MCTS tree.

    All statistics are kept in dictionaries keyed by the string representation
    of a canonical board (or by a (board string, action) pair).
    """

    def __init__(self, game, nn_agent, args, dirichlet_noise=False):
        self.game = game
        self.nn_agent = nn_agent
        self.args = args
        self.dirichlet_noise = dirichlet_noise
        self.Qsa = {}  # stores Q values for s,a (as defined in the paper)
        self.Nsa = {}  # stores #times edge s,a was visited
        self.Ns = {}  # stores #times board s was visited
        self.Ps = {}  # stores initial policy (returned by neural net)

        self.Es = {}  # stores game.getGameEnded ended for board s
        self.Vs = {}  # stores game.getValidMoves for board s

    def getActionProb(self, canonicalBoard, temp=1):
        """
        This function performs numMCTSSims simulations of MCTS starting from
        canonicalBoard.

        Args:
            canonicalBoard: board in canonical form to search from
            temp: temperature; 0 selects (one of) the most visited action(s)
                  with probability 1, ties being broken at random

        Returns:
            probs: a policy vector where the probability of the ith action is
                   proportional to Nsa[(s,a)]**(1./temp)
        """
        for i in range(self.args.numMCTSSims):
            # Dirichlet noise is only requested on the first simulation
            dir_noise = (i == 0 and self.dirichlet_noise)
            self.search(canonicalBoard, dirichlet_noise=dir_noise)

        s = self.game.stringRepresentation(canonicalBoard)
        counts = [
            self.Nsa.get((s, a), 0)
            for a in range(self.game.getActionSize())
        ]

        if temp == 0:
            # Compare as an ndarray explicitly instead of relying on the
            # reflected "list == numpy scalar" comparison.
            visit_counts = np.asarray(counts)
            bestAs = np.argwhere(
                visit_counts == visit_counts.max()).flatten()
            bestA = np.random.choice(bestAs)
            probs = [0] * len(counts)
            probs[bestA] = 1
            return probs

        counts = [x**(1. / temp) for x in counts]
        counts_sum = float(sum(counts))
        probs = [x / counts_sum for x in counts]
        return probs

    def search(self, canonicalBoard, dirichlet_noise=False):
        """
        This function performs one iteration of MCTS. It is recursively called
        till a leaf node is found. The action chosen at each node is one that
        has the maximum upper confidence bound as in the paper.

        Once a leaf node is found, the neural network is called to return an
        initial policy P and a value v for the state. This value is propagated
        up the search path. In case the leaf node is a terminal state, the
        outcome is propagated up the search path. The values of Ns, Nsa, Qsa are
        updated.

        NOTE: the return values are the negative of the value of the current
        state. This is done since v is in [-1,1] and if v is the value of a
        state for the current player, then its value is -v for the other player.

        Returns:
            v: the negative of the value of the current canonicalBoard
        """

        s = self.game.stringRepresentation(canonicalBoard)

        if s not in self.Es:
            self.Es[s] = self.game.getGameEnded(canonicalBoard, 1)
        if self.Es[s] != 0:
            # terminal node
            return -self.Es[s]

        if s not in self.Ps:
            # leaf node
            self.Ps[s], v = self.nn_agent.predict(canonicalBoard)

            valids = self.game.getValidMoves(canonicalBoard, 1)
            self.Ps[s] = self.Ps[s] * valids  # masking invalid moves
            if dirichlet_noise:
                self.applyDirNoise(s, valids)
            sum_Ps_s = np.sum(self.Ps[s])
            if sum_Ps_s > 0:
                self.Ps[s] /= sum_Ps_s  # renormalize
            else:
                # if all valid moves were masked make all valid moves equally probable

                # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else.
                # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
                print("All valid moves were masked, doing a workaround.")
                self.Ps[s] = self.Ps[s] + valids
                self.Ps[s] /= np.sum(self.Ps[s])

            self.Vs[s] = valids
            self.Ns[s] = 0
            return -v

        valids = self.Vs[s]
        if dirichlet_noise:
            self.applyDirNoise(s, valids)
            sum_Ps_s = np.sum(self.Ps[s])
            self.Ps[s] /= sum_Ps_s  # renormalize
        cur_best = -float('inf')
        best_act = -1

        # pick the action with the highest upper confidence bound
        for a in range(self.game.getActionSize()):
            if valids[a]:
                if (s, a) in self.Qsa:
                    u = self.Qsa[
                        (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt(
                            self.Ns[s]) / (1 + self.Nsa[(s, a)])
                else:
                    u = self.args.cpuct * self.Ps[s][a] * math.sqrt(
                        self.Ns[s] + EPS)  # Q = 0 ?

                if u > cur_best:
                    cur_best = u
                    best_act = a

        a = best_act
        next_s, next_player = self.game.getNextState(canonicalBoard, 1, a)
        next_s = self.game.getCanonicalForm(next_s, next_player)

        v = self.search(next_s)

        if (s, a) in self.Qsa:
            # running mean of the values backed up through this edge
            self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[
                (s, a)] + v) / (self.Nsa[(s, a)] + 1)
            self.Nsa[(s, a)] += 1

        else:
            self.Qsa[(s, a)] = v
            self.Nsa[(s, a)] = 1

        self.Ns[s] += 1
        return -v

    def applyDirNoise(self, s, valids):
        """Mix Dirichlet noise into the stored policy of board `s`, in place.

        One noise value is drawn per non-zero entry of `valids`; every entry
        of Ps[s] that is currently non-zero becomes 0.75 * prior + 0.25 * noise.
        The result is not renormalized here.
        """
        dir_values = np.random.dirichlet(
            [self.args.dirichletAlpha] * np.count_nonzero(valids))
        dir_idx = 0
        for idx in range(len(self.Ps[s])):
            # NOTE(review): entries are selected by a non-zero prior rather
            # than by `valids`, so a valid move whose prior is exactly 0 gets
            # no noise - confirm that this is intended.
            if self.Ps[s][idx]:
                self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + (
                    0.25 * dir_values[dir_idx])
                dir_idx += 1
## AlphaZero baseline for Connect4 game (distributed version)
- In this example, we provide a fine-tuned AlphaZero baseline to solve the Connect4 game, based on the code of [alpha-zero-general](https://github.com/suragnair/alpha-zero-general) repo.
- We take advantage of the parallelism capacity of [PARL](https://github.com/PaddlePaddle/PARL) to support running self-play and evaluating tasks in parallel.
- We also provide scripts to pack your well-trained model to a submission file, which can be submitted to the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition directly.
### Dependencies
- python3
- [parl==1.3](https://github.com/PaddlePaddle/PARL)
- torch
- tqdm
### Training
1. Download the [1k connect4 validation set](https://www.kaggle.com/petercnudde/1k-connect4-validation-set) to the current directory. (filename: `refmoves1k_kaggle`)
2. Start xparl cluster
```bash
# You can change the following `cpu_num` and `args.actors_num` (in main.py)
# based on the number of CPUs on your machine.
xparl start --port 8010 --cpu_num 25
```
```bash
# [OPTIONAL] You can also run the following command on other machines to add more CPU resources
# to the xparl cluster, so that you can increase the parallelism (args.actors_num).
xparl connect --address MASTER_IP:8010 --cpu_num [CPU_NUM]
```
3. Run training script
```bash
python main.py
```
4. Visualize (good moves rate and perfect moves rate)
```
tensorboard --logdir .
```
### Submitting
To submit the well-trained model to the Kaggle, you can use our provided script to generate `submission.py`, for example:
```bash
python gen_submission.py saved_model/best.pth.tar
```
### Performance
- The following are the `good moves rate` and `perfect moves rate` indicators shown in tensorboard. Please refer to this [link](https://www.kaggle.com/petercnudde/scoring-connect-x-agents) for their specific meaning.
<img src=".pic/good_moves.png" width = "300" alt="good moves rate"/> <img src=".pic/perfect_moves.png" width = "300" alt="perfect moves rate"/>
> It takes about 1 day to run 25 iterations on the machine with 25 cpus.
- It can reach about score 1368 (rank 5 on 2020/06/04) in the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition.
### Reference
- [suragnair/alpha-zero-general](https://github.com/suragnair/alpha-zero-general)
- [Scoring connect-x agents](https://www.kaggle.com/petercnudde/scoring-connect-x-agents)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl
import os
from alphazero_agent import create_agent
from MCTS import MCTS
from Arena import Arena
from utils import win_loss_draw
@parl.remote_class
class Actor(object):
    """Remote worker that runs self-play, pitting and test-set evaluation.

    It owns two CPU agents (previous and current generation); their weights
    are overwritten with the ones passed to each task before the task runs.
    """

    def __init__(self, game, args):
        os.environ['OMP_NUM_THREADS'] = "1"
        self.game = game
        self.args = args

        # neural network of previous generation
        self.previous_agent = create_agent(self.game, cuda=False)
        # neural network of current generation
        self.current_agent = create_agent(self.game, cuda=False)

        # MCTS of previous generation
        self.previous_mcts = MCTS(
            self.game, self.previous_agent, self.args, dirichlet_noise=True)
        # MCTS of current generation
        self.current_mcts = MCTS(
            self.game, self.current_agent, self.args, dirichlet_noise=True)

    def self_play(self, current_weights, game_num):
        """Collecting training data by self-play.

        Args:
            current_weights (numpy.array): latest weights of neural network
            game_num (int): game number of self-play

        Returns:
            train_examples (list): examples of the form (canonicalBoard, pi, v)
        """

        # update weights of current neural network with latest weights
        self.current_agent.set_weights(current_weights)

        train_examples = []
        for _ in range(game_num):
            # reset node state of MCTS
            self.current_mcts = MCTS(
                self.game, self.current_agent, self.args, dirichlet_noise=True)
            train_examples.extend(self._executeEpisode())
        return train_examples

    def pitting(self, previous_weights, current_weights, games_num):
        """Fighting between previous generation agent and current generation agent

        Args:
            previous_weights (numpy.array): weights of previous generation neural network
            current_weights (numpy.array): weights of current generation neural network
            games_num (int): game number of fighting

        Returns:
            tuple of (game number of previous agent won, game number of current agent won, game number of draw)
        """
        # update weights of previous and current neural network
        self.previous_agent.set_weights(previous_weights)
        self.current_agent.set_weights(current_weights)

        # reset node state of MCTS
        self.previous_mcts = MCTS(self.game, self.previous_agent, self.args)
        self.current_mcts = MCTS(self.game, self.current_agent, self.args)

        # The lambdas look up self.*_mcts on every call, so they always use
        # the trees created just above. The previous generation is player 1.
        arena = Arena(
            lambda x: np.argmax(self.previous_mcts.getActionProb(x, temp=0)),
            lambda x: np.argmax(self.current_mcts.getActionProb(x, temp=0)),
            self.game)
        previous_wins, current_wins, draws = arena.playGames(games_num)

        return (previous_wins, current_wins, draws)

    def evaluate_test_dataset(self, current_weights, test_dataset):
        """Evaluate performance of latest neural network

        Args:
            current_weights (numpy.array): latest weights of neural network
            test_dataset (list): positions to evaluate; each item is indexed
                                 with 'board', 'player' and 'move_score'

        Returns:
            tuple of (number of perfect moves, number of good moves)
        """
        # update weights of current neural network with latest weights
        self.current_agent.set_weights(current_weights)

        perfect_move_count, good_move_count = 0, 0
        for data in test_dataset:
            # fresh search tree for every position
            self.current_mcts = MCTS(self.game, self.current_agent, self.args)

            x = self.game.getCanonicalForm(data['board'], data['player'])
            agent_move = int(
                np.argmax(self.current_mcts.getActionProb(x, temp=0)))

            moves = data["move_score"]
            perfect_score = max(moves)
            # every move that reaches the best score counts as perfect; the
            # number of moves comes from the data instead of a hard-coded 7
            perfect_moves = [
                i for i, score in enumerate(moves) if score == perfect_score
            ]

            if agent_move in perfect_moves:
                perfect_move_count += 1
            # a move is "good" when it leads to the same win/loss/draw class
            # as the best available move
            if win_loss_draw(
                    moves[agent_move]) == win_loss_draw(perfect_score):
                good_move_count += 1

        return (perfect_move_count, good_move_count)

    def _executeEpisode(self):
        """
        This function executes one episode of self-play, starting with player 1.
        As the game goes on, each turn is added as a training example to
        trainExamples. The game is played till the game ends. After the game
        ends, the outcome of the game is used to assign values to each example
        in trainExamples.

        It uses a temp=1 if episodeStep < tempThresholdStep, and thereafter
        uses temp=0.

        Returns:
            trainExamples: a list of examples of the form (canonicalBoard, pi, v)
                           pi is the MCTS informed policy vector, v is the
                           final game result seen from the player who was to
                           move on that board.
        """
        trainExamples = []
        board = self.game.getInitBoard()
        self.curPlayer = 1
        episodeStep = 0

        while True:
            episodeStep += 1
            canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
            temp = int(episodeStep < self.args.tempThresholdStep)

            pi = self.current_mcts.getActionProb(canonicalBoard, temp=temp)
            sym = self.game.getSymmetries(canonicalBoard, pi)
            for b, p in sym:  # board, pi
                trainExamples.append([b, self.curPlayer, p, None])

            action = np.random.choice(len(pi), p=pi)
            board, self.curPlayer = self.game.getNextState(
                board, self.curPlayer, action)

            r = self.game.getGameEnded(board, self.curPlayer)

            if r != 0:
                # r is from the point of view of self.curPlayer; flip the sign
                # for examples recorded while the other player was to move.
                return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer)))
                        for x in trainExamples]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import parl
import torch
import torch.optim as optim
from tqdm import tqdm
from utils import *
from connect4_model import Connect4Model
# Module-level hyper-parameters. The same object is handed to Connect4Model
# by create_agent and read by AlphaZeroAgent.learn.
args = dotdict({
    # learning rate of the Adam optimizer built in AlphaZeroAgent.learn
    'lr': 0.001,
    # not read in this module; presumably consumed by Connect4Model - TODO confirm
    'dropout': 0.3,
    # number of passes over the examples per AlphaZeroAgent.learn call
    'epochs': 5,
    # number of examples sampled for every optimisation step
    'batch_size': 64,
    # not read in this module; presumably consumed by Connect4Model - TODO confirm
    'num_channels': 64,
})
class AlphaZero(parl.Algorithm):
    """Loss computation and inference around a policy/value model.

    The model is called with a batch of boards and is expected to return a
    pair (log-probabilities of the actions, value estimate).
    """

    def __init__(self, model):
        self.model = model

    def learn(self, boards, target_pis, target_vs, optimizer):
        """Run one optimisation step.

        Args:
            boards: input batch for the model
            target_pis: target policy for every board
            target_vs: target value for every board
            optimizer: optimizer used to apply the gradient step

        Returns:
            tuple of (total loss, policy loss, value loss)
        """
        self.model.train()  # train mode

        # forward pass
        out_log_pi, out_v = self.model(boards)

        # policy loss: cross entropy against the target policy, batch-averaged
        policy_term = torch.sum(target_pis * out_log_pi)
        pi_loss = -policy_term / target_pis.size()[0]

        # value loss: squared error, batch-averaged
        squared_error = (target_vs - out_v.view(-1))**2
        v_loss = torch.sum(squared_error) / target_vs.size()[0]

        total_loss = pi_loss + v_loss

        # backward pass and parameter update
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        return total_loss, pi_loss, v_loss

    def predict(self, board):
        """Return (action probabilities, value) for `board` without tracking gradients."""
        self.model.eval()  # eval mode

        with torch.no_grad():
            log_pi, v = self.model(board)

        # the model outputs log-probabilities; convert them to probabilities
        pi = torch.exp(log_pi)
        return pi, v
def create_agent(game, cuda=True):
    """Build an AlphaZeroAgent for `game`.

    Args:
        game: game object passed to the model and the agent.
        cuda: request GPU execution; only honoured when CUDA is available.

    Returns:
        The assembled AlphaZeroAgent.
    """
    use_cuda = cuda and torch.cuda.is_available()

    network = Connect4Model(game, args)
    if use_cuda:
        network.cuda()

    return AlphaZeroAgent(AlphaZero(network), game, use_cuda)
class AlphaZeroAgent(parl.Agent):
    """Agent that converts numpy data to tensors and drives the algorithm."""

    def __init__(self, algorithm, game, cuda):
        super(AlphaZeroAgent, self).__init__(algorithm)
        self.cuda = cuda
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()

    def learn(self, examples):
        """Train the network on self-play examples.

        Args:
            examples: list of examples, each example is of form (board, pi, v)
        """
        # A fresh Adam optimizer is created on every call.
        optimizer = optim.Adam(self.algorithm.model.parameters(), lr=args.lr)

        for epoch in range(args.epochs):
            print('EPOCH ::: ' + str(epoch + 1))

            num_batches = int(len(examples) / args.batch_size)
            progress = tqdm(range(num_batches), desc='Training Net')
            for _ in progress:
                # Mini-batches are sampled with replacement.
                picked = np.random.randint(
                    len(examples), size=args.batch_size)
                batch = [examples[i] for i in picked]
                boards, pis, vs = list(zip(*batch))

                boards = torch.FloatTensor(
                    np.array(boards).astype(np.float64))
                target_pis = torch.FloatTensor(np.array(pis))
                target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

                if self.cuda:
                    boards = boards.contiguous().cuda()
                    target_pis = target_pis.contiguous().cuda()
                    target_vs = target_vs.contiguous().cuda()

                _, pi_loss, v_loss = self.algorithm.learn(
                    boards, target_pis, target_vs, optimizer)

                # record loss with tqdm
                progress.set_postfix(
                    Loss_pi=pi_loss.item(), Loss_v=v_loss.item())

    def predict(self, board):
        """Evaluate a single board.

        Args:
            board (np.array): input board

        Return:
            pi (np.array): probability of actions
            v (np.array): estimated value of input
        """
        tensor = torch.FloatTensor(board.astype(np.float64))
        if self.cuda:
            tensor = tensor.contiguous().cuda()
        # Add the batch dimension expected by the model.
        tensor = tensor.view(1, self.board_x, self.board_y)

        pi, v = self.algorithm.predict(tensor)
        pi_np = pi.data.cpu().numpy()[0]
        v_np = v.data.cpu().numpy()[0]
        return pi_np, v_np
def create_agent(game, cuda=True):
    """Build an AlphaZeroAgent for `game`.

    Args:
        game: game object passed to the model and the agent.
        cuda: request GPU execution; only honoured when CUDA is available.

    Returns:
        The assembled AlphaZeroAgent.
    """
    use_cuda = cuda and torch.cuda.is_available()

    network = Connect4Model(game, args)
    if use_cuda:
        network.cuda()

    return AlphaZeroAgent(AlphaZero(network), game, use_cuda)
# Third party code
#
# The following code is copied or modified from:
# https://github.com/suragnair/alpha-zero-general
import numpy as np
from collections import namedtuple
DEFAULT_HEIGHT = 6
DEFAULT_WIDTH = 7
DEFAULT_WIN_LENGTH = 4
WinState = namedtuple('WinState', 'is_ended winner')
class Board():
    """
    Connect4 Board.

    The pieces are stored in a 2-D numpy array of shape (height, width).
    A cell is 0 when empty, otherwise it holds the player value (1 or -1).
    Row 0 is the top row: a column is playable while its top cell is 0.
    """

    def __init__(self,
                 height=None,
                 width=None,
                 win_length=None,
                 np_pieces=None):
        """Set up initial board configuration.

        Args:
            height: number of rows; falls back to DEFAULT_HEIGHT when falsy.
            width: number of columns; falls back to DEFAULT_WIDTH when falsy.
            win_length: run length needed to win; falls back to
                DEFAULT_WIN_LENGTH when falsy.
            np_pieces: existing array to use as the board (not copied);
                an empty board is created when None.
        """
        self.height = height or DEFAULT_HEIGHT
        self.width = width or DEFAULT_WIDTH
        self.win_length = win_length or DEFAULT_WIN_LENGTH

        if np_pieces is None:
            # `np.int` was only an alias of the builtin `int` and has been
            # removed from recent NumPy releases; the builtin is equivalent.
            self.np_pieces = np.zeros([self.height, self.width], dtype=int)
        else:
            self.np_pieces = np_pieces
            assert self.np_pieces.shape == (self.height, self.width)

    def add_stone(self, column, player):
        """Drop a stone of `player` into `column`, modifying the board in place.

        Raises:
            ValueError: if the column has no empty cell left.
        """
        available_idx, = np.where(self.np_pieces[:, column] == 0)
        if len(available_idx) == 0:
            raise ValueError(
                "Can't play column %s on board %s" % (column, self))

        # The largest empty row index is the lowest free cell of the column.
        self.np_pieces[available_idx[-1]][column] = player

    def get_valid_moves(self):
        """Return a boolean vector: True for every column whose top cell is empty."""
        return self.np_pieces[0] == 0

    def get_win_state(self):
        """Return a WinState(is_ended, winner).

        `winner` is 1 or -1 when that player has a winning line, and None
        for a draw (no playable column) or an unfinished game.
        """
        for player in [-1, 1]:
            player_pieces = self.np_pieces == -player
            # Check rows & columns for win
            if (self._is_straight_winner(player_pieces)
                    or self._is_straight_winner(player_pieces.transpose())
                    or self._is_diagonal_winner(player_pieces)):
                return WinState(True, -player)

        # draw has very little value.
        if not self.get_valid_moves().any():
            return WinState(True, None)

        # Game is not ended yet.
        return WinState(False, None)

    def with_np_pieces(self, np_pieces):
        """Create a Board with the same dimensions using `np_pieces`.

        The array is used as-is (no copy); when None, this board's own
        array is shared with the new Board.
        """
        if np_pieces is None:
            np_pieces = self.np_pieces
        return Board(self.height, self.width, self.win_length, np_pieces)

    def _is_diagonal_winner(self, player_pieces):
        """Checks if player_pieces contains a diagonal win."""
        win_length = self.win_length
        for i in range(len(player_pieces) - win_length + 1):
            # Diagonals running down-right from (i, j).
            for j in range(len(player_pieces[0]) - win_length + 1):
                if all(player_pieces[i + x][j + x] for x in range(win_length)):
                    return True
            # Diagonals running down-left from (i, j).
            for j in range(win_length - 1, len(player_pieces[0])):
                if all(player_pieces[i + x][j - x] for x in range(win_length)):
                    return True
        return False

    def _is_straight_winner(self, player_pieces):
        """Checks if any row of player_pieces holds `win_length` pieces in a row.

        Callers pass the transposed array to check columns. The number of
        sliding windows is derived from the row length (number of columns of
        the array passed in), so every window is examined for any board shape.
        """
        num_columns = player_pieces.shape[1]
        for start in range(num_columns - self.win_length + 1):
            window = player_pieces[:, start:start + self.win_length]
            # A window is a win when every one of its cells is occupied.
            if (window.sum(axis=1) >= self.win_length).any():
                return True
        return False

    def __str__(self):
        return str(self.np_pieces)
class Connect4Game(object):
    """
    Connect4 Game class implementing the alpha-zero-general Game interface.

    Use 1 for player1 and -1 for player2.
    """

    def __init__(self,
                 height=None,
                 width=None,
                 win_length=None,
                 np_pieces=None):
        # Template board: supplies the dimensions for every board built later.
        self._base_board = Board(height, width, win_length, np_pieces)

    def getInitBoard(self):
        """
        Returns:
            startBoard: a representation of the board (ideally this is the form
                        that will be the input to your neural network)
        """
        return self._base_board.np_pieces

    def getBoardSize(self):
        """
        Returns:
            (x,y): a tuple of board dimensions
        """
        return (self._base_board.height, self._base_board.width)

    def getActionSize(self):
        """
        Returns:
            actionSize: number of all possible actions (one per column)
        """
        return self._base_board.width

    def getNextState(self, board, player, action):
        """Returns a copy of the board with updated move, original board is unmodified.

        Input:
            board: current board
            player: current player (1 or -1)
            action: action taken by current player

        Returns:
            nextBoard: board after applying action
            nextPlayer: player who plays in the next turn (should be -player)
        """
        # Work on a copy so the caller's array is left untouched.
        b = self._base_board.with_np_pieces(np_pieces=np.copy(board))
        b.add_stone(action, player)
        return b.np_pieces, -player

    def getValidMoves(self, board, player):
        """Any zero value in top row is a valid move.

        Input:
            board: current board
            player: current player (unused)

        Returns:
            validMoves: a binary vector of length self.getActionSize(), 1 for
                        moves that are valid from the current board and player,
                        0 for invalid moves
        """
        return self._base_board.with_np_pieces(
            np_pieces=board).get_valid_moves()

    def getGameEnded(self, board, player):
        """
        Input:
            board: current board
            player: current player (1 or -1)

        Returns:
            r: 0 if game has not ended. 1 if player won, -1 if player lost,
               small non-zero value for draw.

        Raises:
            ValueError: if the reported winner is neither player, -player
                nor None.
        """
        b = self._base_board.with_np_pieces(np_pieces=board)
        winstate = b.get_win_state()
        if not winstate.is_ended:
            # 0 used to represent unfinished game.
            return 0

        if winstate.winner is None:
            # draw has very little value.
            return 1e-4
        if winstate.winner == player:
            return +1
        if winstate.winner == -player:
            return -1
        # Format the message explicitly: passing the state as a second
        # positional argument would render the message as a tuple.
        raise ValueError('Unexpected winstate found: {}'.format(winstate))

    def getCanonicalForm(self, board, player):
        """
        Input:
            board: current board
            player: current player (1 or -1)

        Returns:
            canonicalBoard: returns canonical form of board. The canonical form
                            should be independent of player. Here the board is
                            multiplied by `player`, so the player to move is
                            always represented by 1.
        """
        return board * player

    def getSymmetries(self, board, pi):
        """Board is left/right board symmetric

        Input:
            board: current board
            pi: policy vector of size self.getActionSize()

        Returns:
            symmForms: a list of [(board,pi)] where each tuple is a symmetrical
                       form of the board and the corresponding pi vector. This
                       is used when training the neural network from examples.
        """
        return [(board, pi),
                (np.array(board[:, ::-1], copy=True),
                 np.array(pi[::-1], copy=True))]

    def stringRepresentation(self, board):
        """
        Input:
            board: current board

        Returns:
            boardString: the raw bytes of the board array, used as a hashable
                         key. Required by MCTS for hashing.
        """
        # `tobytes` is the supported spelling of the deprecated `tostring`
        # and returns the same bytes.
        return board.tobytes()

    @staticmethod
    def display(board):
        """Print the column indices followed by the board."""
        print(" -----------------------")
        print(' '.join(map(str, range(len(board[0])))))
        print(board)
        print(" -----------------------")
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Connect4Model(parl.Model):
    """Convolutional policy/value network for Connect4 boards."""

    def __init__(self, game, args):
        """
        Args:
            game: provides getBoardSize() and getActionSize().
            args: provides `num_channels` and `dropout`.
        """
        # game params
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()
        self.args = args

        super(Connect4Model, self).__init__()

        channels = args.num_channels
        # conv1/conv2 are padded (size preserved); conv3/conv4 are unpadded,
        # so each of them shrinks both spatial dimensions by 2.
        self.conv1 = nn.Conv2d(1, channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(channels, channels, 3, stride=1)
        self.conv4 = nn.Conv2d(channels, channels, 3, stride=1)

        self.bn1 = nn.BatchNorm2d(channels)
        self.bn2 = nn.BatchNorm2d(channels)
        self.bn3 = nn.BatchNorm2d(channels)
        self.bn4 = nn.BatchNorm2d(channels)

        flat_features = channels * (self.board_x - 4) * (self.board_y - 4)
        self.fc1 = nn.Linear(flat_features, 128)
        self.fc_bn1 = nn.BatchNorm1d(128)

        self.fc2 = nn.Linear(128, 64)
        self.fc_bn2 = nn.BatchNorm1d(64)

        # Policy head and value head.
        self.fc3 = nn.Linear(64, self.action_size)
        self.fc4 = nn.Linear(64, 1)

    def forward(self, s):
        """
        Args:
            s(torch.Tensor): batch_size x board_x x board_y

        Returns:
            Tuple (log_pi, v): log-softmax policy of shape
            batch_size x action_size and tanh value of shape batch_size x 1.
        """
        # batch_size x 1 x board_x x board_y
        s = s.view(-1, 1, self.board_x, self.board_y)

        # After the four stages:
        # batch_size x num_channels x (board_x-4) x (board_y-4)
        conv_stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, bn in conv_stages:
            s = F.relu(bn(conv(s)))

        flat_features = self.args.num_channels * (self.board_x - 4) * (
            self.board_y - 4)
        s = s.view(-1, flat_features)

        # batch_size x 128, then batch_size x 64
        for fc, bn in ((self.fc1, self.fc_bn1), (self.fc2, self.fc_bn2)):
            s = F.dropout(
                F.relu(bn(fc(s))),
                p=self.args.dropout,
                training=self.training)

        log_pi = F.log_softmax(self.fc3(s), dim=1)  # batch_size x action_size
        value = torch.tanh(self.fc4(s))  # batch_size x 1
        return log_pi, value
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import base64
import inspect
import os
# Usage: pass the path of the saved model as the single argument.
assert len(sys.argv) == 2, "please specify model path."
model_path = sys.argv[1]

# Read the raw weights and base64-encode them.
with open(model_path, 'rb') as f:
    raw_bytes = f.read()
encoded_weights = base64.encodebytes(raw_bytes)

# Header of the generated file: decodes the embedded weights into `decoded`.
submission_file = """
import base64
decoded = base64.b64decode({})
""".format(encoded_weights)

# Append the template code after the header.
with open('submission_template.py', 'r') as f:
    submission_file += f.read()

# generate final submission file
with open('submission.py', 'w') as f:
    f.write(submission_file)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from Coach import Coach
from connect4_game import Connect4Game
from utils import *
from parl.utils import logger
# Settings passed to Coach(game, args) in main().
args = dotdict({
    # master address of the xparl cluster
    'master_address': 'localhost:8010',
    # number of remote actors that execute tasks
    # (self-play / pitting / evaluate_test_dataset) in parallel
    'actors_num': 25,
    # total number of iterations
    'numIters': 200,
    # number of complete self-play games to simulate during each iteration
    'numEps': 500,
    # number of games to play during arena (pitting) play to determine
    # whether the new neural network will be accepted
    'arenaCompare': 50,
    # number of game moves for MCTS to simulate
    'numMCTSSims': 800,
    # temp=1 (temperature, tau) while episodeStep < tempThresholdStep,
    # temp=0 thereafter
    'tempThresholdStep': 15,
    # during arena playoff, the new neural net is accepted if at least this
    # fraction of games is won
    'updateThreshold': 0.6,
    # CPUCT parameter
    'cpuct': 4,
    # alpha parameter of the dirichlet noise which is added to the policy (pi)
    'dirichletAlpha': 1.0,
    # training data is kept for the latest numItersForTrainExamplesHistory
    # iterations
    'numItersForTrainExamplesHistory': 20,
    # folder to save model and training examples
    'checkpoint': './saved_model/',
    # whether to load a saved model and training examples
    'load_model': False,
    # (folder, file) of the checkpoint used when load_model is True
    'load_folder_file': ('./saved_model', 'checkpoint_1.pth.tar'),
})
# arenaCompare games are played: player1 starts arenaCompare/2 of them and
# player2 starts the other arenaCompare/2.
assert args.arenaCompare % 2 == 0
# make sure the tasks can be split evenly among the remote actors
assert args.numEps % args.actors_num == 0
assert (args.arenaCompare // 2) % args.actors_num == 0
assert 1000 % args.actors_num == 0  # there are 1000 board states in test_dataset
def main():
    """Create the Connect4 coach, optionally restore saved state, then train."""
    coach = Coach(Connect4Game(), args)

    if args.load_model:
        # Restore the network first, then the stored training examples.
        checkpoint = args.load_folder_file
        logger.info('Loading checkpoint {}...'.format(checkpoint))
        coach.loadModel()
        logger.info(
            "Loading 'trainExamples' from file {}...".format(checkpoint))
        coach.loadTrainExamples()

    coach.learn()


if __name__ == "__main__":
    main()
# Third party code
#
# The following code is copied or modified from:
# https://github.com/suragnair/alpha-zero-general
import os
os.environ['OMP_NUM_THREADS'] = "1"
# ===== utils.py =====
class dotdict(dict):
    """Dictionary whose keys can also be read as attributes (d.key)."""

    def __getattr__(self, name):
        """Return self[name].

        Raises:
            AttributeError: if `name` is not a key. Raising AttributeError
                (rather than leaking KeyError) keeps hasattr() and
                getattr(obj, name, default) working on instances.
        """
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)
# ===== MCTS.py ======
import math
import time
import numpy as np
EPS = 1e-8
class MCTS():
    """
    This class handles the MCTS tree.

    Statistics are kept in dictionaries keyed by the string representation
    `s` of a canonical board, or by `(s, a)` for an edge.
    """

    def __init__(self, game, nn_agent, args, dirichlet_noise=False):
        """
        Args:
            game: game object (getGameEnded, getValidMoves, getNextState, ...).
            nn_agent: object whose predict(board) returns (policy, value).
            args: settings; `cpuct` is read by search and `dirichletAlpha`
                by applyDirNoise.
            dirichlet_noise: whether getActionProb searches with noise.
        """
        self.game = game
        self.nn_agent = nn_agent
        self.args = args
        self.dirichlet_noise = dirichlet_noise
        self.Qsa = {}  # stores Q values for s,a (as defined in the paper)
        self.Nsa = {}  # stores #times edge s,a was visited
        self.Ns = {}  # stores #times board s was visited
        self.Ps = {}  # stores initial policy (returned by neural net)

        self.Es = {}  # stores game.getGameEnded ended for board s
        self.Vs = {}  # stores game.getValidMoves for board s

    def getActionProb(self, canonicalBoard, temp=1, timelimit=4.9):
        """
        Runs MCTS simulations from canonicalBoard until `timelimit` seconds
        have elapsed (the number of simulations is not fixed).

        Args:
            canonicalBoard: root board in canonical form.
            temp: temperature; 0 puts all probability on a most-visited action.
            timelimit: search budget in seconds.

        Returns:
            probs: a policy vector where the probability of the ith action is
                   proportional to Nsa[(s,a)]**(1./temp). When the root has no
                   recorded visits (e.g. the budget allowed at most the
                   expansion of the root), the policy is spread over the
                   valid moves instead.
        """
        dir_noise = self.dirichlet_noise
        start_time = time.time()
        while time.time() - start_time < timelimit:
            self.search(canonicalBoard, dirichlet_noise=dir_noise)

        s = self.game.stringRepresentation(canonicalBoard)
        counts = [
            self.Nsa.get((s, a), 0) for a in range(self.game.getActionSize())
        ]

        if sum(counts) == 0:
            # No edge of the root was ever visited: normalising would divide
            # by zero, and argmax over zeros could select an illegal column.
            return self._fallbackProbs(canonicalBoard, len(counts), temp)

        if temp == 0:
            counts_arr = np.asarray(counts)
            bestAs = np.flatnonzero(counts_arr == counts_arr.max())
            bestA = np.random.choice(bestAs)  # break ties at random
            probs = [0] * len(counts)
            probs[bestA] = 1
            return probs

        counts = [x**(1. / temp) for x in counts]
        counts_sum = float(sum(counts))
        probs = [x / counts_sum for x in counts]
        return probs

    def _fallbackProbs(self, canonicalBoard, action_size, temp):
        """Build a policy from the valid moves only (used when no visits exist)."""
        valids = np.asarray(
            self.game.getValidMoves(canonicalBoard, 1), dtype=bool)
        if valids.shape != (action_size, ) or not valids.any():
            # No usable mask: treat every action as a candidate.
            valids = np.ones(action_size, dtype=bool)

        if temp == 0:
            bestA = np.random.choice(np.flatnonzero(valids))
            probs = [0] * action_size
            probs[bestA] = 1
            return probs

        num_valid = int(valids.sum())
        return [1.0 / num_valid if ok else 0.0 for ok in valids]

    def search(self, canonicalBoard, dirichlet_noise=False):
        """
        This function performs one iteration of MCTS. It is recursively called
        till a leaf node is found. The action chosen at each node is one that
        has the maximum upper confidence bound as in the paper.

        Once a leaf node is found, the neural network is called to return an
        initial policy P and a value v for the state. This value is propagated
        up the search path. In case the leaf node is a terminal state, the
        outcome is propagated up the search path. The values of Ns, Nsa, Qsa are
        updated.

        NOTE: the return values are the negative of the value of the current
        state. This is done since v is in [-1,1] and if v is the value of a
        state for the current player, then its value is -v for the other player.

        Args:
            canonicalBoard: board in canonical form.
            dirichlet_noise: add noise to the prior of this node; recursive
                calls are made without noise.

        Returns:
            v: the negative of the value of the current canonicalBoard
        """
        s = self.game.stringRepresentation(canonicalBoard)

        if s not in self.Es:
            self.Es[s] = self.game.getGameEnded(canonicalBoard, 1)
        if self.Es[s] != 0:
            # terminal node
            return -self.Es[s]

        if s not in self.Ps:
            # leaf node
            self.Ps[s], v = self.nn_agent.predict(canonicalBoard)

            valids = self.game.getValidMoves(canonicalBoard, 1)
            self.Ps[s] = self.Ps[s] * valids  # masking invalid moves
            if dirichlet_noise:
                self.applyDirNoise(s, valids)
            sum_Ps_s = np.sum(self.Ps[s])
            if sum_Ps_s > 0:
                self.Ps[s] /= sum_Ps_s  # renormalize
            else:
                # if all valid moves were masked make all valid moves equally probable

                # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else.
                # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
                print("All valid moves were masked, doing a workaround.")
                self.Ps[s] = self.Ps[s] + valids
                self.Ps[s] /= np.sum(self.Ps[s])

            self.Vs[s] = valids
            self.Ns[s] = 0
            return -v

        valids = self.Vs[s]
        if dirichlet_noise:
            self.applyDirNoise(s, valids)
            sum_Ps_s = np.sum(self.Ps[s])
            self.Ps[s] /= sum_Ps_s  # renormalize
        cur_best = -float('inf')
        best_act = -1

        # pick the action with the highest upper confidence bound
        for a in range(self.game.getActionSize()):
            if valids[a]:
                if (s, a) in self.Qsa:
                    u = self.Qsa[
                        (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt(
                            self.Ns[s]) / (1 + self.Nsa[(s, a)])
                else:
                    u = self.args.cpuct * self.Ps[s][a] * math.sqrt(
                        self.Ns[s] + EPS)  # Q = 0 ?

                if u > cur_best:
                    cur_best = u
                    best_act = a

        a = best_act
        next_s, next_player = self.game.getNextState(canonicalBoard, 1, a)
        next_s = self.game.getCanonicalForm(next_s, next_player)

        v = self.search(next_s)

        if (s, a) in self.Qsa:
            # Incremental update of the running mean of the edge value.
            self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[
                (s, a)] + v) / (self.Nsa[(s, a)] + 1)
            self.Nsa[(s, a)] += 1

        else:
            self.Qsa[(s, a)] = v
            self.Nsa[(s, a)] = 1

        self.Ns[s] += 1
        return -v

    def applyDirNoise(self, s, valids):
        """Mix Dirichlet noise into the non-zero entries of the prior Ps[s].

        Each non-zero prior p becomes 0.75 * p + 0.25 * noise. The result is
        not renormalised here.
        """
        dir_values = np.random.dirichlet(
            [self.args.dirichletAlpha] * np.count_nonzero(valids))
        dir_idx = 0
        for idx in range(len(self.Ps[s])):
            if self.Ps[s][idx]:
                self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + (
                    0.25 * dir_values[dir_idx])
                dir_idx += 1
# ===== connect4_game.py ======
import numpy as np
from collections import namedtuple
DEFAULT_HEIGHT = 6
DEFAULT_WIDTH = 7
DEFAULT_WIN_LENGTH = 4
WinState = namedtuple('WinState', 'is_ended winner')
class Board():
    """
    Connect4 Board.

    The pieces are stored in a 2-D numpy array of shape (height, width).
    A cell is 0 when empty, otherwise it holds the player value (1 or -1).
    Row 0 is the top row: a column is playable while its top cell is 0.
    """

    def __init__(self,
                 height=None,
                 width=None,
                 win_length=None,
                 np_pieces=None):
        """Set up initial board configuration.

        Args:
            height: number of rows; falls back to DEFAULT_HEIGHT when falsy.
            width: number of columns; falls back to DEFAULT_WIDTH when falsy.
            win_length: run length needed to win; falls back to
                DEFAULT_WIN_LENGTH when falsy.
            np_pieces: existing array to use as the board (not copied);
                an empty board is created when None.
        """
        self.height = height or DEFAULT_HEIGHT
        self.width = width or DEFAULT_WIDTH
        self.win_length = win_length or DEFAULT_WIN_LENGTH

        if np_pieces is None:
            # `np.int` was only an alias of the builtin `int` and has been
            # removed from recent NumPy releases; the builtin is equivalent.
            self.np_pieces = np.zeros([self.height, self.width], dtype=int)
        else:
            self.np_pieces = np_pieces
            assert self.np_pieces.shape == (self.height, self.width)

    def add_stone(self, column, player):
        """Drop a stone of `player` into `column`, modifying the board in place.

        Raises:
            ValueError: if the column has no empty cell left.
        """
        available_idx, = np.where(self.np_pieces[:, column] == 0)
        if len(available_idx) == 0:
            raise ValueError(
                "Can't play column %s on board %s" % (column, self))

        # The largest empty row index is the lowest free cell of the column.
        self.np_pieces[available_idx[-1]][column] = player

    def get_valid_moves(self):
        """Return a boolean vector: True for every column whose top cell is empty."""
        return self.np_pieces[0] == 0

    def get_win_state(self):
        """Return a WinState(is_ended, winner).

        `winner` is 1 or -1 when that player has a winning line, and None
        for a draw (no playable column) or an unfinished game.
        """
        for player in [-1, 1]:
            player_pieces = self.np_pieces == -player
            # Check rows & columns for win
            if (self._is_straight_winner(player_pieces)
                    or self._is_straight_winner(player_pieces.transpose())
                    or self._is_diagonal_winner(player_pieces)):
                return WinState(True, -player)

        # draw has very little value.
        if not self.get_valid_moves().any():
            return WinState(True, None)

        # Game is not ended yet.
        return WinState(False, None)

    def with_np_pieces(self, np_pieces):
        """Create a Board with the same dimensions using `np_pieces`.

        The array is used as-is (no copy); when None, this board's own
        array is shared with the new Board.
        """
        if np_pieces is None:
            np_pieces = self.np_pieces
        return Board(self.height, self.width, self.win_length, np_pieces)

    def _is_diagonal_winner(self, player_pieces):
        """Checks if player_pieces contains a diagonal win."""
        win_length = self.win_length
        for i in range(len(player_pieces) - win_length + 1):
            # Diagonals running down-right from (i, j).
            for j in range(len(player_pieces[0]) - win_length + 1):
                if all(player_pieces[i + x][j + x] for x in range(win_length)):
                    return True
            # Diagonals running down-left from (i, j).
            for j in range(win_length - 1, len(player_pieces[0])):
                if all(player_pieces[i + x][j - x] for x in range(win_length)):
                    return True
        return False

    def _is_straight_winner(self, player_pieces):
        """Checks if any row of player_pieces holds `win_length` pieces in a row.

        Callers pass the transposed array to check columns. The number of
        sliding windows is derived from the row length (number of columns of
        the array passed in), so every window is examined for any board shape.
        """
        num_columns = player_pieces.shape[1]
        for start in range(num_columns - self.win_length + 1):
            window = player_pieces[:, start:start + self.win_length]
            # A window is a win when every one of its cells is occupied.
            if (window.sum(axis=1) >= self.win_length).any():
                return True
        return False

    def __str__(self):
        return str(self.np_pieces)
class Connect4Game(object):
    """
    Connect4 Game class implementing the alpha-zero-general Game interface.

    Use 1 for player1 and -1 for player2.
    """

    def __init__(self,
                 height=None,
                 width=None,
                 win_length=None,
                 np_pieces=None):
        # Template board: supplies the dimensions for every board built later.
        self._base_board = Board(height, width, win_length, np_pieces)

    def getInitBoard(self):
        """
        Returns:
            startBoard: a representation of the board (ideally this is the form
                        that will be the input to your neural network)
        """
        return self._base_board.np_pieces

    def getBoardSize(self):
        """
        Returns:
            (x,y): a tuple of board dimensions
        """
        return (self._base_board.height, self._base_board.width)

    def getActionSize(self):
        """
        Returns:
            actionSize: number of all possible actions (one per column)
        """
        return self._base_board.width

    def getNextState(self, board, player, action):
        """Returns a copy of the board with updated move, original board is unmodified.

        Input:
            board: current board
            player: current player (1 or -1)
            action: action taken by current player

        Returns:
            nextBoard: board after applying action
            nextPlayer: player who plays in the next turn (should be -player)
        """
        # Work on a copy so the caller's array is left untouched.
        b = self._base_board.with_np_pieces(np_pieces=np.copy(board))
        b.add_stone(action, player)
        return b.np_pieces, -player

    def getValidMoves(self, board, player):
        """Any zero value in top row is a valid move.

        Input:
            board: current board
            player: current player (unused)

        Returns:
            validMoves: a binary vector of length self.getActionSize(), 1 for
                        moves that are valid from the current board and player,
                        0 for invalid moves
        """
        return self._base_board.with_np_pieces(
            np_pieces=board).get_valid_moves()

    def getGameEnded(self, board, player):
        """
        Input:
            board: current board
            player: current player (1 or -1)

        Returns:
            r: 0 if game has not ended. 1 if player won, -1 if player lost,
               small non-zero value for draw.

        Raises:
            ValueError: if the reported winner is neither player, -player
                nor None.
        """
        b = self._base_board.with_np_pieces(np_pieces=board)
        winstate = b.get_win_state()
        if not winstate.is_ended:
            # 0 used to represent unfinished game.
            return 0

        if winstate.winner is None:
            # draw has very little value.
            return 1e-4
        if winstate.winner == player:
            return +1
        if winstate.winner == -player:
            return -1
        # Format the message explicitly: passing the state as a second
        # positional argument would render the message as a tuple.
        raise ValueError('Unexpected winstate found: {}'.format(winstate))

    def getCanonicalForm(self, board, player):
        """
        Input:
            board: current board
            player: current player (1 or -1)

        Returns:
            canonicalBoard: returns canonical form of board. The canonical form
                            should be independent of player. Here the board is
                            multiplied by `player`, so the player to move is
                            always represented by 1.
        """
        return board * player

    def getSymmetries(self, board, pi):
        """Board is left/right board symmetric

        Input:
            board: current board
            pi: policy vector of size self.getActionSize()

        Returns:
            symmForms: a list of [(board,pi)] where each tuple is a symmetrical
                       form of the board and the corresponding pi vector. This
                       is used when training the neural network from examples.
        """
        return [(board, pi),
                (np.array(board[:, ::-1], copy=True),
                 np.array(pi[::-1], copy=True))]

    def stringRepresentation(self, board):
        """
        Input:
            board: current board

        Returns:
            boardString: the raw bytes of the board array, used as a hashable
                         key. Required by MCTS for hashing.
        """
        # `tobytes` is the supported spelling of the deprecated `tostring`
        # and returns the same bytes.
        return board.tobytes()

    @staticmethod
    def display(board):
        """Print the column indices followed by the board."""
        print(" -----------------------")
        print(' '.join(map(str, range(len(board[0])))))
        print(board)
        print(" -----------------------")
# ===== connect4_model ======
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#class Connect4Model(parl.Model): # Kaggle doesn't support parl package
class Connect4Model(nn.Module):
    """Convolutional policy/value network for Connect4 boards."""

    def __init__(self, game, args):
        """
        Args:
            game: provides getBoardSize() and getActionSize().
            args: provides `num_channels` and `dropout`.
        """
        # game params
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()
        self.args = args

        super(Connect4Model, self).__init__()

        channels = args.num_channels
        # conv1/conv2 are padded (size preserved); conv3/conv4 are unpadded,
        # so each of them shrinks both spatial dimensions by 2.
        self.conv1 = nn.Conv2d(1, channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(channels, channels, 3, stride=1)
        self.conv4 = nn.Conv2d(channels, channels, 3, stride=1)

        self.bn1 = nn.BatchNorm2d(channels)
        self.bn2 = nn.BatchNorm2d(channels)
        self.bn3 = nn.BatchNorm2d(channels)
        self.bn4 = nn.BatchNorm2d(channels)

        flat_features = channels * (self.board_x - 4) * (self.board_y - 4)
        self.fc1 = nn.Linear(flat_features, 128)
        self.fc_bn1 = nn.BatchNorm1d(128)

        self.fc2 = nn.Linear(128, 64)
        self.fc_bn2 = nn.BatchNorm1d(64)

        # Policy head and value head.
        self.fc3 = nn.Linear(64, self.action_size)
        self.fc4 = nn.Linear(64, 1)

    def forward(self, s):
        """
        Args:
            s(torch.Tensor): batch_size x board_x x board_y

        Returns:
            Tuple (log_pi, v): log-softmax policy of shape
            batch_size x action_size and tanh value of shape batch_size x 1.
        """
        # batch_size x 1 x board_x x board_y
        s = s.view(-1, 1, self.board_x, self.board_y)

        # After the four stages:
        # batch_size x num_channels x (board_x-4) x (board_y-4)
        conv_stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, bn in conv_stages:
            s = F.relu(bn(conv(s)))

        flat_features = self.args.num_channels * (self.board_x - 4) * (
            self.board_y - 4)
        s = s.view(-1, flat_features)

        # batch_size x 128, then batch_size x 64
        for fc, bn in ((self.fc1, self.fc_bn1), (self.fc2, self.fc_bn2)):
            s = F.dropout(
                F.relu(bn(fc(s))),
                p=self.args.dropout,
                training=self.training)

        log_pi = F.log_softmax(self.fc3(s), dim=1)  # batch_size x action_size
        value = torch.tanh(self.fc4(s))  # batch_size x 1
        return log_pi, value
# ===== simple agent ======
# Network settings passed to Connect4Model by SimpleAgent.
args = dotdict({
    # dropout probability applied after each hidden fully-connected layer
    # (inactive in eval mode, which SimpleAgent.predict selects)
    'dropout': 0.3,
    # number of channels of every convolution layer
    'num_channels': 64,
})
class SimpleAgent():
    """Inference-only wrapper around Connect4Model (no parl dependency)."""

    def __init__(self, game, cuda=True):
        # GPU is used only when requested and actually available.
        self.cuda = cuda and torch.cuda.is_available()

        self.model = Connect4Model(game, args)
        if self.cuda:
            self.model.cuda()

        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()

    def predict(self, board):
        """Evaluate a single board.

        Args:
            board (np.array): input board

        Return:
            pi (np.array): probability of actions
            v (np.array): estimated value of input
        """
        tensor = torch.FloatTensor(board.astype(np.float64))
        if self.cuda:
            tensor = tensor.contiguous().cuda()
        # Add the batch dimension expected by the model.
        tensor = tensor.view(1, self.board_x, self.board_y)

        self.model.eval()  # eval mode
        with torch.no_grad():
            log_pi, v = self.model(tensor)
            pi = torch.exp(log_pi)

        pi_np = pi.data.cpu().numpy()[0]
        v_np = v.data.cpu().numpy()[0]
        return pi_np, v_np

    def load_checkpoint(self, buffer):
        """Load a state dict from `buffer` (mapped to CPU when CUDA is off)."""
        if self.cuda:
            map_location = None
        else:
            map_location = 'cpu'
        state = torch.load(buffer, map_location=map_location)
        self.model.load_state_dict(state)
# ===== predict function ======
import base64
import io
# Game rules object with the default board dimensions.
game = Connect4Game()
# AlphaZero players
agent = SimpleAgent(game)
# NOTE(review): `decoded` is not defined in this template; presumably it is
# provided by code placed before the template in the final submission file
# -- confirm against the submission generator.
buffer = io.BytesIO(decoded)
agent.load_checkpoint(buffer)
# MCTS.search reads `cpuct`. MCTS.getActionProb in this file stops on a time
# limit, so `numMCTSSims` is not read by the MCTS code visible here.
mcts_args = dotdict({'numMCTSSims': 800, 'cpuct': 1.0})
# Created without dirichlet noise (the default of MCTS).
mcts = MCTS(game, agent, mcts_args)
def alphazero_agent(obs, config):
    """Choose a column for the current observation using MCTS.

    Args:
        obs: observation with `board` (flat cell values, 2 marks the second
            player) and `mark` (2 when this agent is the second player).
        config: provides `timeout`; the search stops 0.5 before it.

    Returns:
        int: index of the selected column.
    """
    board = np.reshape(obs.board.copy(), game.getBoardSize()).astype(int)
    # Re-encode the second player's stones from 2 to -1.
    board[board == 2] = -1

    player = -1 if obs.mark == 2 else 1
    canonical = game.getCanonicalForm(board, player)

    # temp=0 yields a one-hot policy, so argmax returns the chosen action.
    probs = mcts.getActionProb(
        canonical, temp=0, timelimit=config.timeout - 0.5)
    return int(np.argmax(probs))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class dotdict(dict):
    """Dictionary whose keys can also be read as attributes (d.key)."""

    def __getattr__(self, name):
        """Return self[name]; raise AttributeError when the key is absent."""
        if name in self:
            return self[name]
        raise AttributeError(name)
def win_loss_draw(score):
    """Map a score to 'win' (positive), 'loss' (negative) or 'draw' (otherwise)."""
    return 'win' if score > 0 else ('loss' if score < 0 else 'draw')
def split_group(the_list, group_size):
    """Split one list into multiple groups of `group_size` items.

    Returns the result of zip() over `group_size` references to one shared
    iterator, so consecutive items are grouped into tuples. Trailing items
    that do not fill a whole group are dropped.
    """
    shared_iterator = iter(the_list)
    return zip(*([shared_iterator] * group_size))
import numpy as np
import json
from connect4_game import Connect4Game
def get_test_dataset():
    """Load the reference positions from the file "refmoves1k_kaggle".

    Each line of the file is a JSON object with a flat "board" list and a
    "move score" entry.

    Returns:
        list of dicts with keys 'board' (array shaped like the game board,
        with 2 re-encoded as -1), 'player' (1 or -1) and 'move_score'.
    """
    board_shape = Connect4Game().getBoardSize()
    test_dataset = []
    with open("refmoves1k_kaggle") as f:
        for line in f:
            data = json.loads(line)

            board = np.reshape(data["board"], board_shape).astype(int)
            board[board == 2] = -1

            # find out how many moves are played to set the correct mark.
            ply = sum(1 for cell in data["board"] if cell > 0)
            # An odd number of stones means the second player is to move.
            player = -1 if ply & 1 else 1

            entry = {
                'board': board,
                'player': player,
                'move_score': data['move score'],
            }
            test_dataset.append(entry)
    return test_dataset
...@@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind ...@@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind
from parl.utils.window_stat import WindowStat from parl.utils.window_stat import WindowStat
from parl.utils.time_stat import TimeStat from parl.utils.time_stat import TimeStat
from parl.utils import machine_info from parl.utils import machine_info
from parl.utils import logger, get_gpu_count, tensorboard from parl.utils import logger, get_gpu_count, summary
from parl.algorithms import A2C from parl.algorithms import A2C
from atari_model import ActorCritic from atari_model import ActorCritic
...@@ -205,18 +205,18 @@ class Learner(object): ...@@ -205,18 +205,18 @@ class Learner(object):
} }
if metric['mean_episode_rewards'] is not None: if metric['mean_episode_rewards'] is not None:
tensorboard.add_scalar('train/mean_reward', summary.add_scalar('train/mean_reward',
metric['mean_episode_rewards'], metric['mean_episode_rewards'],
self.sample_total_steps) self.sample_total_steps)
tensorboard.add_scalar('train/total_loss', metric['total_loss'], summary.add_scalar('train/total_loss', metric['total_loss'],
self.sample_total_steps) self.sample_total_steps)
tensorboard.add_scalar('train/pi_loss', metric['pi_loss'], summary.add_scalar('train/pi_loss', metric['pi_loss'],
self.sample_total_steps) self.sample_total_steps)
tensorboard.add_scalar('train/vf_loss', metric['vf_loss'], summary.add_scalar('train/vf_loss', metric['vf_loss'],
self.sample_total_steps) self.sample_total_steps)
tensorboard.add_scalar('train/entropy', metric['entropy'], summary.add_scalar('train/entropy', metric['entropy'],
self.sample_total_steps) self.sample_total_steps)
tensorboard.add_scalar('train/learn_rate', metric['lr'], summary.add_scalar('train/learn_rate', metric['lr'],
self.sample_total_steps) self.sample_total_steps)
logger.info(metric) logger.info(metric)
......
...@@ -16,16 +16,16 @@ import numpy as np ...@@ -16,16 +16,16 @@ import numpy as np
import copy import copy
from collections import deque, namedtuple from collections import deque, namedtuple
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver'])
class ReplayMemory(object): class ReplayMemory(object):
def __init__(self, max_size, state_shape, context_len): def __init__(self, max_size, obs_shape, context_len):
self.max_size = int(max_size) self.max_size = int(max_size)
self.state_shape = state_shape self.obs_shape = obs_shape
self.context_len = int(context_len) self.context_len = int(context_len)
self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8')
self.action = np.zeros((self.max_size, ), dtype='int32') self.action = np.zeros((self.max_size, ), dtype='int32')
self.reward = np.zeros((self.max_size, ), dtype='float32') self.reward = np.zeros((self.max_size, ), dtype='float32')
self.isOver = np.zeros((self.max_size, ), dtype='bool') self.isOver = np.zeros((self.max_size, ), dtype='bool')
...@@ -48,42 +48,41 @@ class ReplayMemory(object): ...@@ -48,42 +48,41 @@ class ReplayMemory(object):
else: else:
self._context.append(exp) self._context.append(exp)
def recent_state(self): def recent_obs(self):
""" maintain recent state for training""" """ maintain recent obs for training"""
lst = list(self._context) lst = list(self._context)
states = [np.zeros(self.state_shape, dtype='uint8')] * \ obs = [np.zeros(self.obs_shape, dtype='uint8')] * \
(self._context.maxlen - len(lst)) (self._context.maxlen - len(lst))
states.extend([k.state for k in lst]) obs.extend([k.obs for k in lst])
return states return obs
def sample(self, idx): def sample(self, idx):
""" return state, action, reward, isOver, """ return obs, action, reward, isOver,
note that some frames in state may be generated from last episode, note that some frames in obs may be generated from last episode,
they should be removed from state they should be removed from obs
""" """
state = np.zeros( obs = np.zeros(
(self.context_len + 1, ) + self.state_shape, dtype=np.uint8) (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8)
state_idx = np.arange(idx, obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size
idx + self.context_len + 1) % self._curr_size
# confirm that no frame was generated from last episode # confirm that no frame was generated from last episode
has_last_episode = False has_last_episode = False
for k in range(self.context_len - 2, -1, -1): for k in range(self.context_len - 2, -1, -1):
to_check_idx = state_idx[k] to_check_idx = obs_idx[k]
if self.isOver[to_check_idx]: if self.isOver[to_check_idx]:
has_last_episode = True has_last_episode = True
state_idx = state_idx[k + 1:] obs_idx = obs_idx[k + 1:]
state[k + 1:] = self.state[state_idx] obs[k + 1:] = self.obs[obs_idx]
break break
if not has_last_episode: if not has_last_episode:
state = self.state[state_idx] obs = self.obs[obs_idx]
real_idx = (idx + self.context_len - 1) % self._curr_size real_idx = (idx + self.context_len - 1) % self._curr_size
action = self.action[real_idx] action = self.action[real_idx]
reward = self.reward[real_idx] reward = self.reward[real_idx]
isOver = self.isOver[real_idx] isOver = self.isOver[real_idx]
return state, reward, action, isOver return obs, reward, action, isOver
def __len__(self): def __len__(self):
return self._curr_size return self._curr_size
...@@ -92,7 +91,7 @@ class ReplayMemory(object): ...@@ -92,7 +91,7 @@ class ReplayMemory(object):
return self._curr_size return self._curr_size
def _assign(self, pos, exp): def _assign(self, pos, exp):
self.state[pos] = exp.state self.obs[pos] = exp.obs
self.reward[pos] = exp.reward self.reward[pos] = exp.reward
self.action[pos] = exp.action self.action[pos] = exp.action
self.isOver[pos] = exp.isOver self.isOver[pos] = exp.isOver
...@@ -107,8 +106,8 @@ class ReplayMemory(object): ...@@ -107,8 +106,8 @@ class ReplayMemory(object):
return self._process_batch(batch_exp) return self._process_batch(batch_exp)
def _process_batch(self, batch_exp): def _process_batch(self, batch_exp):
state = np.asarray([e[0] for e in batch_exp], dtype='uint8') obs = np.asarray([e[0] for e in batch_exp], dtype='uint8')
reward = np.asarray([e[1] for e in batch_exp], dtype='float32') reward = np.asarray([e[1] for e in batch_exp], dtype='float32')
action = np.asarray([e[2] for e in batch_exp], dtype='int8') action = np.asarray([e[2] for e in batch_exp], dtype='int8')
isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool')
return [state, action, reward, isOver] return [obs, action, reward, isOver]
...@@ -22,11 +22,11 @@ import parl ...@@ -22,11 +22,11 @@ import parl
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
from parl.utils import tensorboard, logger from parl.utils import summary, logger
from parl.algorithms import DQN, DDQN from parl.algorithms import DQN, DDQN
from agent import AtariAgent from agent import AtariAgent
from atari_wrapper import FireResetEnv, FrameStack, LimitLength, MapState from atari_wrapper import FireResetEnv, FrameStack, LimitLength
from model import AtariModel from model import AtariModel
from replay_memory import ReplayMemory, Experience from replay_memory import ReplayMemory, Experience
from utils import get_player from utils import get_player
...@@ -43,57 +43,57 @@ GAMMA = 0.99 ...@@ -43,57 +43,57 @@ GAMMA = 0.99
def run_train_episode(env, agent, rpm): def run_train_episode(env, agent, rpm):
total_reward = 0 total_reward = 0
all_cost = [] all_cost = []
state = env.reset() obs = env.reset()
steps = 0 steps = 0
while True: while True:
steps += 1 steps += 1
context = rpm.recent_state() context = rpm.recent_obs()
context.append(state) context.append(obs)
context = np.stack(context, axis=0) context = np.stack(context, axis=0)
action = agent.sample(context) action = agent.sample(context)
next_state, reward, isOver, _ = env.step(action) next_obs, reward, isOver, _ = env.step(action)
rpm.append(Experience(state, action, reward, isOver)) rpm.append(Experience(obs, action, reward, isOver))
if rpm.size() > MEMORY_WARMUP_SIZE: if rpm.size() > MEMORY_WARMUP_SIZE:
if steps % UPDATE_FREQ == 0: if steps % UPDATE_FREQ == 0:
batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
args.batch_size) args.batch_size)
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
batch_next_state = batch_all_state[:, 1:, :, :] batch_next_obs = batch_all_obs[:, 1:, :, :]
cost = agent.learn(batch_state, batch_action, batch_reward, cost = agent.learn(batch_obs, batch_action, batch_reward,
batch_next_state, batch_isOver) batch_next_obs, batch_isOver)
all_cost.append(cost) all_cost.append(cost)
total_reward += reward total_reward += reward
state = next_state obs = next_obs
if isOver: if isOver:
mean_loss = np.mean(all_cost) if all_cost else None mean_loss = np.mean(all_cost) if all_cost else None
return total_reward, steps, mean_loss return total_reward, steps, mean_loss
def run_evaluate_episode(env, agent): def run_evaluate_episode(env, agent):
state = env.reset() obs = env.reset()
total_reward = 0 total_reward = 0
while True: while True:
pred_Q = agent.predict(state) pred_Q = agent.predict(obs)
action = pred_Q.max(1)[1].item() action = pred_Q.max(1)[1].item()
state, reward, isOver, _ = env.step(action) obs, reward, isOver, _ = env.step(action)
total_reward += reward total_reward += reward
if isOver: if isOver:
return total_reward return total_reward
def get_fixed_states(rpm, batch_size): def get_fixed_obs(rpm, batch_size):
states = [] obs = []
for _ in range(3): for _ in range(3):
batch_all_state = rpm.sample_batch(batch_size)[0] batch_all_obs = rpm.sample_batch(batch_size)[0]
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
states.append(batch_state) obs.append(batch_obs)
fixed_states = np.concatenate(states, axis=0) fixed_obs = np.concatenate(obs, axis=0)
return fixed_states return fixed_obs
def evaluate_fixed_Q(agent, states): def evaluate_fixed_Q(agent, obs):
with torch.no_grad(): with torch.no_grad():
max_pred_Q = agent.alg.model(states).max(1)[0].mean() max_pred_Q = agent.alg.model(obs).max(1)[0].mean()
return max_pred_Q.item() return max_pred_Q.item()
...@@ -131,9 +131,9 @@ def main(): ...@@ -131,9 +131,9 @@ def main():
total_reward, steps, _ = run_train_episode(env, agent, rpm) total_reward, steps, _ = run_train_episode(env, agent, rpm)
pbar.update(steps) pbar.update(steps)
# Get fixed states to check value function. # Get fixed obs to check value function.
fixed_states = get_fixed_states(rpm, args.batch_size) fixed_obs = get_fixed_obs(rpm, args.batch_size)
fixed_states = torch.tensor(fixed_states, dtype=torch.float, device=device) fixed_obs = torch.tensor(fixed_obs, dtype=torch.float, device=device)
# train # train
test_flag = 0 test_flag = 0
...@@ -152,18 +152,17 @@ def main(): ...@@ -152,18 +152,17 @@ def main():
for _ in range(3): for _ in range(3):
eval_rewards.append(run_evaluate_episode(test_env, agent)) eval_rewards.append(run_evaluate_episode(test_env, agent))
tensorboard.add_scalar('dqn/eval', np.mean(eval_rewards), summary.add_scalar('dqn/eval', np.mean(eval_rewards),
total_steps) total_steps)
tensorboard.add_scalar('dqn/score', total_reward, total_steps) summary.add_scalar('dqn/score', total_reward, total_steps)
tensorboard.add_scalar('dqn/loss', loss, total_steps) summary.add_scalar('dqn/loss', loss, total_steps)
tensorboard.add_scalar('dqn/exploration', agent.exploration, summary.add_scalar('dqn/exploration', agent.exploration,
total_steps) total_steps)
tensorboard.add_scalar('dqn/Q value', summary.add_scalar('dqn/Q value',
evaluate_fixed_Q(agent, fixed_states), evaluate_fixed_Q(agent, fixed_obs),
total_steps)
tensorboard.add_scalar('dqn/grad_norm',
get_grad_norm(agent.alg.model),
total_steps) total_steps)
summary.add_scalar('dqn/grad_norm',
get_grad_norm(agent.alg.model), total_steps)
if __name__ == '__main__': if __name__ == '__main__':
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import torch
def get_args():
    """Parse the command-line options of the PPO example.

    Returns:
        argparse.Namespace: the parsed options, with an extra ``cuda``
        attribute set to ``torch.cuda.is_available()``.
    """
    parser = argparse.ArgumentParser(description='RL')
    parser.add_argument(
        '--lr', type=float, default=3e-4, help='learning rate (default: 3e-4)')
    parser.add_argument(
        '--eps',
        type=float,
        default=1e-5,
        help='RMSprop optimizer epsilon (default: 1e-5)')
    parser.add_argument(
        '--gamma',
        type=float,
        default=0.99,
        help='discount factor for rewards (default: 0.99)')
    parser.add_argument(
        '--gae-lambda',
        type=float,
        default=0.95,
        help='gae lambda parameter (default: 0.95)')
    parser.add_argument(
        '--entropy-coef',
        type=float,
        default=0.,
        help='entropy term coefficient (default: 0.)')
    parser.add_argument(
        '--value-loss-coef',
        type=float,
        default=0.5,
        help='value loss coefficient (default: 0.5)')
    parser.add_argument(
        '--max-grad-norm',
        type=float,
        default=0.5,
        help='max norm of gradients (default: 0.5)')
    parser.add_argument(
        '--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument(
        '--num-steps',
        type=int,
        default=2048,
        help='number of maximum forward steps in ppo (default: 2048)')
    parser.add_argument(
        '--ppo-epoch',
        type=int,
        default=10,
        help='number of ppo epochs (default: 10)')
    parser.add_argument(
        '--num-mini-batch',
        type=int,
        default=32,
        help='number of batches for ppo (default: 32)')
    parser.add_argument(
        '--clip-param',
        type=float,
        default=0.2,
        help='ppo clip parameter (default: 0.2)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=1,
        help='log interval, one log per n updates (default: 1)')
    parser.add_argument(
        '--eval-interval',
        type=int,
        default=10,
        help='eval interval, one eval per n updates (default: 10)')
    parser.add_argument(
        '--num-env-steps',
        type=int,
        # argparse applies `type` only to string defaults, so the float
        # literal 10e5 would be returned as a float; convert it up front.
        default=int(10e5),
        help='number of environment steps to train (default: 10e5)')
    parser.add_argument(
        '--env-name',
        default='Hopper-v2',
        help='environment to train on (default: Hopper-v2)')
    parser.add_argument(
        '--use-linear-lr-decay',
        action='store_true',
        default=False,
        help='use a linear schedule on the learning rate')

    args = parser.parse_args()
    # Not a command-line flag: derived from the local torch installation.
    args.cuda = torch.cuda.is_available()
    return args
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import torch
import utils
from wrapper import make_env
def evaluate(agent, ob_rms, env_name, seed, device):
    """Run `agent` in a freshly built environment until 10 episodes finish.

    Args:
        agent: object whose ``predict(obs)`` returns the action to execute.
        ob_rms: observation statistics assigned to the evaluation
            environment's normalizer, when it has one.
        env_name: name passed to ``make_env``.
        seed: environment seed; when not None the environment is seeded
            with ``seed + 1``.
        device: unused here; kept so existing callers keep working.

    Returns:
        The mean of the collected episode rewards.
    """
    if seed is not None:
        seed += 1
    # gamma is passed as None for the evaluation environment.
    eval_envs = make_env(env_name, seed, None)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []
    obs = eval_envs.reset()

    while len(eval_episode_rewards) < 10:
        with torch.no_grad():
            action = agent.predict(obs)

        # Observe the next obs; the per-step reward is not needed because
        # the episode total is read from `info['episode']` below.
        obs, _, done, infos = eval_envs.step(action)

        for info in infos:
            if 'episode' in info:
                eval_episode_rewards.append(info['episode']['r'])

    eval_envs.close()

    mean_reward = np.mean(eval_episode_rewards)
    print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(eval_episode_rewards), mean_reward))
    return mean_reward
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import torch
class MujocoAgent(parl.Agent):
    """Agent that converts numpy data to torch tensors for the algorithm.

    Every method moves its numpy inputs to ``self.device`` before calling
    the algorithm and returns numpy arrays / plain numbers to the caller.
    """

    def __init__(self, algorithm, device):
        """Store the algorithm and the torch device tensors are placed on."""
        self.alg = algorithm
        self.device = device

    def predict(self, obs):
        """Return ``self.alg.predict`` for `obs` as a numpy array."""
        obs = torch.from_numpy(obs).float().to(self.device)
        action = self.alg.predict(obs)
        return action.cpu().numpy()

    def sample(self, obs):
        """Return ``(value, action, action_log_probs)`` as numpy arrays."""
        obs = torch.from_numpy(obs).to(self.device)
        value, action, action_log_probs = self.alg.sample(obs)
        return value.cpu().numpy(), action.cpu().numpy(), \
            action_log_probs.cpu().numpy()

    def learn(self, next_value, gamma, gae_lambda, ppo_epoch, num_mini_batch,
              rollouts):
        """Run `ppo_epoch` passes of mini-batch updates over `rollouts`.

        Args:
            next_value: value estimate forwarded to ``rollouts.sample_batch``.
            gamma: discount factor forwarded to ``rollouts.sample_batch``.
            gae_lambda: GAE lambda forwarded to ``rollouts.sample_batch``.
            ppo_epoch: number of passes over the rollout data.
            num_mini_batch: number of mini-batches requested per pass.
            rollouts: storage whose ``sample_batch`` yields 6-tuples of
                numpy arrays.

        Returns:
            ``(value_loss, action_loss, dist_entropy)``, each summed over
            all updates and divided by ``ppo_epoch * num_mini_batch``.
        """
        value_loss_epoch = 0
        action_loss_epoch = 0
        dist_entropy_epoch = 0

        for _ in range(ppo_epoch):
            data_generator = rollouts.sample_batch(next_value, gamma,
                                                   gae_lambda, num_mini_batch)

            for sample in data_generator:
                # Place every array on the agent's device instead of a
                # hard-coded 'cuda', so training also works on CPU.
                (obs_batch, actions_batch, value_preds_batch, return_batch,
                 old_action_log_probs_batch, adv_targ) = [
                     torch.from_numpy(array).to(self.device)
                     for array in sample
                 ]

                value_loss, action_loss, dist_entropy = self.alg.learn(
                    obs_batch, actions_batch, value_preds_batch, return_batch,
                    old_action_log_probs_batch, adv_targ)

                value_loss_epoch += value_loss
                action_loss_epoch += action_loss
                dist_entropy_epoch += dist_entropy

        num_updates = ppo_epoch * num_mini_batch

        value_loss_epoch /= num_updates
        action_loss_epoch /= num_updates
        dist_entropy_epoch /= num_updates

        return value_loss_epoch, action_loss_epoch, dist_entropy_epoch

    def value(self, obs):
        """Return ``self.alg.value`` for `obs` as a numpy array."""
        obs = torch.from_numpy(obs).to(self.device)
        return self.alg.value(obs).cpu().numpy()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
class MujocoModel(parl.Model):
    """Bundle of an `Actor` (policy) and a `Critic` (value) network."""

    def __init__(self, obs_dim, act_dim):
        """Build both sub-networks for the given obs/action dimensions."""
        super(MujocoModel, self).__init__()
        # Actor is created before the critic, as in the original layout.
        self.actor = Actor(obs_dim, act_dim)
        self.critic = Critic(obs_dim)

    def policy(self, obs):
        """Forward `obs` through the actor and return its output."""
        actor_out = self.actor(obs)
        return actor_out

    def value(self, obs):
        """Forward `obs` through the critic and return its output."""
        critic_out = self.critic(obs)
        return critic_out
class Actor(parl.Model):
    """Two tanh hidden layers of width 64 followed by a linear mean head.

    ``log_std`` is a learnable parameter of size `act_dim`, initialised to
    zeros and returned unchanged by `forward`.
    """

    def __init__(self, obs_dim, act_dim):
        super(Actor, self).__init__()

        self.fc1 = nn.Linear(obs_dim, 64)
        self.fc2 = nn.Linear(64, 64)

        self.fc_mean = nn.Linear(64, act_dim)
        self.log_std = nn.Parameter(torch.zeros(act_dim))

    def forward(self, obs):
        """Return ``(mean, log_std)`` for `obs`."""
        hidden = torch.tanh(self.fc2(torch.tanh(self.fc1(obs))))
        return self.fc_mean(hidden), self.log_std
class Critic(parl.Model):
    """Two tanh hidden layers of width 64 followed by a 1-unit linear head."""

    def __init__(self, obs_dim):
        super(Critic, self).__init__()

        self.fc1 = nn.Linear(obs_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, obs):
        """Return the output of the final linear layer for `obs`."""
        hidden = torch.tanh(self.fc2(torch.tanh(self.fc1(obs))))
        return self.fc3(hidden)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
class RolloutStorage(object):
    """Fixed-length numpy buffer holding one rollout of `num_steps` steps.

    ``obs``, ``value_preds``, ``returns``, ``masks`` and ``bad_masks`` have
    ``num_steps + 1`` rows; ``actions``, ``action_log_probs`` and
    ``rewards`` have ``num_steps`` rows. ``step`` is the write cursor and
    wraps around after `num_steps` appends.
    """

    def __init__(self, num_steps, obs_dim, act_dim):
        self.num_steps = num_steps
        self.obs_dim = obs_dim
        self.act_dim = act_dim

        self.obs = np.zeros((num_steps + 1, obs_dim), dtype='float32')
        self.actions = np.zeros((num_steps, act_dim), dtype='float32')
        self.value_preds = np.zeros((num_steps + 1, ), dtype='float32')
        self.returns = np.zeros((num_steps + 1, ), dtype='float32')
        self.action_log_probs = np.zeros((num_steps, ), dtype='float32')
        self.rewards = np.zeros((num_steps, ), dtype='float32')

        self.masks = np.ones((num_steps + 1, ), dtype='bool')
        self.bad_masks = np.ones((num_steps + 1, ), dtype='bool')

        self.step = 0

    def append(self, obs, actions, action_log_probs, value_preds, rewards,
               masks, bad_masks):
        """Write one transition at the cursor and advance it.

        `obs`, `masks` and `bad_masks` are stored at index ``step + 1``;
        the remaining values are stored at index ``step``.
        """
        self.obs[self.step + 1] = obs
        self.actions[self.step] = actions
        self.rewards[self.step] = rewards
        self.action_log_probs[self.step] = action_log_probs
        self.value_preds[self.step] = value_preds
        self.masks[self.step + 1] = masks
        self.bad_masks[self.step + 1] = bad_masks

        self.step = (self.step + 1) % self.num_steps

    def sample_batch(self,
                     next_value,
                     gamma,
                     gae_lambda,
                     num_mini_batch,
                     mini_batch_size=None):
        """Yield shuffled mini-batches of the stored rollout.

        Returns and normalised advantages are computed first.

        Args:
            next_value: value estimate written to the last value slot.
            gamma: discount factor.
            gae_lambda: GAE lambda.
            num_mini_batch: number of mini-batches; used to derive the
                batch size when `mini_batch_size` is None.
            mini_batch_size: explicit batch size; defaults to
                ``num_steps // num_mini_batch``.

        Yields:
            ``(obs, actions, value_preds, returns, old_action_log_probs,
            advantages)``; all but the first two are reshaped to (-1, 1).
            Incomplete trailing batches are dropped.
        """
        # calculate return and advantage first
        self.compute_returns(next_value, gamma, gae_lambda)
        advantages = self.returns[:-1] - self.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (
            advantages.std() + 1e-5)

        # Honour an explicit batch size instead of silently overwriting it.
        if mini_batch_size is None:
            mini_batch_size = self.num_steps // num_mini_batch

        # generate sample batch
        sampler = BatchSampler(
            SubsetRandomSampler(range(self.num_steps)),
            mini_batch_size,
            drop_last=True)
        for indices in sampler:
            obs_batch = self.obs[:-1][indices]
            actions_batch = self.actions[indices]
            value_preds_batch = self.value_preds[:-1][indices].reshape(-1, 1)
            returns_batch = self.returns[:-1][indices].reshape(-1, 1)
            old_action_log_probs_batch = self.action_log_probs[
                indices].reshape(-1, 1)
            adv_targ = advantages[indices].reshape(-1, 1)

            yield (obs_batch, actions_batch, value_preds_batch, returns_batch,
                   old_action_log_probs_batch, adv_targ)

    def after_update(self):
        """Copy the last obs/mask/bad_mask row into row 0."""
        self.obs[0] = np.copy(self.obs[-1])
        self.masks[0] = np.copy(self.masks[-1])
        self.bad_masks[0] = np.copy(self.bad_masks[-1])

    def compute_returns(self, next_value, gamma, gae_lambda):
        """Fill ``returns[:-1]`` by a backward GAE recursion.

        ``masks[step + 1]`` gates both the bootstrap from the next value
        and the carried-over advantage; ``bad_masks[step + 1]`` multiplies
        the advantage at that step.
        """
        self.value_preds[-1] = next_value
        gae = 0
        for step in reversed(range(self.rewards.size)):
            delta = self.rewards[step] + gamma * self.value_preds[
                step + 1] * self.masks[step + 1] - self.value_preds[step]
            gae = delta + gamma * gae_lambda * self.masks[step + 1] * gae
            gae = gae * self.bad_masks[step + 1]
            self.returns[step] = gae + self.value_preds[step]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# modified from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail
import copy
import os
from collections import deque
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import utils
from arguments import get_args
from wrapper import make_env
from mujoco_model import MujocoModel
from parl.algorithms import PPO
from mujoco_agent import MujocoAgent
from storage import RolloutStorage
from evaluation import evaluate
def main():
    """Train a PPO agent on the environment selected on the command line.

    Each update collects `args.num_steps` transitions into a
    `RolloutStorage`, runs `agent.learn` on them, and periodically prints
    training statistics and runs `evaluate`.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(
        model,
        args.clip_param,
        args.value_loss_coef,
        args.entropy_coef,
        initial_lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions; the current observation is read back from
            # the rollout buffer rather than from the local `obs`.
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info:
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        # The returned losses are not logged; only the update matters here.
        agent.learn(next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
                    args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            # Only the values the template actually prints are passed.
            print(
                "Updates {}, num timesteps {},\n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards)))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            # evaluate() prints its own summary; the return value is unused.
            evaluate(agent, ob_rms, args.env_name, args.seed, device)


if __name__ == "__main__":
    main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
import torch
import torch.nn as nn
from wrapper import VecNormalize
def get_vec_normalize(venv):
    """Follow the ``.venv`` chain from `venv` to the first `VecNormalize`.

    Returns:
        The first `VecNormalize` instance found, or None when the chain
        ends (an object without a ``venv`` attribute) before one is found.
    """
    current = venv
    while True:
        if isinstance(current, VecNormalize):
            return current
        if not hasattr(current, 'venv'):
            return None
        current = current.venv
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    """Set every param group's ``lr`` to a linearly decayed value.

    The rate is ``initial_lr`` at epoch 0 and reaches 0 when `epoch`
    equals `total_num_epochs`.
    """
    progress = epoch / float(total_num_epochs)
    decayed_lr = initial_lr - (initial_lr * progress)
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr
def init(module, weight_init, bias_init, gain=1):
    """Initialise `module` in place and return it.

    `weight_init` is called with the weight data and ``gain`` as a keyword
    argument; `bias_init` is called with the bias data.
    """
    weights = module.weight.data
    weight_init(weights, gain=gain)
    biases = module.bias.data
    bias_init(biases)
    return module
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Simplified version of https://github.com/ShangtongZhang/DeepRL/blob/master/deep_rl/component/envs.py
import numpy as np
import gym
from gym.core import Wrapper
import time
class TimeLimitMask(gym.Wrapper):
    """Tag episode ends that coincide with the wrapped env's step limit."""

    def step(self, action):
        """Step the env; set ``info['bad_transition']`` on a step-limit end.

        The flag is set when `done` is true and the wrapped env's
        ``_elapsed_steps`` equals its ``_max_episode_steps``.
        """
        obs, rew, done, info = self.env.step(action)
        if done:
            # Only inspect the counters once the episode has ended.
            if self.env._max_episode_steps == self.env._elapsed_steps:
                info['bad_transition'] = True
        return obs, rew, done, info

    def reset(self, **kwargs):
        """Forward the reset (and its keyword arguments) unchanged."""
        first_obs = self.env.reset(**kwargs)
        return first_obs
class MonitorEnv(gym.Wrapper):
    """Record per-episode reward, length and elapsed time into `info`."""

    def __init__(self, env):
        Wrapper.__init__(self, env=env)
        self.tstart = time.time()
        # Becomes a list on the first reset().
        self.rewards = None

    def step(self, action):
        """Step the env and update the episode bookkeeping."""
        transition = self.env.step(action)
        self.update(*transition)
        return tuple(transition)

    def update(self, ob, rew, done, info):
        """Accumulate `rew`; when `done`, publish the summary and reset.

        The summary is stored under ``info['episode']`` with keys ``r``
        (reward sum), ``l`` (number of steps) and ``t`` (seconds since the
        wrapper was created).
        """
        self.rewards.append(rew)
        if not done:
            return

        episode_return = sum(self.rewards)
        episode_length = len(self.rewards)
        epinfo = {
            "r": round(episode_return, 6),
            "l": episode_length,
            "t": round(time.time() - self.tstart, 6)
        }
        assert isinstance(info, dict)
        info['episode'] = epinfo
        # Also resets the wrapped env; the reset observation is discarded.
        self.reset()

    def reset(self, **kwargs):
        """Clear the reward history and reset the wrapped env."""
        self.rewards = []
        return self.env.reset(**kwargs)
class VectorEnv(gym.Wrapper):
    """Give the outputs of a single env's `step` a leading axis of size 1."""

    def step(self, action):
        """Return ``(ob[None, :], [rew], [done], [info])`` for one step."""
        ob, rew, done, info = self.env.step(action)
        batched_ob = np.array(ob)[np.newaxis, :]
        batched_rew = np.array([rew])
        batched_done = np.array([done])
        return (batched_ob, batched_rew, batched_done, [info])
class RunningMeanStd(object):
    """Running mean / variance / count, updated one batch at a time.

    See https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    """

    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype='float64')
        self.var = np.ones(shape, dtype='float64')
        # The count starts at `epsilon` rather than 0.
        self.count = epsilon

    def update(self, x):
        """Fold the batch `x` (statistics taken over axis 0) into the state."""
        self.update_from_moments(
            np.mean(x, axis=0), np.var(x, axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge precomputed batch moments into the running statistics."""
        merged = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var,
            batch_count)
        self.mean, self.var, self.count = merged
def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var,
                                       batch_count):
    """Combine running moments with a batch's moments.

    Returns:
        ``(new_mean, new_var, new_count)`` for the union of the data
        summarised by the running statistics and by the batch.
    """
    delta = batch_mean - mean
    tot_count = count + batch_count

    new_mean = mean + delta * batch_count / tot_count

    # Sum of both parts' scaled variances plus the between-part term.
    running_m2 = var * count
    batch_m2 = batch_var * batch_count
    combined_m2 = running_m2 + batch_m2 + np.square(
        delta) * count * batch_count / tot_count

    return new_mean, combined_m2 / tot_count, tot_count
class VecNormalize(gym.Wrapper):
    """Normalise and clip observations and rewards with running statistics.

    Args:
        env: environment to wrap.
        ob: keep observation statistics and normalise observations.
        ret: keep discounted-return statistics and scale rewards by them.
        clipob: observations are clipped to ``[-clipob, clipob]``.
        cliprew: rewards are clipped to ``[-cliprew, cliprew]``.
        gamma: discount used for the running return.
        epsilon: added to the variance before taking the square root.
    """

    def __init__(self,
                 env,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        Wrapper.__init__(self, env=env)
        observation_space = env.observation_space.shape[0]
        self.ob_rms = RunningMeanStd(shape=observation_space) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None

        self.clipob = clipob
        self.cliprew = cliprew
        self.gamma = gamma
        self.epsilon = epsilon
        self.ret = np.zeros(1)
        # When False, `_obfilt` stops updating the observation statistics.
        self.training = True

    def step(self, action):
        """Step the env and return the normalised observation and reward."""
        ob, rew, new, info = self.env.step(action)
        self.ret = self.ret * self.gamma + rew
        # normalize observation
        ob = self._obfilt(ob)
        # normalize reward
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rew = np.clip(rew / np.sqrt(self.ret_rms.var + self.epsilon),
                          -self.cliprew, self.cliprew)
        # Restart the running return where `new` is set.
        self.ret[new] = 0.
        return ob, rew, new, info

    def reset(self):
        """Reset the running return and the env; return the filtered obs."""
        self.ret = np.zeros(1)
        ob = self.env.reset()
        return self._obfilt(ob)

    def _obfilt(self, ob, update=True):
        """Normalise and clip `ob`; update the statistics when training."""
        if not self.ob_rms:
            return ob
        if self.training and update:
            self.ob_rms.update(ob)
        return np.clip((ob - self.ob_rms.mean) /
                       np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob,
                       self.clipob)

    def train(self):
        """Enable updates of the observation statistics."""
        self.training = True

    def eval(self):
        """Freeze the observation statistics.

        Must clear `training` — the flag `_obfilt` reads — otherwise the
        statistics keep changing while evaluating.
        """
        self.training = False
def make_env(env_name, seed, gamma):
    """Create `env_name`, seed it and apply the wrapper stack.

    Wrappers are applied in the order TimeLimitMask, MonitorEnv, VectorEnv,
    VecNormalize. When `gamma` is None, VecNormalize is built with
    ``ret=False``; otherwise it is built with ``gamma=gamma``.
    """
    base_env = gym.make(env_name)
    base_env.seed(seed)

    wrapped = VectorEnv(MonitorEnv(TimeLimitMask(base_env)))

    if gamma is None:
        return VecNormalize(wrapped, ret=False)
    return VecNormalize(wrapped, gamma=gamma)
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
import gym import gym
import argparse import argparse
import numpy as np import numpy as np
from parl.utils import logger, tensorboard, ReplayMemory from parl.utils import logger, summary, ReplayMemory
from mujoco_model import MujocoModel from mujoco_model import MujocoModel
from mujoco_agent import MujocoAgent from mujoco_agent import MujocoAgent
...@@ -103,8 +103,7 @@ def main(): ...@@ -103,8 +103,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm) train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
tensorboard.add_scalar('train/episode_reward', train_reward, summary.add_scalar('train/episode_reward', train_reward, total_steps)
total_steps)
if total_steps // args.test_every_steps >= test_flag: if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag:
...@@ -112,7 +111,7 @@ def main(): ...@@ -112,7 +111,7 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent) evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format( logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward)) total_steps, evaluate_reward))
tensorboard.add_scalar('eval/episode_reward', evaluate_reward, summary.add_scalar('eval/episode_reward', evaluate_reward,
total_steps) total_steps)
......
minimal example
---------------------
``本教程的目标:
演示如何通过EvoKit库来解决经典的CartPole 问题。``
*本教程假定读者曾经使用过PaddlePaddle, 了解基本的进化算法迭代流程。*
CartPole 介绍
#############
CartPole又叫倒立摆。小车上放了一根杆,杆会因重力而倒下。为了不让杆倒下,我们要通过移动小车,来保持其是直立的。如下图所示。
在每一个时间步,模型的输入是一个4维的向量,表示当前小车和杆的状态,模型输出的信号用于控制小车往左或者右移动。当杆没有倒下的时候,每个时间步,环境会给1分的奖励;当杆倒下后,环境不会给任何的奖励,游戏结束。
.. image:: ../../examples/QuickStart/performance.gif
:width: 300px
step1: 生成预测网络
########################
根据上面的环境介绍,我们需要构造一个神经网络,输入为4维的向量,输出为2维的概率分布向量(表示左/右)移动的概率。
在这里,我们使用Paddle来实现预测网络,并保存到本地。
.. code-block:: python
from paddle import fluid
def net(obs, act_dim):
hid1 = fluid.layers.fc(obs, size=20)
prob = fluid.layers.fc(hid1, size=act_dim, act='softmax')
return prob
if __name__ == '__main__':
obs_dim = 4
act_dim = 2
obs = fluid.layers.data(name="obs", shape=[obs_dim], dtype='float32')
prob = net(obs, act_dim)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
fluid.io.save_inference_model(
dirname='init_model',
feeded_var_names=['obs'],
target_vars=[prob],
params_filename='params',
model_filename='model',
executor=exe)
step2: 构造ESAgent
###################
- 调用 ``load_config`` 加载配置文件。
- 调用 ``load_inference_model`` 函数加载模型参数。
- 调用 ``init_solver`` 初始化solver。
配置文件主要是用于指定进化算法类型(比如Gaussian或者CMA),使用的optimizer类型(Adam或者SGD)。
.. code-block:: c++
ESAgent agent = ESAgent();
agent.load_config(config);
agent.load_inference_model(model_dir);
agent.init_solver();
// 附:EvoKit配置项示范
solver {
type: BASIC_ES
optimizer { // 线下Adam更新
type: ADAM
base_lr: 0.05
adam {
beta1: 0.9
beta2: 0.999
epsilon: 1e-08
}
}
sampling { // 线上高斯采样
type: GAUSSIAN_SAMPLING
gaussian_sampling {
std: 0.5
cached: true
seed: 1024
cache_size : 100000
}
}
}
step3: 生成用于采样的Agent
##############################
主要关注三个接口:
- 调用 ``clone`` 生成一个用于sampling的agent。
- 调用 ``add_noise`` 给这个agent的参数空间增加噪声,同时返回该噪声对应的唯一信息,这个信息得记录在log中,用于线下更新。
- 调用 ``predict`` 提供预测接口。
.. code-block:: c++
auto sampling_agent = agent.clone();
auto sampling_info = sampling_agent.add_noise();
sampling_agent.predict(feature);
step4: 用采样的数据更新模型参数
##################################
用户提供两组数据:
- 采样参数过程中用于线下复现采样噪声的sampling_info
- 扰动参数后,新参数的评估结果
.. code-block:: c++
agent.update(sampling_infos, rewards);
主代码以及注释
#################
以下的代码演示通过多线程同时采样, 提升解决问题的效率。
.. code-block:: c++
int main(int argc, char* argv[]) {
std::vector<CartPole> envs;
// 构造10个环境,用于多线程训练
for (int i = 0; i < ITER; ++i) {
envs.push_back(CartPole());
}
// 初始化ESAgent
std::string model_dir = "./demo/cartpole/init_model";
std::string config_path = "./demo/cartpole/config.prototxt";
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(config_path); // 加载配置
agent->load_inference_model(model_dir); // 加载初始预测模型
agent->init_solver(); // 初始化solver,注意要在load_inference_model后执行
// 生成10个agent用于同时采样
std::vector<std::shared_ptr<ESAgent>> sampling_agents;
for (int i = 0; i < ITER; ++i) {
sampling_agents.push_back(agent->clone());
}
std::vector<SamplingInfo> sampling_infos;
std::vector<float> rewards(ITER, 0.0f);
sampling_infos.resize(ITER);
omp_set_num_threads(10);
// 共迭代100轮
for (int epoch = 0; epoch < 100; ++epoch) {
#pragma omp parallel for schedule(dynamic, 1)
for (int i = 0; i < ITER; ++i) {
std::shared_ptr<ESAgent> sampling_agent = sampling_agents[i];
SamplingInfo sampling_info;
sampling_agent->add_noise(sampling_info);
float reward = evaluate(envs[i], sampling_agent);
// 保存采样的sampling_info以及对应的评估结果reward
sampling_infos[i] = sampling_info;
rewards[i] = reward;
}
// 更新模型参数,注意:参数更新后会自动同步到sampling_agent中
agent->update(sampling_infos, rewards);
int reward = evaluate(envs[0], agent);
LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; // 打印每一轮reward
}
}
如何运行demo
#################
- 下载代码
在icode上clone代码,我们的仓库路径是: ``baidu/nlp/deep-es`` ``TO DO: 修改库路径``
- 编译demo
通过bcloud的云端集群编译即可,命令为: ``bb``
- 运行demo
编译完成后,我们需要增加动态库查找路径:
``export LD_LIBRARY_PATH=./output/so/:$LD_LIBRARY_PATH``
运行demo: ``./output/bin/cartpole/train``
问题解决
####################
在使用过程中有任何问题,请加hi群: 1692822 (PARL官方答疑群)进行咨询,开发同学会直接回答任何的使用问题。
Example for Online Products
###########################
``本教程的目标: 演示通过EvoKit库上线后,如何迭代算法,更新模型参数。``
在产品线中,线上无法实时拿到用户日志,经常是通过保存用户点击/时长日志,在线下根据用户数据更新模型,然后再推送到线上,完成算法的更新。
本教程继续围绕经典的CartPole环境,展示如何通过在线采样/离线更新的方式,来更新迭代ES算法。
demo的完整代码示例放在demo/online_example文件夹中。
``TO DO: 文件夹``
初始化solver
---------------------
构造solver,对它初始化,并保存到文件。初始化solver仅需在开始时调用一次。
.. code-block:: c++
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(FLAGS_config_path);
agent->load_inference_model(FLAGS_model_dir);
agent->init_solver();
agent->save_solver(FLAGS_model_dir);
线上采样
---------------------
加载模型和solver,记录线上采样返回的sampling_info以及评估的reward,并通过二进制的方式记录到log文件中。
.. code-block:: c++
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(FLAGS_config_path);
agent->load_inference_model(FLAGS_model_dir);
agent->load_solver(FLAGS_model_dir);
#pragma omp parallel for schedule(dynamic, 1)
for (int i = 0; i < ITER; ++i) {
std::shared_ptr<ESAgent> sampling_agent = sampling_agents[i];
SamplingInfo sampling_info;
sampling_agent->add_noise(sampling_info);
float reward = evaluate(envs[i], sampling_agent);
sampling_infos[i] = sampling_info;
rewards[i] = reward;
}
// save sampling information and log in binary format
std::ofstream log_stream(FLAGS_log_path, std::ios::binary);
for (int i = 0; i < ITER; ++i) {
std::string data;
sampling_infos[i].SerializeToString(&data);
int size = data.size();
log_stream.write((char*) &rewards[i], sizeof(float));
log_stream.write((char*) &size, sizeof(int));
log_stream.write(data.c_str(), size);
}
log_stream.close();
线下更新
-----------------------
在加载好之前记录的log之后,调用 ``update`` 函数进行更新,然后通过 ``save_inference_model`` 和 ``save_solver`` 函数保存更新后的参数到本地,推送到线上。
.. code-block:: c++
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(FLAGS_config_path);
agent->load_inference_model(FLAGS_model_dir);
agent->load_solver(FLAGS_model_dir);
// load training data
std::vector<SamplingInfo> sampling_infos;
std::vector<float> rewards(ITER, 0.0f);
sampling_infos.resize(ITER);
std::ifstream log_stream(FLAGS_log_path);
CHECK(log_stream.good()) << "[EvoKit] cannot open log: " << FLAGS_log_path;
char buffer[1000];
for (int i = 0; i < ITER; ++i) {
int size;
log_stream.read((char*) &rewards[i], sizeof(float));
log_stream.read((char*) &size, sizeof(int));
log_stream.read(buffer, size);
buffer[size] = 0;
std::string data(buffer);
sampling_infos[i].ParseFromString(data);
}
// update model and save parameter
agent->update(sampling_infos, rewards);
agent->save_inference_model(FLAGS_updated_model_dir);
agent->save_solver(FLAGS_updated_model_dir);
主代码
-----------------------
将以上代码分别编译成可执行文件。
- 初始化solver: ``init_solver`` 。
- 线上采样: ``online_sampling`` 。
- 线下更新: ``offline_update`` 。
.. code-block:: shell
#------------------------init solver------------------------
./init_solver \
--model_dir="./model_warehouse/model_dir_0" \
--config_path="config.prototxt"
for ((epoch=0;epoch<200;++epoch));do
#------------------------online sampling------------------------
./online_sampling \
--log_path="./sampling_log" \
--model_dir="./model_warehouse/model_dir_$epoch" \
--config_path="./config.prototxt"
#------------------------offline update------------------------
next_epoch=$((epoch+1))
./offline_update \
--log_path='./sampling_log' \
--model_dir="./model_warehouse/model_dir_$epoch" \
--updated_model_dir="./model_warehouse/model_dir_${next_epoch}" \
--config_path="./config.prototxt"
done
Overview
------------------
``EvoKit`` 是一个集合了多种进化算法、兼容多种类预测框架的进化算法库,主打 **快速上线验证** 。
.. image:: ../../evo_kit/DeepES.gif
:align: center
:width: 400px
特性
#########
**1. 多种进化算法支持。** 支持高斯采样、CMA、GA等算法,更多算法持续接入中。
**2. 主流优化器支持。** 支持SGD/Momentum/Adam等多个主流优化器,有效提升算法收敛效率。
**3. 一站式上线。** 整合了线上采样和线下更新流程, 提供Bcloud/Cmake等编译方式, 助力快速上线。
**4. 深度学习框架全系列兼容。** 裸写的网络,paddle/lego/Torch等深度学习框架,EvoKit都支持。
**5. 同步/异步更新方式。** 支持多个采样模型/多份采样数据异步更新,完美契合业务场景。
...@@ -101,3 +101,37 @@ def setup(app): ...@@ -101,3 +101,37 @@ def setup(app):
add_module_names = False add_module_names = False
# LaTeX/PDF output settings for Sphinx.
latex_engine = 'xelatex'
latex_use_xindy = False
# NOTE(review): this first `latex_elements` value (ctex preamble) is replaced
# by the second assignment below, so it has no effect as written -- confirm
# which preamble is intended and drop the other.
latex_elements = {
'preamble': '\\usepackage[UTF8]{ctex}\n',
}
# Effective LaTeX settings: only the 'preamble' key is set. The preamble
# loads CJKutf8, declares a handful of Unicode characters, and opens a CJK
# environment that is closed at the end of the document.
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
'preamble':
r'''
\hypersetup{unicode=true}
\usepackage{CJKutf8}
\DeclareUnicodeCharacter{00A0}{\nobreakspace}
\DeclareUnicodeCharacter{2203}{\ensuremath{\exists}}
\DeclareUnicodeCharacter{2200}{\ensuremath{\forall}}
\DeclareUnicodeCharacter{2286}{\ensuremath{\subseteq}}
\DeclareUnicodeCharacter{2713}{x}
\DeclareUnicodeCharacter{27FA}{\ensuremath{\Longleftrightarrow}}
\DeclareUnicodeCharacter{221A}{\ensuremath{\sqrt{}}}
\DeclareUnicodeCharacter{221B}{\ensuremath{\sqrt[3]{}}}
\DeclareUnicodeCharacter{2295}{\ensuremath{\oplus}}
\DeclareUnicodeCharacter{2297}{\ensuremath{\otimes}}
\begin{CJK}{UTF8}{gbsn}
\AtEndDocument{\end{CJK}}
''',
}
...@@ -58,9 +58,10 @@ Abstractions ...@@ -58,9 +58,10 @@ Abstractions
:maxdepth: 1 :maxdepth: 1
:caption: Tutorial :caption: Tutorial
getting_started.rst tutorial/getting_started.rst
new_alg.rst tutorial/new_alg.rst
save_param.rst tutorial/save_param.rst
tutorial/tensorboard.rst
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
...@@ -83,3 +84,11 @@ Abstractions ...@@ -83,3 +84,11 @@ Abstractions
model.rst model.rst
algorithm.rst algorithm.rst
agent.rst agent.rst
.. toctree::
:maxdepth: 2
:caption: EvoKit
EvoKit/overview.rst
EvoKit/minimal_example.rst
EvoKit/online_example.rst
...@@ -178,9 +178,9 @@ Then we use this agent to interact with the environment, and run around 1000 epi ...@@ -178,9 +178,9 @@ Then we use this agent to interact with the environment, and run around 1000 epi
Summary Summary
----------- -----------
.. image:: ../examples/QuickStart/performance.gif .. image:: ../../examples/QuickStart/performance.gif
:width: 300px :width: 300px
.. image:: ./images/quickstart.png .. image:: ../images/quickstart.png
:width: 300px :width: 300px
In this tutorial, we have shown how to build an agent step-by-step to solve the `Cartpole` problem. In this tutorial, we have shown how to build an agent step-by-step to solve the `Cartpole` problem.
......
...@@ -59,7 +59,6 @@ Within class ``DQN(Algorithm)``, we define the following methods: ...@@ -59,7 +59,6 @@ Within class ``DQN(Algorithm)``, we define the following methods:
Args: Args:
model (parl.Model): model defining forward network of Q function model (parl.Model): model defining forward network of Q function
hyperparas (dict): (deprecated) dict of hyper parameters.
act_dim (int): dimension of the action space act_dim (int): dimension of the action space
gamma (float): discounted factor for reward computation. gamma (float): discounted factor for reward computation.
lr (float): learning rate. lr (float): learning rate.
......
summary
===============
Visualize the results with tensorboard.
add_scalar
-------------
Commonly used arguments:
* summary.add_scalar(tag, scalar_value, global_step=None)
* tag *(string)* – Data identifier
* scalar_value *(float or string/blobname)* – Value to save
* global_step *(int)* – Global step value to record
Example:
.. code-block:: python
from parl.utils import summary
x = range(100)
for i in x:
summary.add_scalar('y=2x', i * 2, i)
Expected result:
.. image:: add_scalar.jpg
:scale: 50 %
add_histogram
----------------
Commonly used arguments:
* summary.add_histogram(tag, values, global_step=None)
* tag *(string)* – Data identifier
* values *(torch.Tensor, numpy.array, or string/blobname)* – Values to build histogram
* global_step *(int)* – Global step value to record
Example:
.. code-block:: python
from parl.utils import summary
import numpy as np
for i in range(10):
x = np.random.random(1000)
summary.add_histogram('distribution centers', x + i, i)
Expected result:
.. image:: add_histogram.jpg
:scale: 50 %
cmake_minimum_required (VERSION 2.6)
project (EvoKit)
########## options ##########
# Exactly one inference framework must be selected at configure time.
option(WITH_PADDLE "Compile EvoKit with PaddleLite framework." OFF)
option(WITH_TORCH "Compile EvoKit with Torch framework." OFF)

message("WITH_PADDLE: ${WITH_PADDLE}")
message("WITH_TORCH: ${WITH_TORCH}")

# FATAL_ERROR stops processing and makes the configure step fail, instead of
# printing a notice and returning from a configuration that defines no targets.
if (NOT (WITH_PADDLE OR WITH_TORCH))
  message(FATAL_ERROR "ERROR: You should choose at least one framework to compile EvoKit.")
elseif(WITH_PADDLE AND WITH_TORCH)
  message(FATAL_ERROR "ERROR: You cannot choose more than one framework to compile EvoKit.")
endif()
# Require C++11 without compiler-specific extensions.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)

# Enable OpenMP flags when the compiler supports it.
find_package(OpenMP)
if (OPENMP_FOUND)
    set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
    set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
    set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()

# Framework-independent sources (core implementation + generated protobuf code).
file(GLOB src "core/src/*.cc" "core/proto/evo_kit/*.cc")
include_directories("core/include")
include_directories("core/proto")
include_directories("benchmark")

########## PaddleLite config ##########
if (WITH_PADDLE)
    add_definitions(-g -O3 -pthread)
    include_directories("paddle/include")
    # The PaddleLite inference library is expected under inference_lite_lib/
    # in the project source directory.
    include_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/include"
                        "${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/include")
    link_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/lib"
                     "${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/lib")
    file(GLOB framework_src "paddle/src/*.cc")
    set(TARGET EvoKit_paddle)
########## Torch config ##########
elseif (WITH_TORCH)
    # list(APPEND CMAKE_PREFIX_PATH "./libtorch")
    # find_package(Torch REQUIRED ON) # TODO: not necessary for now
    include_directories("torch/include")
    file(GLOB framework_src "torch/src/*.cc")
    set(TARGET EvoKit_torch)
else ()
    # NOTE(review): not expected to be reached, since the framework check
    # earlier in this file already stops when neither option is set.
    message("ERROR: You should choose at least one framework to compile EvoKit.")
endif()

# Static library named after the selected framework.
add_library(${TARGET} STATIC ${src} ${framework_src})
target_link_libraries(${TARGET} gflags protobuf pthread glog)

# ########## PaddleLite libraries ##########
# if (WITH_PADDLE)
# target_link_libraries(${TARGET} -lpaddle_full_api_shared)
# target_link_libraries(${TARGET} -lmklml_intel)
# target_link_libraries(${TARGET} -ldl)
# ########## Torch libraries ##########
# elseif (WITH_TORCH)
# target_link_libraries(${TARGET} "${TORCH_LIBRARIES}")
# endif()

# Header and helper files to install alongside the library.
file(GLOB include "core/include/evo_kit/*.h")
file(GLOB proto_include "core/proto/evo_kit/*.h")
file(GLOB torch_include "torch/include/evo_kit/*.h")
file(GLOB paddle_include "paddle/include/evo_kit/*.h")
file(GLOB benchmark_include "benchmark/*.h")
file(GLOB findcmake "cmake/Torch/*.cmake")

# Install into <source dir>/libevokit.
set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}/libevokit")
install(TARGETS ${TARGET} ARCHIVE DESTINATION "lib")
install(FILES ${include} ${proto_include} DESTINATION "include/evo_kit")
install(FILES ${torch_include} DESTINATION "torch/evo_kit")
install(FILES ${paddle_include} DESTINATION "paddle/evo_kit")
install(FILES ${benchmark_include} DESTINATION "include")
install(FILES ${findcmake} DESTINATION "cmake/Torch")
# EvoKit
EvoKit 是一个集合了多种进化算法、兼容多种类预测框架的进化算法库,主打快速上线验证 。
<p align="center">
<img src="DeepES.gif" alt="PARL" width="500"/>
</p>
## 使用示范
```c++
//实例化一个预测,根据配置文件加载模型,采样方式(Gaussian\CMA sampling..)、更新方式(SGD\Adam)等
auto agent = ESAgent(config);
for (int i = 0; i < 10; ++i) {
auto sampling_agent = agent->clone(); // clone出一个sampling agent
SamplingInfo info;
sampling_agent->add_noise(info); // 参数扰动,同时保存随机种子到info中
int reward = evaluate(env, sampling_agent); //评估参数
noisy_info.push_back(info); // 记录随机噪声对应种子
noisy_rewards.push_back(reward); // 记录评估结果
}
//根据评估结果、随机种子更新参数,然后重复以上过程,直到收敛。
agent->update(noisy_info, noisy_rewards);
```
## 一键运行demo列表
- **PaddleLite**: sh ./scripts/build.sh paddle
- **Torch**: sh ./scripts/build.sh torch
- **裸写网络**
## 相关依赖:
- Protobuf2
- OpenMP
- [glog](https://github.com/google/glog)
- [gflags](https://github.com/gflags/gflags/blob/master/INSTALL.md)
## 额外依赖:
### 使用PaddleLite
下载PaddleLite的X86预编译库,或者编译PaddleLite源码,得到inference_lite_lib文件夹,放在当前目录中。(可参考:[PaddleLite使用X86预测部署](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html))
### 使用torch
下载[libtorch](https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip)或者编译torch源码,得到libtorch文件夹,放在当前目录中。
// Third party code
// This code is copied or modified from openai/gym's cartpole.py
#include <iostream>
#include <random>
#include <cassert>
#include <vector>
const double kPi = 3.1415926535898;

/* Cart-pole environment (adapted from openai/gym's cartpole.py).
 *
 * State layout: {x, x_dot, theta, theta_dot}.
 * Action: 1 applies +force_mag to the cart, any other value applies -force_mag.
 * An episode is done when x or theta leaves its threshold, or after more than
 * 200 steps.
 */
class CartPole {
public:
    double gravity = 9.8;
    double masscart = 1.0;
    double masspole = 0.1;
    double total_mass = (masspole + masscart);
    double length = 0.5; // actually half the pole's length;
    double polemass_length = (masspole * length);
    double force_mag = 10.0;
    double tau = 0.02; // seconds between state updates;

    // Angle at which to fail the episode
    double theta_threshold_radians = 12 * 2 * kPi / 360;
    double x_threshold = 2.4;
    int steps_beyond_done = -1;

    std::vector<float> state = {0, 0, 0, 0};
    // Initialized so that getReward()/isDone() are well defined before the
    // first call to step().
    double reward = 0.0;
    bool done = false;
    int step_ = 0;

    // Returns a pointer to the 4 state values; valid while this object lives.
    const float* getState() {
        return state.data();
    }

    // Reward produced by the most recent step() (0 right after reset()).
    double getReward() {
        return reward;
    }

    // Non-zero when the episode has finished (return type kept for callers).
    double isDone() {
        return done;
    }

    // Starts a new episode: each state value is drawn uniformly from
    // [-0.05, 0.05) and all per-episode bookkeeping is cleared.
    void reset() {
        std::random_device rd;
        std::default_random_engine generator(rd());
        std::uniform_real_distribution<float> distribution(-0.05, 0.05);

        for (int i = 0; i < 4; ++i) {
            state[i] = distribution(generator);
        }

        steps_beyond_done = -1;
        step_ = 0;
        // Fixed: the original left `done`/`reward` from the previous episode
        // (or uninitialized on first construction).
        reward = 0.0;
        done = false;
    }

    CartPole() {
        reset();
    }

    // Advances the simulation by one time step of length `tau` using explicit
    // Euler integration, then updates `done` and `reward`.
    void step(int action) {
        float x = state[0];
        float x_dot = state[1];
        float theta = state[2];
        float theta_dot = state[3];

        auto force = (action == 1) ? force_mag : -force_mag;
        auto costheta = std::cos(theta);
        auto sintheta = std::sin(theta);
        auto temp = (force + polemass_length * theta_dot * theta_dot * sintheta) /
                    total_mass;
        auto thetaacc = (gravity * sintheta - costheta * temp) /
                        (length * (4.0 / 3.0 - masspole * costheta * costheta / total_mass));
        auto xacc = temp - polemass_length * thetaacc * costheta / total_mass;

        x = x + tau * x_dot;
        x_dot = x_dot + tau * xacc;
        theta = theta + tau * theta_dot;
        theta_dot = theta_dot + tau * thetaacc;
        state = {x, x_dot, theta, theta_dot};

        done = x < -x_threshold || x > x_threshold ||
               theta < -theta_threshold_radians || theta > theta_threshold_radians ||
               step_ > 200;

        if (!done) {
            reward = 1.0;
        } else if (steps_beyond_done == -1) {
            // Pole just fell!
            steps_beyond_done = 0;
            reward = 0;
        } else {
            // step() was called again after the episode already ended.
            if (steps_beyond_done == 0) {
                assert(false); // Can't do this
            }
        }

        step_++;
    }
};
# FindEvoKit
# -------
#
# Finds the EvoKit library
#
# This will define the following variables:
#
# EVOKIT_FOUND -- True if the system has the EvoKit library
# EVOKIT_INCLUDE_DIRS -- The include directories for EvoKit
# EVOKIT_LIBRARY -- Libraries to link against
#
# and the following imported targets:
#
# EvoKit
include(FindPackageHandleStandardArgs)

# Resolve the install prefix: the EVOKIT_INSTALL_PREFIX environment variable
# wins; otherwise derive it from this file's location.
if (DEFINED ENV{EVOKIT_INSTALL_PREFIX})
  set(EVOKIT_INSTALL_PREFIX $ENV{EVOKIT_INSTALL_PREFIX})
else()
  # Assume we are in <install-prefix>/cmake/Torch/EvoKitConfig.cmake
  get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
  get_filename_component(EVOKIT_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE)
endif()

# Include directories.
# The original wrapped this in `if (EXISTS "${EVOKIT_INSTALL_PREFIX}/include")`
# with two identical branches, so the value is set unconditionally here.
set(EVOKIT_INCLUDE_DIRS
  ${EVOKIT_INSTALL_PREFIX}/include
  ${EVOKIT_INSTALL_PREFIX}/torch)

# Static library built with the Torch backend.
find_library(EVOKIT_LIBRARY libEvoKit_torch.a PATHS "${EVOKIT_INSTALL_PREFIX}/lib")

include_directories("${EVOKIT_INSTALL_PREFIX}/torch")
include_directories("${EVOKIT_INSTALL_PREFIX}/include")

# Sets EVOKIT_FOUND when both the library and include dirs are available.
find_package_handle_standard_args(EvoKit DEFAULT_MSG EVOKIT_LIBRARY EVOKIT_INCLUDE_DIRS)

message(STATUS "EVOKIT_FOUND: ${EVOKIT_FOUND}")
message(STATUS "EVOKIT_INCLUDE_DIRS: ${EVOKIT_INCLUDE_DIRS}")
message(STATUS "EVOKIT_LIBRARY: ${EVOKIT_LIBRARY}")
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_ADAM_OPTIMIZER_H
#define EVO_KIT_ADAM_OPTIMIZER_H
#include <cmath>
#include <unordered_map>
#include "evo_kit/optimizer.h"
namespace evo_kit {

/*@brief AdamOptimizer.
 * Implements Adam algorithm.
 *
 *@Args:
 * base_lr: learning rate (default: 1e-3).
 * beta1: coefficients used for computing running averages of gradient (default: 0.9).
 * beta2: coefficients used for computing running averages of gradient's square (default: 0.999).
 * epsilon: term added to the denominator to improve numerical stability (default: 1e-8).
 */
class AdamOptimizer: public Optimizer {
public:
    // Forwards base_lr to the Optimizer base class and stores the Adam
    // hyper-parameters.
    AdamOptimizer(float base_lr, float beta1 = 0.9, float beta2 = 0.999,
                  float epsilon = 1e-8): Optimizer(base_lr), \
        _beta1(beta1), _beta2(beta2), _epsilon(epsilon) {}
    // Defined out of line.
    ~AdamOptimizer();

protected:
    // Adam-specific implementation of the step hook declared by Optimizer;
    // defined out of line.
    void compute_step(float* gradient, int size, std::string param_name);

private:
    float _beta1;
    float _beta2;
    float _epsilon;
    // NOTE(review): presumably per-parameter first/second moment buffers keyed
    // by param_name -- confirm ownership and lifetime in the .cc file.
    std::unordered_map<std::string, float*> _momentum;
    std::unordered_map<std::string, float*> _velocity;
};

}//namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H
#define EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H
#include <glog/logging.h>
#include <random>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "sampling_method.h"
#include "utils.h"
namespace evo_kit {

// Sampling method declared with a noise cache (`_noise_cache`, `_cache_size`).
// All member functions are defined out of line.
class CachedGaussianSampling: public SamplingMethod {

public:
    CachedGaussianSampling();

    ~CachedGaussianSampling();

    /* Initialize the sampling algorithm from the protobuf configuration.
     * The EvoKit library uses a single configuration file for all sampling
     * algorithms.
     * A default configuration file can be found at: . // TODO: where?
     * Usually you won't have to modify the configuration items of other
     * algorithms if you are not using them.
     */
    bool load_config(const EvoKitConfig& config);

    /*@brief Generate Gaussian noise and the related key.
     *
     *@Args:
     * key: a unique key associated with the sampled noise.
     * noise: a pointer to the memory that stores the noise.
     * size: the number of floats to be sampled.
     *
     *@return:
     * success: whether the Gaussian noise was generated successfully.
     */
    bool sampling(int* key, float* noise, int64_t size);

    /*@brief Reconstruct the Gaussian noise given the key.
     * This function is often used for updating the neural network parameters
     * in the offline environment.
     *
     *@Args:
     * key: a unique key associated with the sampled noise.
     * noise: a pointer to the memory that stores the noise.
     * size: the number of floats to be sampled.
     *
     *@return:
     * success: whether the Gaussian noise was reconstructed successfully.
     */
    bool resampling(int key, float* noise, int64_t size);

private:
    float _std;
    int _cache_size;
    float* _noise_cache = nullptr;

    // NOTE(review): presumably allocates/fills `_noise_cache`; the definition
    // is not visible here -- confirm in the .cc file.
    bool _create_noise_cache();
};

}
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef EVO_KIT_GAUSSIAN_SAMPLING_H
#define EVO_KIT_GAUSSIAN_SAMPLING_H
#include <random>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "evo_kit/sampling_method.h"
#include "evo_kit/utils.h"
namespace evo_kit {

// Gaussian sampling method; the three SamplingMethod operations are defined
// out of line.
class GaussianSampling: public SamplingMethod {

public:
    GaussianSampling() {}

    ~GaussianSampling() {}

    /* Initialize the sampling algorithm from the protobuf configuration.
     * The EvoKit library uses a single configuration file for all sampling
     * algorithms.
     * A default configuration file can be found at: . // TODO: where?
     * Usually you won't have to modify the configuration items of other
     * algorithms if you are not using them.
     */
    bool load_config(const EvoKitConfig& config);

    /*@brief Generate Gaussian noise and the related key.
     *
     *@Args:
     * key: a unique key associated with the sampled noise.
     * noise: a pointer to the memory that stores the noise.
     * size: the number of floats to be sampled.
     *
     *@return:
     * success: whether the Gaussian noise was generated successfully.
     */
    bool sampling(int* key, float* noise, int64_t size);

    /*@brief Reconstruct the Gaussian noise given the key.
     * This function is often used for updating the neural network parameters
     * in the offline environment.
     *
     *@Args:
     * key: a unique key associated with the sampled noise.
     * noise: a pointer to the memory that stores the noise.
     * size: the number of floats to be sampled.
     *
     *@return:
     * success: whether the Gaussian noise was reconstructed successfully.
     */
    bool resampling(int key, float* noise, int64_t size);

private:
    float _std;
};

}
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_OPTIMIZER_H
#define EVO_KIT_OPTIMIZER_H
#include <glog/logging.h>
#include <unordered_map>
namespace evo_kit {

/*@brief Optimizer. Base class for optimizers.
 *
 *@Args:
 * base_lr: learning rate (default: 1e-3).
 *
 * .. warning: update() works at the parameter level,
 * so it has to be called once for every parameter.
 *
 * Subclasses are required to implement the following functions:
 * 1. compute_step
 */
class Optimizer {
public:
    Optimizer() : _base_lr(1e-3), _update_times(0) {}
    Optimizer(float base_lr) : _base_lr(base_lr), _update_times(0) {}
    virtual ~Optimizer() {
        _params_size.clear();
    }

    /*@brief Performs a single optimization step (parameter update) at the parameter level.
     *
     *@Args:
     * weights (array): parameter weights, updated in place.
     * gradient (array): gradient for updating weights; it is first passed to
     *                   compute_step() and then applied scaled by the learning rate.
     * size: size of gradient.
     * param_name: the name corresponding to the weights.
     *
     *@return:
     * false when `size` differs from the size recorded for `param_name` on
     * its first update (nothing is modified in that case), true otherwise.
     */
    template<typename T>
    bool update(T weights, float* gradient, int size, std::string param_name = "") {
        auto recorded = _params_size.find(param_name);

        if (recorded == _params_size.end()) {
            // First update for this parameter: remember its size.
            _params_size[param_name] = size;
        } else if (recorded->second != size) {
            LOG(WARNING) << "[Warning] Update times: " << int(_update_times / _params_size.size())
                         << ". Size of weights[" << param_name << "] is " << recorded->second
                         << ", not " << size;
            return false;
        }

        ++_update_times;
        compute_step(gradient, size, param_name);

        float* step = gradient;
        for (int idx = 0; idx < size; ++idx) {
            weights[idx] -= _base_lr * step[idx];
        }

        return true;
    } // template function

protected:
    // Hook implemented by subclasses; called by update() before the weights
    // are modified.
    virtual void compute_step(float* graident, int size, std::string param_name = "") = 0;
    float _base_lr;
    float _update_times;
    std::unordered_map<std::string, int> _params_size;
};

}//namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_OPTIMIZER_FACTORY_H
#define EVO_KIT_OPTIMIZER_FACTORY_H
#include <algorithm>
#include <glog/logging.h>
#include <memory>
#include "evo_kit/adam_optimizer.h"
#include "evo_kit/evo_kit.pb.h"
#include "evo_kit/optimizer.h"
#include "evo_kit/sgd_optimizer.h"
namespace evo_kit {

/* @brief: create an optimizer according to the configuration.
 * @args:
 *    optimizer_config: configuration for the optimizer
 * @return:
 *    shared pointer to the created Optimizer.
 */
std::shared_ptr<Optimizer> create_optimizer(const OptimizerConfig& optimizer_config);

} // namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_SAMPLING_FACTORY_H
#define EVO_KIT_SAMPLING_FACTORY_H
#include <algorithm>
#include <glog/logging.h>
#include <memory>
#include "evo_kit/cached_gaussian_sampling.h"
#include "evo_kit/evo_kit.pb.h"
#include "evo_kit/gaussian_sampling.h"
#include "evo_kit/sampling_method.h"
namespace evo_kit {

/* @brief: create a sampling method according to the configuration.
 * @args:
 *    Config: configuration for the EvoKit
 * @return:
 *    shared pointer to the created SamplingMethod.
 */
std::shared_ptr<SamplingMethod> create_sampling_method(const EvoKitConfig& Config);

} // namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_SAMPLING_METHOD_H
#define EVO_KIT_SAMPLING_METHOD_H
#include <string>
#include <random>
#include "evo_kit/evo_kit.pb.h"
namespace evo_kit {
/*Base class for sampling algorithms. All algorithms are required to override the following functions:
 *
 * 1. load_config
 * 2. sampling
 * 3. resampling
 *
 * View a demonstrative algorithm in gaussian_sampling.h
 * */
class SamplingMethod {
public:
    SamplingMethod(): _seed(0) {}

    virtual ~SamplingMethod() {}

    /*Initialize the sampling algorithm given the config with the protobuf format.
     *EvoKit library uses only one configuration file for all sampling algorithms.
     *A default configuration file can be found at: . // TODO: where?
     *Usually you won't have to modify the configuration items of other algorithms
     *if you are not using them.
     */
    virtual bool load_config(const EvoKitConfig& config) = 0;

    /*@brief generate Gaussian noise and the related key.
     *
     *@Args:
     *     key: a unique key associated with the sampled noise.
     *     noise: a pointer to the memory that stores the noise.
     *     size: the number of floats to be sampled.
     *
     *@return:
     *     success: whether the Gaussian noise was generated successfully.
     */
    virtual bool sampling(int* key, float* noise, int64_t size) = 0;

    /*@brief reconstruct the Gaussian noise given the key.
     * This function is often used for updating the neural network parameters in the offline environment.
     *
     *@Args:
     *     key: a unique key associated with the sampled noise.
     *     noise: a pointer to the memory that stores the noise.
     *     size: the number of floats to be sampled.
     *
     *@return:
     *     success: whether the Gaussian noise was reconstructed successfully.
     */
    virtual bool resampling(int key, float* noise, int64_t size) = 0;

    /*@brief store the seed and re-seed the C library generator with it.
     * NOTE: srand() mutates process-wide state shared by every rand() caller.
     *
     *@return:
     *     always true.
     */
    bool set_seed(int seed) {
        _seed = seed;
        srand(_seed);
        return true;
    }

    /*@brief return the seed last passed to set_seed() (0 if never set).
     * const-qualified so it can be called through const references.
     */
    int get_seed() const {
        return _seed;
    }

protected:
    int _seed;
};
}
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_SGD_OPTIMIZER_H
#define EVO_KIT_SGD_OPTIMIZER_H
#include <cmath>
#include <unordered_map>
#include "evo_kit/optimizer.h"
namespace evo_kit {
/*@brief SGDOptimizer.
* Implements stochastic gradient descent (optionally with momentum).
*
*@Args:
* base_lr: learning rate (no default in this constructor; the protobuf
* OptimizerConfig default is 1e-3).
* momentum: momentum factor (default: 0.9).
*/
class SGDOptimizer: public Optimizer {
public:
// base_lr is forwarded to the Optimizer base class; momentum is kept locally.
SGDOptimizer(float base_lr, float momentum = 0.9): Optimizer(base_lr), _momentum(momentum) {}
// Defined out of line; releases the buffers held in _velocity.
~SGDOptimizer();
protected:
// Overwrites `gradient` (length `size`) in place with the momentum-smoothed
// value tracked for `param_name`.
void compute_step(float* gradient, int size, std::string param_name);
private:
float _momentum;
// One heap-allocated velocity buffer per parameter name, created on first use.
std::unordered_map<std::string, float*> _velocity;
};
} // namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_UTILS_H
#define EVO_KIT_UTILS_H
#include <algorithm>
#include <fstream>
#include <glog/logging.h>
#include <google/protobuf/text_format.h>
#include <string>
#include "evo_kit/evo_kit.pb.h"
namespace evo_kit {
/*Replace each reward, in place, by its rank normalized to [-0.5, 0.5]:
the smallest reward becomes -0.5 and the largest 0.5.
Args:
reward: an array of rewards; overwritten with the centered ranks.
Return:
always true.
*/
bool compute_centered_ranks(std::vector<float>& reward);
/* Return the whole content of `filename`, or an empty string (after logging
* an error) when the file cannot be opened.
*/
std::string read_file(const std::string& filename);
/* Load a protobuf-based configuration from a text-format (prototxt) file.
 * Args:
 *    config_file: file path.
 *    proto_config: protobuf message that receives the parsed configuration.
 * Return:
 *    true when the file was opened, read and parsed successfully;
 *    false otherwise (the reason is logged).
 */
template<typename T>
bool load_proto_conf(const std::string& config_file, T& proto_config) {
    std::ifstream fin(config_file);

    if (!fin) {
        LOG(ERROR) << "open prototxt config failed: " << config_file;
        return false;
    }

    // tellg() reports failure as -1; never use that value as a buffer length.
    fin.seekg(0, std::ios::end);
    const std::streamoff file_size = fin.tellg();
    fin.seekg(0, std::ios::beg);

    if (file_size < 0 || !fin) {
        LOG(ERROR) << "Failed to load config: " << config_file;
        return false;
    }

    // std::string owns the buffer, so nothing leaks if parsing throws.
    std::string proto_str(static_cast<size_t>(file_size), '\0');

    if (file_size > 0) {
        fin.read(&proto_str[0], file_size);
        // Keep only the bytes that were really read (short reads are possible,
        // e.g. with newline translation in text mode).
        proto_str.resize(static_cast<size_t>(fin.gcount()));
    }

    if (!google::protobuf::TextFormat::ParseFromString(proto_str, &proto_config)) {
        LOG(ERROR) << "Failed to load config: " << config_file;
        return false;
    }

    return true;
}
template<typename T>
bool save_proto_conf(const std::string& config_file, T& proto_config) {
bool success = true;
std::ofstream ofs(config_file, std::ofstream::out);
if (!ofs || ofs.fail()) {
LOG(ERROR) << "open prototxt config failed: " << config_file;
success = false;
} else {
std::string config_str;
success = google::protobuf::TextFormat::PrintToString(proto_config, &config_str);
if (!success) {
return success;
}
ofs << config_str;
}
return success;
}
/* Return "<path>/<entry>" for every entry of directory `path` whose name
* contains "model_iter_id"; empty when the directory cannot be opened.
*/
std::vector<std::string> list_all_model_dirs(std::string path);
}
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package evo_kit;
// Top-level EvoKit configuration: sampling, optimizer and AsyncES settings.
message EvoKitConfig {
//sampling configuration
// Seed handed to the sampling method's set_seed() when the config is loaded.
optional int32 seed = 1 [default = 18];
// NOTE(review): no reader of buffer_size is visible here; confirm it is still used.
optional int32 buffer_size = 2 [default = 100000];
// Settings of the Gaussian noise sampler (also selects cached vs. non-cached).
optional GaussianSamplingConfig gaussian_sampling = 3;
// Optimizer Configuration
optional OptimizerConfig optimizer = 4;
// AsyncESAgent Configuration
optional AsyncESConfig async_es = 5;
}
// Settings of the Gaussian noise sampler.
message GaussianSamplingConfig {
// Scale applied to every standard-normal sample.
optional float std = 1 [default = 1.0];
// true: sample from a pre-generated noise cache; false: generate noise on demand.
optional bool cached = 2 [default = false];
// Number of floats held by the noise cache (read by the cached sampler only).
optional int32 cache_size = 3 [default = 100000];
}
// Optimizer selection and hyper-parameters.
message OptimizerConfig{
// Optimizer name, matched case-insensitively: "SGD" or "Adam".
optional string type = 1 [default = "SGD"];
optional float base_lr = 2 [default = 1e-3]; // The base learning rate.
optional float momentum = 3 [default = 0.9]; // The momentum value for SGD.
// ------------Adam Optimizer---------
// Decay rate of the first-moment (gradient) running average.
optional float beta1 = 4 [default = 0.9];
// Decay rate of the second-moment (squared gradient) running average.
optional float beta2 = 5 [default = 0.999];
// Small constant added to the denominator of the Adam step.
optional float epsilon = 6 [default = 1e-8];
}
// Record produced by an agent's add_noise() and later consumed by update().
message SamplingInfo{
// Sampling keys; presumably one per noise draw so the noise can be
// reconstructed via resampling() - TODO confirm against the agents.
repeated int32 key = 1;
// NOTE(review): looks like the iteration id of the model the noise was
// applied to (used by the async agent) - confirm.
optional int32 model_iter_id = 2;
}
// Settings of the asynchronous ES agent.
// NOTE(review): field meanings below are inferred from names only; confirm
// against AsyncESAgent.
message AsyncESConfig{
// Presumably the directory where model snapshots are stored.
optional string model_warehouse = 1 [default = "./model_warehouse"];
repeated string model_md5 = 2;
// Presumably the maximum number of snapshots retained.
optional int32 max_to_keep = 3 [default = 5];
optional int32 model_iter_id = 4 [default = 0];
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/adam_optimizer.h"
namespace evo_kit {
// Free every per-parameter moment buffer allocated by compute_step().
AdamOptimizer::~AdamOptimizer() {
    for (auto& entry : _momentum) {
        delete[] entry.second;
    }

    for (auto& entry : _velocity) {
        delete[] entry.second;
    }

    _momentum.clear();
    _velocity.clear();
}
void AdamOptimizer::compute_step(float* gradient, int size, std::string param_name = "") {
if (_momentum.count(param_name) == 0) {
_momentum[param_name] = new float [size];
memset(_momentum[param_name], 0, size * sizeof(float));
}
if (_velocity.count(param_name) == 0) {
_velocity[param_name] = new float [size];
memset(_velocity[param_name], 0, size * sizeof(float));
}
int true_update_times = int(_update_times / _velocity.size());
float alpha = std::sqrt(1 - std::pow(_beta2, _update_times)) / (1 - std::pow(_beta1,
_update_times));
for (int i = 0; i < size; ++i) {
_momentum[param_name][i] = _beta1 * _momentum[param_name][i] + (1 - _beta1) * gradient[i];
_velocity[param_name][i] = _beta2 * _velocity[param_name][i] + (1 - _beta2) * gradient[i] *
gradient[i];
gradient[i] = alpha * _momentum[param_name][i] / (std::sqrt(_velocity[param_name][i]) + _epsilon);
}
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/cached_gaussian_sampling.h"
namespace evo_kit {
// Default constructor: does no work here; the noise cache is allocated and
// filled later by load_config().
CachedGaussianSampling::CachedGaussianSampling() {}
// Releases the noise cache allocated by load_config().
// NOTE(review): relies on _noise_cache being nullptr when load_config() was
// never called - confirm the member is null-initialized in the header.
CachedGaussianSampling::~CachedGaussianSampling() {
delete[] _noise_cache;
}
bool CachedGaussianSampling::load_config(const EvoKitConfig& config) {
bool success = true;
_std = config.gaussian_sampling().std();
success = set_seed(config.seed());
CHECK(success) << "[EvoKit] Fail to set seed while load config.";
_cache_size = config.gaussian_sampling().cache_size();
_noise_cache = new float [_cache_size];
memset(_noise_cache, 0, _cache_size * sizeof(float));
success = _create_noise_cache();
CHECK(success) << "[EvoKit] Fail to create noise_cache while load config.";
return success;
}
bool CachedGaussianSampling::sampling(int* key, float* noise, int64_t size) {
bool success = true;
if (_noise_cache == nullptr) {
LOG(ERROR) << "[EvoKit] Please use load_config() first.";
success = false;
return success;
}
if (noise == nullptr) {
LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
success = false;
return success;
}
if ((size >= _cache_size) || (size < 0)) {
LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size <<
"), cache_size: " << _cache_size;
success = false;
return success;
}
int rand_key = rand();
std::default_random_engine generator(rand_key);
std::uniform_int_distribution<unsigned int> uniform(0, _cache_size - size);
int index = uniform(generator);
*key = index;
for (int64_t i = 0; i < size; ++i) {
*(noise + i) = *(_noise_cache + index + i);
}
return success;
}
/*@brief copy the `size` cached floats starting at offset `key` into `noise`.
 *
 *@return:
 *     false (with an error log) if load_config() was not called, `noise` is
 *     null, or `size` / `key` is out of bounds; true otherwise.
 */
bool CachedGaussianSampling::resampling(int key, float* noise, int64_t size) {
    if (_noise_cache == nullptr) {
        LOG(ERROR) << "[EvoKit] Please use load_config() first.";
        return false;
    }

    if (noise == nullptr) {
        LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
        return false;
    }

    if ((size < 0) || (size >= _cache_size)) {
        LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size <<
                   "), cache_size: " << _cache_size;
        return false;
    }

    if ((key < 0) || (key > _cache_size - size)) {
        LOG(ERROR) << "[EvoKit] Resampling key " << key << " is out of bounds [0, "
                   << _cache_size - size <<
                   "], cache_size: " << _cache_size << ", size: " << size;
        return false;
    }

    const float* window = _noise_cache + key;

    for (int64_t offset = 0; offset < size; ++offset) {
        noise[offset] = window[offset];
    }

    return true;
}
// Fill the whole cache with standard-normal samples scaled by _std, drawn
// from an engine seeded with _seed. Always returns true.
bool CachedGaussianSampling::_create_noise_cache() {
    std::default_random_engine engine(_seed);
    std::normal_distribution<float> standard_normal;

    for (int64_t slot = 0; slot < _cache_size; ++slot) {
        _noise_cache[slot] = standard_normal(engine) * _std;
    }

    return true;
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/gaussian_sampling.h"
namespace evo_kit {
// Read the noise scale and the seed from the config; returns set_seed()'s result.
bool GaussianSampling::load_config(const EvoKitConfig& config) {
    _std = config.gaussian_sampling().std();
    return set_seed(config.seed());
}
bool GaussianSampling::sampling(int* key, float* noise, int64_t size) {
bool success = true;
if (noise == nullptr) {
LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
success = false;
return success;
}
int rand_key = rand();
*key = rand_key;
std::default_random_engine generator(rand_key);
std::normal_distribution<float> norm;
for (int64_t i = 0; i < size; ++i) {
*(noise + i) = norm(generator) * _std;
}
return success;
}
// Regenerate the noise associated with `key`: seed a fresh engine with `key`
// and draw `size` normal samples scaled by _std. Returns false (with an
// error log) when `noise` is null.
bool GaussianSampling::resampling(int key, float* noise, int64_t size) {
    if (noise == nullptr) {
        LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
        return false;
    }

    std::default_random_engine engine(key);
    std::normal_distribution<float> standard_normal;

    for (int64_t slot = 0; slot < size; ++slot) {
        noise[slot] = standard_normal(engine) * _std;
    }

    return true;
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/optimizer_factory.h"
namespace evo_kit {
/* @brief: create an optimizer according to the configuration.
 * @args:
 *     optimizer_config: its `type` selects the optimizer ("SGD" or "Adam",
 *                       case-insensitive); the remaining fields supply the
 *                       hyper-parameters.
 * @return:
 *     the optimizer, or an empty shared_ptr (after an error log) when the
 *     type is not recognized.
 */
std::shared_ptr<Optimizer> create_optimizer(const OptimizerConfig& optimizer_config) {
    std::shared_ptr<Optimizer> optimizer;
    std::string opt_type = optimizer_config.type();
    // tolower() is undefined for negative values other than EOF, so each
    // character goes through unsigned char first.
    std::transform(opt_type.begin(), opt_type.end(), opt_type.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    if (opt_type == "sgd") {
        optimizer = std::make_shared<SGDOptimizer>(optimizer_config.base_lr(),
                    optimizer_config.momentum());
    } else if (opt_type == "adam") {
        optimizer = std::make_shared<AdamOptimizer>(optimizer_config.base_lr(),
                    optimizer_config.beta1(),
                    optimizer_config.beta2(),
                    optimizer_config.epsilon());
    } else {
        LOG(ERROR) << "type of OptimizerConfig must be SGD or Adam."; // NotImplementedError
    }

    return optimizer;
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/sampling_factory.h"
namespace evo_kit {
// Build the sampler selected by config.gaussian_sampling().cached() and load
// the configuration into it; returns nullptr (after an error log) when
// load_config() fails.
std::shared_ptr<SamplingMethod> create_sampling_method(const EvoKitConfig& config) {
    std::shared_ptr<SamplingMethod> method;

    if (config.gaussian_sampling().cached()) {
        method = std::make_shared<CachedGaussianSampling>();
    } else {
        method = std::make_shared<GaussianSampling>();
    }

    if (!method->load_config(config)) {
        LOG(ERROR) << "[EvoKit] Fail to create sampling_method";
        return nullptr;
    }

    return method;
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/sgd_optimizer.h"
namespace evo_kit {
// Free every per-parameter velocity buffer allocated by compute_step().
SGDOptimizer::~SGDOptimizer() {
    for (auto& entry : _velocity) {
        delete[] entry.second;
    }

    _velocity.clear();
}
void SGDOptimizer::compute_step(float* gradient, int size, std::string param_name = "") {
if (_velocity.count(param_name) == 0) {
_velocity[param_name] = new float [size];
memset(_velocity[param_name], 0, size * sizeof(float));
}
for (int i = 0; i < size; ++i) {
_velocity[param_name][i] = _momentum * _velocity[param_name][i] + (1 - _momentum) * gradient[i];
gradient[i] = _velocity[param_name][i];
}
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/utils.h"
#include <dirent.h>
namespace evo_kit {
/* Replace each reward, in place, by its rank normalized to [-0.5, 0.5].
 * The smallest reward maps to -0.5 and the largest to 0.5; ties are ordered by
 * their original position.
 * Args:
 *    reward: an array of rewards; overwritten with the centered ranks.
 *            An empty array is left untouched; a single reward maps to -0.5.
 * Return:
 *    always true.
 */
bool compute_centered_ranks(std::vector<float>& reward) {
    const size_t count = reward.size();

    if (count == 0) {
        return true;
    }

    if (count == 1) {
        // Avoid the 1 / (count - 1) division by zero; -0.5 is the value the
        // general loop would assign to the first (and only) rank.
        reward[0] = -0.5;
        return true;
    }

    std::vector<std::pair<float, int>> reward_index;
    reward_index.reserve(count);

    for (size_t i = 0; i < count; ++i) {
        reward_index.push_back(std::make_pair(reward[i], static_cast<int>(i)));
    }

    std::sort(reward_index.begin(), reward_index.end());

    const float gap = 1.0 / (count - 1);
    float normlized_rank = -0.5;

    for (size_t i = 0; i < count; ++i) {
        reward[reward_index[i].second] = normlized_rank;
        normlized_rank += gap;
    }

    return true;
}
/* List the entries of directory `path` whose name contains "model_iter_id".
 * Args:
 *    path: directory to scan.
 * Return:
 *    "<path>/<entry>" for every matching entry, in readdir() order; empty when
 *    the directory cannot be opened.
 */
std::vector<std::string> list_all_model_dirs(std::string path) {
    std::vector<std::string> model_dirs;
    DIR* dpdf = opendir(path.c_str());

    if (dpdf == NULL) {
        // Nothing to close: closedir() must not be called on a NULL handle.
        return model_dirs;
    }

    struct dirent* epdf = NULL;

    while ((epdf = readdir(dpdf)) != NULL) {
        std::string dir(epdf->d_name);

        if (dir.find("model_iter_id") != std::string::npos) {
            model_dirs.push_back(path + "/" + dir);
        }
    }

    closedir(dpdf);
    return model_dirs;
}
/* Read a whole file into a string.
 * Args:
 *    filename: path of the file to read.
 * Return:
 *    the file content, or an empty string (after an error log) when the file
 *    cannot be opened.
 */
std::string read_file(const std::string& filename) {
    std::ifstream ifile(filename.c_str());

    if (!ifile.is_open()) {
        LOG(ERROR) << "Open file: [" << filename << "] failed.";
        return "";
    }

    // Stream the whole file buffer in one call instead of one byte at a time.
    std::ostringstream buf;
    buf << ifile.rdbuf();
    ifile.close();
    return buf.str();
}
}//namespace
seed: 1024
gaussian_sampling {
std: 0.5
cached: true
cache_size: 100000
}
optimizer {
type: "Adam"
base_lr: 0.05
momentum: 0.9
beta1: 0.9
beta2: 0.999
epsilon: 1e-08
}
async_es {
model_iter_id: 0
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <glog/logging.h>
#include <omp.h>
#include "evo_kit/async_es_agent.h"
#include "cartpole.h"
#include "paddle_api.h"
using namespace evo_kit;
using namespace paddle::lite_api;
const int ITER = 10;
// Run the CartPole policy once: feed `obs` (4 floats) as a 1x4 input tensor
// and return the 2 action probabilities produced by the predictor.
std::vector<float> forward(std::shared_ptr<PaddlePredictor> predictor, const float* obs) {
    std::unique_ptr<Tensor> in_tensor(std::move(predictor->GetInput(0)));
    in_tensor->Resize({1, 4});
    in_tensor->CopyFromCpu(obs);

    predictor->Run();

    std::vector<float> action_probs(2, 0.0);
    std::unique_ptr<const Tensor> out_tensor(std::move(predictor->GetOutput(0)));
    out_tensor->CopyToCpu(action_probs.data());
    return action_probs;
}
// Index of the first largest element of `vec` (0 for an empty vector).
int arg_max(const std::vector<float>& vec) {
    const auto best = std::max_element(vec.begin(), vec.end());
    return static_cast<int>(std::distance(vec.begin(), best));
}
// Play one CartPole episode with the agent's predictor, always taking the
// most probable action, and return the accumulated reward.
float evaluate(CartPole& env, std::shared_ptr<AsyncESAgent> agent) {
    float episode_reward = 0.0;
    env.reset();
    const float* obs = env.getState();
    std::shared_ptr<PaddlePredictor> predictor = agent->get_predictor();

    bool finished = false;

    while (!finished) {
        const int action = arg_max(forward(predictor, obs));
        env.step(action);

        const float step_reward = env.getReward();
        finished = env.isDone();
        episode_reward += step_reward;

        if (!finished) {
            obs = env.getState();
        }
    }

    return episode_reward;
}
int main(int argc, char* argv[]) {
std::vector<CartPole> envs;
for (int i = 0; i < ITER; ++i) {
envs.push_back(CartPole());
}
std::shared_ptr<AsyncESAgent> agent =
std::make_shared<AsyncESAgent>("./demo/paddle/cartpole_init_model",
"./demo/cartpole_config.prototxt");
// Clone agents to sample (explore).
std::vector< std::shared_ptr<AsyncESAgent> > sampling_agents;
for (int i = 0; i < ITER; ++i) {
sampling_agents.push_back(agent->clone());
}
std::vector<SamplingInfo> noisy_info;
std::vector<SamplingInfo> last_noisy_info;
std::vector<float> noisy_rewards(ITER, 0.0f);
std::vector<float> last_noisy_rewards;
noisy_info.resize(ITER);
omp_set_num_threads(10);
for (int epoch = 0; epoch < 100; ++epoch) {
last_noisy_info.clear();
last_noisy_rewards.clear();
if (epoch != 0) {
for (int i = 0; i < ITER; ++i) {
last_noisy_info.push_back(noisy_info[i]);
last_noisy_rewards.push_back(noisy_rewards[i]);
}
}
#pragma omp parallel for schedule(dynamic, 1)
for (int i = 0; i < ITER; ++i) {
std::shared_ptr<AsyncESAgent> sampling_agent = sampling_agents[i];
SamplingInfo info;
bool success = sampling_agent->add_noise(info);
float reward = evaluate(envs[i], sampling_agent);
noisy_info[i] = info;
noisy_rewards[i] = reward;
}
for (int i = 0; i < ITER; ++i) {
last_noisy_info.push_back(noisy_info[i]);
last_noisy_rewards.push_back(noisy_rewards[i]);
}
// NOTE: all parameters of sampling_agents will be updated
bool success = agent->update(last_noisy_info, last_noisy_rewards);
int reward = evaluate(envs[0], agent);
LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward;
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <algorithm>
#include <glog/logging.h>
#include <omp.h>
#include "cartpole.h"
#include "evo_kit/es_agent.h"
#include "paddle_api.h"
using namespace evo_kit;
using namespace paddle::lite_api;
const int ITER = 10;
// Run the CartPole policy once: feed `obs` (4 floats) as a 1x4 input tensor
// and return the 2 action probabilities produced by the predictor.
std::vector<float> forward(std::shared_ptr<PaddlePredictor> predictor, const float* obs) {
    std::unique_ptr<Tensor> in_tensor(std::move(predictor->GetInput(0)));
    in_tensor->Resize({1, 4});
    in_tensor->CopyFromCpu(obs);

    predictor->Run();

    std::vector<float> action_probs(2, 0.0);
    std::unique_ptr<const Tensor> out_tensor(std::move(predictor->GetOutput(0)));
    out_tensor->CopyToCpu(action_probs.data());
    return action_probs;
}
// Index of the first largest element of `vec` (0 for an empty vector).
int arg_max(const std::vector<float>& vec) {
    const auto best = std::max_element(vec.begin(), vec.end());
    return static_cast<int>(std::distance(vec.begin(), best));
}
// Play one CartPole episode with the agent's predictor, always taking the
// most probable action, and return the accumulated reward.
float evaluate(CartPole& env, std::shared_ptr<ESAgent> agent) {
    float episode_reward = 0.0;
    env.reset();
    const float* obs = env.getState();
    std::shared_ptr<PaddlePredictor> predictor = agent->get_predictor();

    bool finished = false;

    while (!finished) {
        const int action = arg_max(forward(predictor, obs));
        env.step(action);

        const float step_reward = env.getReward();
        finished = env.isDone();
        episode_reward += step_reward;

        if (!finished) {
            obs = env.getState();
        }
    }

    return episode_reward;
}
int main(int argc, char* argv[]) {
std::vector<CartPole> envs;
for (int i = 0; i < ITER; ++i) {
envs.push_back(CartPole());
}
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>("./demo/paddle/cartpole_init_model",
"./demo/cartpole_config.prototxt");
// Clone agents to sample (explore).
std::vector< std::shared_ptr<ESAgent> > sampling_agents;
for (int i = 0; i < ITER; ++i) {
sampling_agents.push_back(agent->clone());
}
std::vector<SamplingInfo> noisy_keys;
std::vector<float> noisy_rewards(ITER, 0.0f);
noisy_keys.resize(ITER);
omp_set_num_threads(10);
for (int epoch = 0; epoch < 100; ++epoch) {
#pragma omp parallel for schedule(dynamic, 1)
for (int i = 0; i < ITER; ++i) {
std::shared_ptr<ESAgent> sampling_agent = sampling_agents[i];
SamplingInfo key;
bool success = sampling_agent->add_noise(key);
float reward = evaluate(envs[i], sampling_agent);
noisy_keys[i] = key;
noisy_rewards[i] = reward;
}
// NOTE: all parameters of sampling_agents will be updated
bool success = agent->update(noisy_keys, noisy_rewards);
int reward = evaluate(envs[0], agent);
LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward;
}
}
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from paddle import fluid
def net(obs, act_dim):
    """Build the policy network and return its action-probability output.

    The network is one fully connected hidden layer of ``act_dim * 10`` units
    followed by a fully connected softmax layer of ``act_dim`` units.
    """
    hidden = fluid.layers.fc(obs, size=act_dim * 10)
    return fluid.layers.fc(hidden, size=act_dim, act='softmax')
if __name__ == '__main__':
    # Build the CartPole policy (4 observations -> 2 actions), initialize its
    # parameters on CPU and export it as an inference model.
    obs_dim, act_dim = 4, 2

    obs = fluid.layers.data(name="obs", shape=[obs_dim], dtype='float32')
    prob = net(obs, act_dim)

    exe = fluid.Executor(fluid.CPUPlace())
    exe.run(fluid.default_startup_program())

    # Written to ./cartpole_init_model as the two files 'model' and 'params'.
    fluid.io.save_inference_model(
        dirname='cartpole_init_model',
        feeded_var_names=['obs'],
        target_vars=[prob],
        params_filename='params',
        model_filename='model',
        executor=exe)
# Build script for the Torch CartPole demo: one executable, `parallel_main`,
# built from cartpole_solver_parallel.cc and linked against libtorch, EvoKit,
# gflags, protobuf, pthread and glog.
cmake_minimum_required (VERSION 2.6)
project (EvoKit_demo)
set(TARGET parallel_main)
# Require plain C++11 (no compiler extensions).
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# OpenMP is optional: its flags are only added when the package is found.
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
# libtorch is expected to be unpacked next to this file.
list(APPEND CMAKE_PREFIX_PATH "./libtorch")
# NOTE(review): the trailing `ON` is not a find_package keyword; it is
# presumably parsed as a component name - confirm and drop if unintended.
find_package(Torch REQUIRED ON)
set(demo "${PROJECT_SOURCE_DIR}/cartpole_solver_parallel.cc")
########## main ##########
# NOTE(review): `framework_src` is not set anywhere in this script, so it
# expands to nothing here - confirm whether it is still needed.
add_executable(${TARGET} ${demo} ${framework_src})
target_link_libraries(${TARGET} gflags protobuf pthread glog)
########## Torch libraries ##########
target_link_libraries(${TARGET} "${TORCH_LIBRARIES}")
########## EvoKit libraries ##########
# EvoKit's package files are looked up under ./libevokit/cmake/Torch.
list(APPEND CMAKE_PREFIX_PATH "./libevokit/cmake/Torch")
# NOTE(review): not REQUIRED, so a missing EvoKit is only detected at link time.
find_package(EvoKit)
target_link_libraries(${TARGET} "${EVOKIT_LIBRARY}")
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <torch/torch.h>
#include <memory>
#include <algorithm>
#include <glog/logging.h>
#include <omp.h>
#include "evo_kit/gaussian_sampling.h"
#include "evo_kit/es_agent.h"
#include "cartpole.h"
#include "model.h"
using namespace evo_kit;
const int ITER = 10;
// Play one CartPole episode with the agent's model, always taking the action
// with the highest predicted score, and return the accumulated reward.
float evaluate(CartPole& env, std::shared_ptr<ESAgent<Model>> agent) {
    float episode_reward = 0.0;
    env.reset();
    const float* obs = env.getState();

    bool finished = false;

    while (!finished) {
        torch::Tensor obs_tensor = torch::tensor({obs[0], obs[1], obs[2], obs[3]});
        torch::Tensor scores = agent->predict(obs_tensor);
        // max(-1) yields (values, indices); the index is the chosen action.
        const int action = std::get<1>(scores.max(-1)).item<long>();
        env.step(action);

        const float step_reward = env.getReward();
        finished = env.isDone();
        episode_reward += step_reward;

        if (!finished) {
            obs = env.getState();
        }
    }

    return episode_reward;
}
int main(int argc, char* argv[]) {
//google::InitGoogleLogging(argv[0]);
std::vector<CartPole> envs;
for (int i = 0; i < ITER; ++i) {
envs.push_back(CartPole());
}
auto model = std::make_shared<Model>(4, 2);
std::shared_ptr<ESAgent<Model>> agent = std::make_shared<ESAgent<Model>>(model,
"./cartpole_config.prototxt");
// Clone agents to sample (explore).
std::vector<std::shared_ptr<ESAgent<Model>>> sampling_agents;
for (int i = 0; i < ITER; ++i) {
sampling_agents.push_back(agent->clone());
}
std::vector<SamplingInfo> noisy_info;
std::vector<float> noisy_rewards(ITER, 0.0f);
noisy_info.resize(ITER);
for (int epoch = 0; epoch < 100; ++epoch) {
#pragma omp parallel for schedule(dynamic, 1)
for (int i = 0; i < ITER; ++i) {
auto sampling_agent = sampling_agents[i];
SamplingInfo info;
bool success = sampling_agent->add_noise(info);
float reward = evaluate(envs[i], sampling_agent);
noisy_info[i] = info;
noisy_rewards[i] = reward;
}
// Will also update parameters of sampling_agents
bool success = agent->update(noisy_info, noisy_rewards);
// Use original agent to evalute (without noise).
int reward = evaluate(envs[0], agent);
LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward;
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _MODEL_H
#define _MODEL_H
#include <torch/torch.h>
struct Model : public torch::nn::Module{
Model() = delete;
Model(const int obs_dim, const int act_dim) {
_obs_dim = obs_dim;
_act_dim = act_dim;
int hid1_size = act_dim * 10;
fc1 = register_module("fc1", torch::nn::Linear(obs_dim, hid1_size));
fc2 = register_module("fc2", torch::nn::Linear(hid1_size, act_dim));
}
torch::Tensor forward(torch::Tensor x) {
x = x.reshape({-1, _obs_dim});
x = torch::tanh(fc1->forward(x));
x = torch::softmax(fc2->forward(x), 1);
return x;
}
std::shared_ptr<Model> clone() {
std::shared_ptr<Model> model = std::make_shared<Model>(_obs_dim, _act_dim);
std::vector<torch::Tensor> parameters1 = parameters();
std::vector<torch::Tensor> parameters2 = model->parameters();
for (int i = 0; i < parameters1.size(); ++i) {
torch::Tensor src = parameters1[i].view({-1});
torch::Tensor des = parameters2[i].view({-1});
auto src_a = src.accessor<float, 1>();
auto des_a = des.accessor<float, 1>();
for (int j = 0; j < src.size(0); ++j) {
des_a[j] = src_a[j];
}
}
return model;
}
int _act_dim;
int _obs_dim;
torch::nn::Linear fc1{nullptr}, fc2{nullptr};
};
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_ASYNC_ES_AGENT_H
#define EVO_KIT_ASYNC_ES_AGENT_H
#include <stdlib.h>
#include <unordered_map>
#include "evo_kit/es_agent.h"
namespace evo_kit {
/* EvoKit agent with PaddleLite as backend. This agent supports asynchronous update.
* Users mainly focus on the following functions:
* 1. clone: clone an agent for multi-thread evaluation
* 2. add_noise: add noise into parameters.
* 3. update: update parameters given data collected during evaluation.
*/
// ESAgent subclass that keeps per-model-iteration state (previous predictors
// and parameter deltas) so that samples produced with older model versions
// can still be used in update().
class AsyncESAgent: public ESAgent {
public:
AsyncESAgent() {}
// Frees the buffers stored in _param_delta.
~AsyncESAgent();
/**
* @args:
* model_dir: directory of the model, forwarded to the ESAgent constructor.
* config_path: the path of configuration file.
* Note that AsyncESAgent will update the configuration file after calling the update function.
* Please use the up-to-date configuration.
*/
AsyncESAgent(
const std::string& model_dir,
const std::string& config_path);
/**
* @brief: Clone an agent for sampling.
*/
std::shared_ptr<AsyncESAgent> clone();
/**
* @brief: update parameters given data collected during evaluation.
* @args:
* noisy_info: sampling information returned by add_noise function.
* noisy_rewards: evaluation rewards.
*/
bool update(
std::vector<SamplingInfo>& noisy_info,
std::vector<float>& noisy_rewards);
private:
// Previously saved models, keyed by model_iter_id (filled by _load).
std::unordered_map<int, std::shared_ptr<PaddlePredictor>> _previous_predictors;
// Per model_iter_id: (old parameters - current parameters), filled by
// _compute_model_diff. The buffers are owned by this object.
std::unordered_map<int, float*> _param_delta;
// Path of the configuration file that _save rewrites.
std::string _config_path;
/**
* @brief: parse model_iter_id given a string of model directory.
* @return: an integer indicating the model_iter_id
*/
int _parse_model_iter_id(const std::string&);
/**
* @brief: compute the distance between current parameter and previous models.
*/
bool _compute_model_diff();
/**
* @brief: remove expired models to avoid overuse of disk space.
* @args:
* max_to_keep: the maximum number of models to keep locally.
*/
bool _remove_expired_model(int max_to_keep);
/**
* @brief: save up-to-date parameters to the disk.
*/
bool _save();
/**
* @brief: load all models in the model warehouse.
*/
bool _load();
/**
* @brief: load a model given the model directory.
*/
std::shared_ptr<PaddlePredictor> _load_previous_model(std::string model_dir);
};
} // namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_DEEPES_PADDLE_ES_AGENT_H_
#define EVO_KIT_DEEPES_PADDLE_ES_AGENT_H_
#include <vector>
#include "evo_kit/evo_kit.pb.h"
#include "evo_kit/optimizer_factory.h"
#include "evo_kit/sampling_factory.h"
#include "evo_kit/utils.h"
#include "paddle_api.h"
namespace evo_kit {
typedef paddle::lite_api::PaddlePredictor PaddlePredictor;
typedef paddle::lite_api::CxxConfig CxxConfig;
typedef paddle::lite_api::Tensor Tensor;
int64_t ShapeProduction(const paddle::lite_api::shape_t& shape);
/**
* @brief EvoKit agent with PaddleLite as backend.
* Users mainly focus on the following functions:
* 1. clone: clone an agent for multi-thread evaluation
* 2. add_noise: add noise into parameters.
* 3. update: update parameters given data collected during evaluation.
*
*/
class ESAgent {
public:
  // Members carry default initializers, so a default-constructed agent
  // (as created inside clone()) is safe to destroy.
  ESAgent() {}

  ~ESAgent();

  /**
   * @brief Build an agent from a model directory and a configuration file.
   *
   * @param model_dir    directory holding the model files to load.
   * @param config_path  path of the EvoKit configuration file.
   */
  ESAgent(const std::string& model_dir, const std::string& config_path);

  /**
   * @brief Clone a sampling agent
   *
   * Only cloned ESAgent can call `add_noise` function.
   * Each cloned ESAgent will have a copy of original parameters.
   * (support sampling in multi-thread way)
   */
  std::shared_ptr<ESAgent> clone();

  /**
   * @brief Update parameters of predictor based on ES algorithm.
   *
   * Only not cloned ESAgent can call `update` function.
   * Parameters of cloned agents will also be updated.
   *
   * @param noisy_info     sampling information returned by `add_noise`.
   * @param noisy_rewards  evaluation rewards, one per entry of noisy_info.
   */
  bool update(
      std::vector<SamplingInfo>& noisy_info,
      std::vector<float>& noisy_rewards);

  // copied parameters = original parameters + noise
  bool add_noise(SamplingInfo& sampling_info);

  /**
   * @brief Get paddle predict
   *
   * if _is_sampling_agent is true, will return predictor with added noise;
   * if _is_sampling_agent is false, will return predictor without added noise.
   */
  std::shared_ptr<PaddlePredictor> get_predictor();

  // get param size of model
  int64_t param_size() {
    return _param_size;
  }

protected:
  int64_t _calculate_param_size();

  std::shared_ptr<PaddlePredictor> _predictor;
  std::shared_ptr<PaddlePredictor> _sampling_predictor;
  std::shared_ptr<SamplingMethod> _sampling_method;
  std::shared_ptr<Optimizer> _optimizer;
  std::shared_ptr<EvoKitConfig> _config;
  std::shared_ptr<CxxConfig> _cxx_config;
  std::vector<std::string> _param_names;
  // malloc memory of noise and neg_gradients in advance.
  // Null by default: the destructor calls delete[] on these, which is only
  // well defined for a null or new[]-allocated pointer.
  float* _noise = nullptr;
  float* _neg_gradients = nullptr;
  int64_t _param_size = 0;
  // The destructor branches on this flag, so it must never be indeterminate.
  bool _is_sampling_agent = false;
};
} // namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/async_es_agent.h"
namespace evo_kit {
// Delegates model/config loading to ESAgent and remembers the config path
// so that _save() can rewrite the configuration file later.
AsyncESAgent::AsyncESAgent(
    const std::string& model_dir,
    const std::string& config_path)
    : ESAgent(model_dir, config_path), _config_path(config_path) {}
// Releases every parameter-delta buffer held in _param_delta.
AsyncESAgent::~AsyncESAgent() {
  for (auto it = _param_delta.begin(); it != _param_delta.end(); ++it) {
    delete[] it->second;
  }
}
/**
 * Save the current predictor as a new model in the model warehouse, bump
 * model_iter_id in the configuration, write the configuration back to
 * _config_path and prune expired models.
 *
 * @return false when called on a cloned (sampling) agent, when the
 *         configuration cannot be written, or when pruning fails.
 */
bool AsyncESAgent::_save() {
  using namespace paddle::lite_api;

  if (_is_sampling_agent) {
    LOG(ERROR) <<
        "[EvoKit] Cloned AsyncESAgent cannot call `save`.Please use original AsyncESAgent.";
    return false;
  }

  // The model being saved becomes the next iteration.
  const int model_iter_id = _config->async_es().model_iter_id() + 1;
  const std::string model_name = "model_iter_id-" + std::to_string(model_iter_id);
  const std::string model_path = _config->async_es().model_warehouse() + "/" + model_name;
  LOG(INFO) << "[save]model_path: " << model_path;
  _predictor->SaveOptimizedModel(model_path, LiteModelType::kProtobuf);

  // save config
  auto async_es = _config->mutable_async_es();
  async_es->set_model_iter_id(model_iter_id);
  if (!save_proto_conf(_config_path, *_config)) {
    LOG(ERROR) << "[]unable to save config for AsyncESAgent";
    return false;
  }

  return _remove_expired_model(_config->async_es().max_to_keep());
}
bool AsyncESAgent::_remove_expired_model(int max_to_keep) {
bool success = true;
std::string model_path = _config->async_es().model_warehouse();
std::vector<std::string> model_dirs = list_all_model_dirs(model_path);
int model_iter_id = _config->async_es().model_iter_id() + 1;
for (const auto& dir : model_dirs) {
int dir_model_iter_id = _parse_model_iter_id(dir);
if (model_iter_id - dir_model_iter_id >= max_to_keep) {
std::string rm_command = std::string("rm -rf ") + dir;
int ret = system(rm_command.c_str());
if (ret == 0) {
LOG(INFO) << "[EvoKit] remove expired Model: " << dir;
} else {
LOG(ERROR) << "[EvoKit] fail to remove expired Model: " << dir;
success = false;
return success;
}
}
}
return success;
}
/**
 * For every previously loaded model, store (old parameter - current
 * parameter) for all parameters, concatenated in _param_names order, in
 * _param_delta[model_iter_id].
 *
 * An existing buffer for the same model_iter_id is reused. This function
 * runs on every update() through _load(), and allocating a fresh buffer
 * each time would leak the previous one.
 *
 * @return always true.
 */
bool AsyncESAgent::_compute_model_diff() {
  for (const auto& kv : _previous_predictors) {
    const int model_iter_id = kv.first;
    const std::shared_ptr<PaddlePredictor>& old_predictor = kv.second;

    // operator[] yields nullptr for an id that has no buffer yet.
    float* diff = _param_delta[model_iter_id];
    if (diff == nullptr) {
      diff = new float[_param_size];
      _param_delta[model_iter_id] = diff;
    }
    memset(diff, 0, _param_size * sizeof(float));

    int64_t offset = 0;
    for (const std::string& param_name : _param_names) {
      auto des_tensor = old_predictor->GetTensor(param_name);
      auto src_tensor = _predictor->GetTensor(param_name);
      const float* des_data = des_tensor->data<float>();
      const float* src_data = src_tensor->data<float>();
      const int64_t tensor_size = ShapeProduction(src_tensor->shape());
      for (int64_t i = 0; i < tensor_size; ++i) {
        diff[i + offset] = des_data[i] - src_data[i];
      }
      offset += tensor_size;
    }
  }
  return true;
}
bool AsyncESAgent::_load() {
bool success = true;
std::string model_path = _config->async_es().model_warehouse();
std::vector<std::string> model_dirs = list_all_model_dirs(model_path);
if (model_dirs.size() == 0) {
int model_iter_id = _config->async_es().model_iter_id();
success = model_iter_id == 0 ? true : false;
if (!success) {
LOG(WARNING) << "[EvoKit] current_model_iter_id is nonzero, but no model is \
found at the dir: " << model_path;
}
return success;
}
for (auto& dir : model_dirs) {
int model_iter_id = _parse_model_iter_id(dir);
if (model_iter_id == -1) {
LOG(WARNING) << "[EvoKit] fail to parse model_iter_id: " << dir;
success = false;
return success;
}
std::shared_ptr<PaddlePredictor> predictor = _load_previous_model(dir);
if (predictor == nullptr) {
success = false;
LOG(WARNING) << "[EvoKit] fail to load model: " << dir;
return success;
}
_previous_predictors[model_iter_id] = predictor;
}
success = _compute_model_diff();
return success;
}
std::shared_ptr<PaddlePredictor> AsyncESAgent::_load_previous_model(std::string model_dir) {
using namespace paddle::lite_api;
// 1. Create CxxConfig
CxxConfig config;
config.set_model_file(model_dir + "/model");
config.set_param_file(model_dir + "/params");
config.set_valid_places({
Place{TARGET(kX86), PRECISION(kFloat)},
Place{TARGET(kHost), PRECISION(kFloat)}
});
// 2. Create PaddlePredictor by CxxConfig
std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<CxxConfig>(config);
return predictor;
}
std::shared_ptr<AsyncESAgent> AsyncESAgent::clone() {
std::shared_ptr<AsyncESAgent> new_agent = std::make_shared<AsyncESAgent>();
float* noise = new float [_param_size];
new_agent->_predictor = _predictor;
new_agent->_sampling_predictor = paddle::lite_api::CreatePaddlePredictor<CxxConfig>(*_cxx_config);
new_agent->_is_sampling_agent = true;
new_agent->_sampling_method = _sampling_method;
new_agent->_param_names = _param_names;
new_agent->_param_size = _param_size;
new_agent->_config = _config;
new_agent->_noise = noise;
return new_agent;
}
bool AsyncESAgent::update(
std::vector<SamplingInfo>& noisy_info,
std::vector<float>& noisy_rewards) {
CHECK(!_is_sampling_agent) << "[EvoKit] Cloned ESAgent cannot call update function. \
Please use original ESAgent.";
bool success = _load();
CHECK(success) << "[EvoKit] fail to load previous models.";
int current_model_iter_id = _config->async_es().model_iter_id();
// validate model_iter_id for each sample before the update
for (int i = 0; i < noisy_info.size(); ++i) {
int model_iter_id = noisy_info[i].model_iter_id();
if (model_iter_id != current_model_iter_id
&& _previous_predictors.count(model_iter_id) == 0) {
LOG(WARNING) << "[EvoKit] The sample with model_dir_id: " << model_iter_id \
<< " cannot match any local model";
success = false;
return success;
}
}
compute_centered_ranks(noisy_rewards);
memset(_neg_gradients, 0, _param_size * sizeof(float));
for (int i = 0; i < noisy_info.size(); ++i) {
int key = noisy_info[i].key(0);
float reward = noisy_rewards[i];
int model_iter_id = noisy_info[i].model_iter_id();
bool success = _sampling_method->resampling(key, _noise, _param_size);
CHECK(success) << "[EvoKit] resampling error occurs at sample: " << i;
float* delta = _param_delta[model_iter_id];
// compute neg_gradients
if (model_iter_id == current_model_iter_id) {
for (int64_t j = 0; j < _param_size; ++j) {
_neg_gradients[j] += _noise[j] * reward;
}
} else {
for (int64_t j = 0; j < _param_size; ++j) {
_neg_gradients[j] += (_noise[j] + delta[j]) * reward;
}
}
}
for (int64_t j = 0; j < _param_size; ++j) {
_neg_gradients[j] /= -1.0 * noisy_info.size();
}
//update
int64_t counter = 0;
for (std::string param_name : _param_names) {
std::unique_ptr<Tensor> tensor = _predictor->GetMutableTensor(param_name);
float* tensor_data = tensor->mutable_data<float>();
int64_t tensor_size = ShapeProduction(tensor->shape());
_optimizer->update(tensor_data, _neg_gradients + counter, tensor_size, param_name);
counter += tensor_size;
}
success = _save();
CHECK(success) << "[EvoKit] fail to save model.";
return true;
}
/**
 * Parse the run of decimal digits at the end of model_path.
 *
 * @return the parsed number, or -1 when the path does not end in a digit.
 */
int AsyncESAgent::_parse_model_iter_id(const std::string& model_path) {
  int parsed_id = -1;
  int place_value = 1;
  int pos = model_path.size() - 1;
  // Walk backwards from the last character while digits are seen.
  while (pos >= 0) {
    const char ch = model_path[pos];
    if (ch < '0' || ch > '9') {
      break;
    }
    if (parsed_id == -1) {
      parsed_id = 0;
    }
    parsed_id += place_value * (ch - '0');
    place_value *= 10;
    --pos;
  }
  return parsed_id;
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/es_agent.h"
#include <ctime>
namespace evo_kit {
// Returns the product of all dimensions in `shape` (1 for an empty shape).
int64_t ShapeProduction(const paddle::lite_api::shape_t& shape) {
  int64_t product = 1;
  for (auto it = shape.begin(); it != shape.end(); ++it) {
    product *= *it;
  }
  return product;
}
// Frees the noise buffer; the gradient buffer is released only when this is
// not a sampling agent.
ESAgent::~ESAgent() {
  const bool owns_gradients = !_is_sampling_agent;
  delete[] _noise;
  if (owns_gradients) {
    delete[] _neg_gradients;
  }
}
/**
 * Build an original (non-sampling) agent.
 *
 * Reads `model` and `param` from model_dir into memory, creates the
 * predictor from those buffers with kX86/kHost float places, loads the
 * EvoKit configuration from config_path, creates the sampling method and
 * optimizer from it, and allocates the noise and gradient buffers sized by
 * the total parameter count.
 */
ESAgent::ESAgent(const std::string& model_dir, const std::string& config_path) {
  using namespace paddle::lite_api;

  // 1. Create CxxConfig
  _cxx_config = std::make_shared<CxxConfig>();
  const std::string model_blob = read_file(model_dir + "/model");
  const std::string param_blob = read_file(model_dir + "/param");
  _cxx_config->set_model_buffer(
      model_blob.c_str(), model_blob.size(),
      param_blob.c_str(), param_blob.size());
  _cxx_config->set_valid_places({
    Place{TARGET(kX86), PRECISION(kFloat)},
    Place{TARGET(kHost), PRECISION(kFloat)}
  });

  // 2. Create the predictor
  _predictor = CreatePaddlePredictor<CxxConfig>(*_cxx_config);
  _is_sampling_agent = false;
  // Original agent can't be used to sample, so keep it same with _predictor for evaluating.
  _sampling_predictor = _predictor;

  // 3. Configuration-driven components
  _config = std::make_shared<EvoKitConfig>();
  load_proto_conf(config_path, *_config);
  _sampling_method = create_sampling_method(*_config);
  _optimizer = create_optimizer(_config->optimizer());

  // 4. Parameter bookkeeping and work buffers
  _param_names = _predictor->GetParamNames();
  _param_size = _calculate_param_size();
  _noise = new float [_param_size];
  _neg_gradients = new float [_param_size];
}
std::shared_ptr<ESAgent> ESAgent::clone() {
if (_is_sampling_agent) {
LOG(ERROR) << "[EvoKit] only original ESAgent can call `clone` function.";
return nullptr;
}
std::shared_ptr<ESAgent> new_agent = std::make_shared<ESAgent>();
float* noise = new float [_param_size];
new_agent->_sampling_predictor = paddle::lite_api::CreatePaddlePredictor<CxxConfig>(*_cxx_config);
new_agent->_predictor = _predictor;
new_agent->_cxx_config = _cxx_config;
new_agent->_is_sampling_agent = true;
new_agent->_sampling_method = _sampling_method;
new_agent->_param_names = _param_names;
new_agent->_config = _config;
new_agent->_param_size = _param_size;
new_agent->_noise = noise;
return new_agent;
}
bool ESAgent::update(
std::vector<SamplingInfo>& noisy_info,
std::vector<float>& noisy_rewards) {
if (_is_sampling_agent) {
LOG(ERROR) << "[EvoKit] Cloned ESAgent cannot call update function, please use original ESAgent.";
return false;
}
compute_centered_ranks(noisy_rewards);
memset(_neg_gradients, 0, _param_size * sizeof(float));
for (int i = 0; i < noisy_info.size(); ++i) {
int key = noisy_info[i].key(0);
float reward = noisy_rewards[i];
bool success = _sampling_method->resampling(key, _noise, _param_size);
CHECK(success) << "[EvoKit] resampling error occurs at sample: " << i;
for (int64_t j = 0; j < _param_size; ++j) {
_neg_gradients[j] += _noise[j] * reward;
}
}
for (int64_t j = 0; j < _param_size; ++j) {
_neg_gradients[j] /= -1.0 * noisy_info.size();
}
//update
int64_t counter = 0;
for (std::string param_name : _param_names) {
std::unique_ptr<Tensor> tensor = _predictor->GetMutableTensor(param_name);
float* tensor_data = tensor->mutable_data<float>();
int64_t tensor_size = ShapeProduction(tensor->shape());
_optimizer->update(tensor_data, _neg_gradients + counter, tensor_size, param_name);
counter += tensor_size;
}
return true;
}
/**
 * Sample noise and write (original parameters + noise) into the sampling
 * predictor.
 *
 * The sampling key and the current model_iter_id are recorded in
 * sampling_info so that update() can regenerate the same noise.
 *
 * @param sampling_info  receives the key (appended) and the model_iter_id.
 * @return false when called on an original (non-cloned) agent, true
 *         otherwise. A sampling failure aborts through CHECK.
 */
bool ESAgent::add_noise(SamplingInfo& sampling_info) {
  if (!_is_sampling_agent) {
    LOG(ERROR) <<
        "[EvoKit] Original ESAgent cannot call add_noise function, please use cloned ESAgent.";
    return false;
  }

  int key = 0;
  const bool success = _sampling_method->sampling(&key, _noise, _param_size);
  CHECK(success) << "[EvoKit] sampling error occurs while add_noise.";

  sampling_info.add_key(key);
  sampling_info.set_model_iter_id(_config->async_es().model_iter_id());

  int64_t counter = 0;
  for (const std::string& param_name : _param_names) {
    std::unique_ptr<Tensor> sample_tensor = _sampling_predictor->GetMutableTensor(param_name);
    std::unique_ptr<const Tensor> tensor = _predictor->GetTensor(param_name);
    const int64_t tensor_size = ShapeProduction(tensor->shape());
    // Fetch the data pointers once per tensor instead of once per element.
    float* sample_data = sample_tensor->mutable_data<float>();
    const float* base_data = tensor->data<float>();
    for (int64_t j = 0; j < tensor_size; ++j) {
      sample_data[j] = base_data[j] + _noise[counter + j];
    }
    counter += tensor_size;
  }
  return true;
}
// Returns _sampling_predictor. The constructor sets it to _predictor for an
// original agent, while clone() gives each clone a separate predictor.
std::shared_ptr<PaddlePredictor> ESAgent::get_predictor() {
return _sampling_predictor;
}
int64_t ESAgent::_calculate_param_size() {
int64_t param_size = 0;
for (std::string param_name : _param_names) {
std::unique_ptr<const Tensor> tensor = _predictor->GetTensor(param_name);
param_size += ShapeProduction(tensor->shape());
}
return param_size;
}
}//namespace
#!/bin/bash
# Builds and runs the torch CartPole demo.
# The script changes into demo/torch, so it must be started from the
# directory that contains demo/.

# Stop if the demo directory is missing: the steps below delete and create
# directories relative to the current one.
cd demo/torch || exit 1

#---------------libtorch-------------#
if [ ! -d "./libtorch" ];then
  echo "Cannot find the torch library: ./libtorch"
    echo "Downloading Torch library"
    wget -q https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip
    unzip -q libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip
    rm -rf libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip
    echo "Torch library Downloaded"
fi

#---------------libevokit-------------#
cp -r ../../libevokit ./
if [ ! -d "./libevokit" ];then
  echo "Cannot find the EvoKit library: ./libevokit"
  echo "Please put the EvoKit libraray to current folder according the instruction in README" # TODO: readme
  exit 1
fi

# proto
cp ../cartpole_config.prototxt ./

#----------------build---------------#
rm -rf build
mkdir build
cd build || exit 1
cmake ../
make -j10
cd -

#-----------------run----------------#
./build/parallel_main

cd ../..
#!/bin/bash
# Builds and installs the EvoKit library for one backend.
# Usage: <this script> paddle|torch
# Exits non-zero on a usage error or a missing prerequisite.

if [ $# -ne 1 ]; then
  echo "You must choose one framework (paddle/torch) to compile EvoKit."
  # A usage error is a failure; report it through the exit status.
  exit 1
fi

if [ "$1" = "paddle" ]; then
  #---------------paddlelite-------------#
  if [ ! -d "./inference_lite_lib" ];then
    echo "Cannot find the PaddleLite library: ./inference_lite_lib"
    echo "Please put the PaddleLite libraray to current folder according the instruction in README"
    exit 1
  fi

  # Initialization model
  if [ ! -d ./demo/paddle/cartpole_init_model ]; then
    unzip ./demo/paddle/cartpole_init_model.zip -d ./demo/paddle/
  fi

  FLAGS=" -DWITH_PADDLE=ON"
elif [ "$1" = "torch" ]; then
  FLAGS=" -DWITH_TORCH=ON"
else
  echo "Invalid arguments. [paddle/torch]"
  exit 1
fi

#----------------protobuf-------------#
cd core/proto/ || exit 1
protoc evo_kit/evo_kit.proto --cpp_out .
cd -

#----------------build---------------#
echo ${FLAGS}
rm -rf build
mkdir build
cd build || exit 1
# FLAGS is left unquoted on purpose so it expands to a separate argument.
cmake ../ ${FLAGS}
make -j10
make install
cd -
# Build file for the unit test executable `unit_test_main`.
# NOTE(review): CMAKE_CXX_STANDARD (used below) was introduced in CMake 3.1,
# while the declared minimum is 2.6 - confirm the intended minimum version.
cmake_minimum_required (VERSION 2.6)
project (EvoKit_demo)
set(TARGET unit_test_main)
# Require C++11 without compiler extensions.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
find_package(GTest REQUIRED)
# OpenMP is optional; its flags are appended only when it is found.
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
# Torch lib
# The prefix path must be appended before find_package(Torch) runs.
# NOTE(review): the trailing `ON` is passed to find_package as an extra
# argument; confirm whether it is intended.
list(APPEND CMAKE_PREFIX_PATH "../libtorch")
find_package(Torch REQUIRED ON)
# include and source
include_directories("${PROJECT_SOURCE_DIR}/include")
file(GLOB test_src "${PROJECT_SOURCE_DIR}/src/*.cc")
# make
# NOTE(review): core_src and agent_src are not set anywhere in this file's
# visible lines; if they are unset they expand to nothing.
add_executable(${TARGET} "unit_test.cc" ${core_src} ${agent_src} ${test_src})
target_link_libraries(${TARGET} gflags protobuf pthread glog gtest "${TORCH_LIBRARIES}")
########## EvoKit libraries ##########
# find_package(EvoKit) is not REQUIRED here; presumably it defines
# EVOKIT_LIBRARY - verify against the package's cmake files.
list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/libevokit/cmake/Torch")
find_package(EvoKit)
target_link_libraries(${TARGET} "${EVOKIT_LIBRARY}")
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef _TORCH_DEMO_MODEL_H
#define _TORCH_DEMO_MODEL_H
#include <torch/torch.h>
struct Model : public torch::nn::Module{
Model() = delete;
Model(const int obs_dim, const int act_dim, const int h1_size, const int h2_size) {
_obs_dim = obs_dim;
_act_dim = act_dim;
_h1_size = h1_size;
_h2_size = h2_size;
fc1 = register_module("fc1", torch::nn::Linear(obs_dim, h1_size));
fc2 = register_module("fc2", torch::nn::Linear(h1_size, h2_size));
fc3 = register_module("fc3", torch::nn::Linear(h2_size, act_dim));
}
torch::Tensor forward(torch::Tensor x) {
x = x.reshape({-1, _obs_dim});
x = torch::tanh(fc1->forward(x));
x = torch::tanh(fc2->forward(x));
x = torch::tanh(fc3->forward(x));
return x;
}
std::shared_ptr<Model> clone() {
std::shared_ptr<Model> model = std::make_shared<Model>(_obs_dim, _act_dim, _h1_size, _h2_size);
std::vector<torch::Tensor> parameters1 = parameters();
std::vector<torch::Tensor> parameters2 = model->parameters();
for (int i = 0; i < parameters1.size(); ++i) {
torch::Tensor src = parameters1[i].view({-1});
torch::Tensor des = parameters2[i].view({-1});
auto src_a = src.accessor<float, 1>();
auto des_a = des.accessor<float, 1>();
for (int j = 0; j < src.size(0); ++j) {
des_a[j] = src_a[j];
}
}
return model;
}
int _act_dim;
int _obs_dim;
int _h1_size;
int _h2_size;
torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr};
};
#endif
# EvoKit configuration with cached Gaussian sampling enabled.
# presumably the seed of the noise generator - confirm against evo_kit.proto
seed : 1024
gaussian_sampling {
std: 0.005
# cached sampling is switched on here, with a cache of 100000 entries
cached: true
cache_size : 100000
}
optimizer {
# NOTE(review): the unit tests set the type to lowercase "adam"/"sgd";
# confirm that "Adam" is matched by the optimizer factory.
type: "Adam",
base_lr: 0.005,
momentum: 0.9,
beta1: 0.9,
beta2: 0.999,
epsilon: 1e-8,
}
# EvoKit configuration with cached Gaussian sampling disabled.
# presumably the seed of the noise generator - confirm against evo_kit.proto
seed : 1024
gaussian_sampling {
std: 0.005
# cached sampling is switched off, so no cache_size is given
cached: false
}
optimizer {
# NOTE(review): the unit tests set the type to lowercase "adam"/"sgd";
# confirm that "Adam" is matched by the optimizer factory.
type: "Adam",
base_lr: 0.005,
momentum: 0.9,
beta1: 0.9,
beta2: 0.999,
epsilon: 1e-8,
}
#!/bin/bash
# Builds the EvoKit library with the torch backend, builds the unit tests
# and runs them. The script exits with the status of the unit test binary.
export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH

#---------------libtorch-------------#
if [ ! -d "./libtorch" ];then
  # The message names the directory that is actually checked above.
  echo "Cannot find the torch library: ./libtorch"
    echo "Downloading Torch library"
    wget -q https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip
    unzip -q libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip
    rm -rf libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip
    echo "Torch library Downloaded"
fi

#----------------protobuf-------------#
cd core/proto/ || exit 1
protoc evo_kit/evo_kit.proto --cpp_out .
cd -

#----------------build---------------#
sh scripts/lib_install.sh torch

#----------------build test---------------#
# Stop if a directory is missing: the steps below delete and create
# directories relative to the current one.
cd test || exit 1

cp -r ../libevokit ./
if [ ! -d "./libevokit" ];then
  echo "Cannot find the EvoKit library: ./libevokit"
  echo "Please put the EvoKit libraray to current folder according the instruction in README" # TODO: readme
  exit 1
fi

rm -rf build
mkdir build
cd build || exit 1
cmake ../
make -j10

#-----------------run----------------#
./unit_test_main
# Remember the test result; otherwise the status of the trailing `cd`
# would become the script's exit status and hide failing tests.
test_status=$?

cd ..
exit ${test_status}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include <vector>
#include "evo_kit/optimizer_factory.h"
#include <memory>
namespace evo_kit {
// Creates an optimizer of type "sgd" with base_lr 1.0 and checks that one
// update turns sgd_wei into the reference values in sgd_new.
TEST(SGDOptimizersTest, Method_update) {
std::shared_ptr<EvoKitConfig> config = std::make_shared<EvoKitConfig>();
auto optimizer_config = config->mutable_optimizer();
optimizer_config->set_base_lr(1.0);
optimizer_config->set_type("sgd");
std::shared_ptr<Optimizer> optimizer = create_optimizer(config->optimizer());
// Initial weights, gradients passed to update(), and the expected weights
// after one update.
float sgd_wei[10] = { 0.0 , 0.0 , 0.04216444, 0.0511456 , 0.04231584, 0.01089015, 0.06569759, 0.00127421,-0.00092832, 0.01128081};
float sgd_grad[10] = {-0.11992419,-0.0 , 0.07681337,-0.06616384, 0.00249889, 0.01158612,-0.3067452 , 0.36048946,-0.15820622,-0.20014143};
float sgd_new[10] = { 0.01199242, 0.0 , 0.0344831 , 0.05776198, 0.04206595, 0.00973154, 0.09637211,-0.03477474, 0.014892306, 0.03129495};
EXPECT_TRUE(optimizer->update(sgd_wei, sgd_grad, 10, "fc1"));
for (int i = 0; i < 10; ++i) {
EXPECT_FLOAT_EQ(sgd_new[i], sgd_wei[i]) << " i: " << i ;
}
// A second update with the same size succeeds; a different size under the
// same parameter name "fc1" is expected to be rejected.
EXPECT_TRUE(optimizer->update(sgd_wei, sgd_grad, 10, "fc1"));
EXPECT_FALSE(optimizer->update(sgd_wei, sgd_grad, 9, "fc1"));
}
// Creates an optimizer of type "adam" with base_lr 1.0 and checks that one
// update turns adam_wei into the reference values in adam_new.
TEST(AdamOptimizersTest, Method_update) {
std::shared_ptr<EvoKitConfig> config = std::make_shared<EvoKitConfig>();
auto optimizer_config = config->mutable_optimizer();
optimizer_config->set_base_lr(1.0);
optimizer_config->set_type("adam");
std::shared_ptr<Optimizer> optimizer = create_optimizer(config->optimizer());
// Initial weights, gradients passed to update(), and the expected weights
// after one update.
float adam_wei[10] = { 0.0 , 0.0 , 0.04216444, 0.0511456 , 0.04231584, 0.01089015, 0.06569759, 0.00127421,-0.00092832, 0.01128081};
float adam_grad[10] = {-0.11992419,-0.0 , 0.07681337,-0.06616384, 0.00249889, 0.01158612,-0.3067452 , 0.36048946,-0.15820622,-0.20014143};
float adam_new[10] = { 0.99999736, 0. ,-0.95783144, 1.05114082,-0.95755763,-0.98908256, 1.06569656,-0.99872491, 0.99906968, 1.01127923};
EXPECT_TRUE(optimizer->update(adam_wei, adam_grad, 10, "fc1"));
for (int i = 0; i < 10; ++i) {
EXPECT_FLOAT_EQ(adam_new[i], adam_wei[i]) << " i: " << i ;
}
// A second update with the same size succeeds; a different size under the
// same parameter name "fc1" is expected to be rejected.
EXPECT_TRUE(optimizer->update(adam_wei, adam_grad, 10, "fc1"));
EXPECT_FALSE(optimizer->update(adam_wei, adam_grad, 9, "fc1"));
}
} // namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include <vector>
#include "evo_kit/sampling_method.h"
#include "evo_kit/gaussian_sampling.h"
#include "evo_kit/cached_gaussian_sampling.h"
#include <memory>
namespace evo_kit {
// Fixture shared by the sampling tests. It builds a configuration and a
// sampler but does not call load_config; each test does that itself.
class SamplingTest : public ::testing::Test {
protected:
// Fills `config` (seed 1024, std 1.0, the given cached flag, cache_size)
// and creates a CachedGaussianSampling when `cached` is true, otherwise a
// GaussianSampling.
void init_sampling_method(bool cached) {
config = std::make_shared<EvoKitConfig>();
config->set_seed(1024);
auto sampling_config = config->mutable_gaussian_sampling();
sampling_config->set_std(1.0);
sampling_config->set_cached(cached);
sampling_config->set_cache_size(cache_size);
if (cached) {
sampler = std::make_shared<CachedGaussianSampling>();
} else {
sampler = std::make_shared<GaussianSampling>();
}
}
std::shared_ptr<SamplingMethod> sampler;
std::shared_ptr<EvoKitConfig> config;
// Output buffer handed to sampling()/resampling().
float array[3] = {1.0, 2.0, 3.0};
int cache_size = 100; // default cache_size 100
// Receives the key written by sampling().
int key = 0;
};
// load_config is expected to succeed for the non-cached sampler.
TEST_F(SamplingTest, GaussianSampling_load_config) {
init_sampling_method(false);
EXPECT_TRUE(sampler->load_config(*config));
}
// After load_config, sampling into a null buffer is expected to fail and
// sampling three values into `array` is expected to succeed.
TEST_F(SamplingTest, GaussianSampling_sampling) {
init_sampling_method(false);
sampler->load_config(*config);
EXPECT_FALSE(sampler->sampling(&key, nullptr, 0));
EXPECT_TRUE(sampler->sampling(&key, array, 3));
}
// After load_config, resampling into a null buffer is expected to fail and
// resampling three values with key 0 is expected to succeed.
TEST_F(SamplingTest, GaussianSampling_resampling) {
init_sampling_method(false);
sampler->load_config(*config);
EXPECT_FALSE(sampler->resampling(0, nullptr, 0));
EXPECT_TRUE(sampler->resampling(0, array, 3));
}
// load_config is expected to succeed for the cached sampler.
TEST_F(SamplingTest, CachedGaussianSampling_load_config) {
init_sampling_method(true);
EXPECT_TRUE(sampler->load_config(*config));
}
// Exercises CachedGaussianSampling::sampling before and after load_config.
TEST_F(SamplingTest, CachedGaussianSampling_sampling) {
init_sampling_method(true);
// Sampling before load_config is expected to fail.
EXPECT_FALSE(sampler->sampling(&key, array, 0));
sampler->load_config(*config);
// Rejected inputs: null buffer, negative size, size equal to cache_size.
EXPECT_FALSE(sampler->sampling(&key, nullptr, 0));
EXPECT_FALSE(sampler->sampling(&key, array, -1));
EXPECT_FALSE(sampler->sampling(&key, array, cache_size));
// Accepted inputs: size 0 and size 3 with a valid buffer.
EXPECT_TRUE(sampler->sampling(&key, array, 0));
EXPECT_TRUE(sampler->sampling(&key, array, 3));
}
// Exercises CachedGaussianSampling::resampling before and after load_config.
TEST_F(SamplingTest, CachedGaussianSampling_resampling) {
init_sampling_method(true);
// Resampling before load_config is expected to fail.
EXPECT_FALSE(sampler->resampling(0, array, 0));
sampler->load_config(*config);
// Rejected inputs: null buffer, negative size, size equal to cache_size.
EXPECT_FALSE(sampler->resampling(0, nullptr, 0));
EXPECT_FALSE(sampler->resampling(0, array, -1));
EXPECT_FALSE(sampler->resampling(0, array, cache_size));
// Sizes 0, 1 and 2 with key 0 are accepted.
EXPECT_TRUE(sampler->resampling(0, array, 0));
EXPECT_TRUE(sampler->resampling(0, array, 1));
EXPECT_TRUE(sampler->resampling(0, array, 2));
// A negative key is rejected; small non-negative keys are accepted.
EXPECT_FALSE(sampler->resampling(-1, array, 3));
EXPECT_TRUE(sampler->resampling(0, array, 3));
EXPECT_TRUE(sampler->resampling(1, array, 3));
EXPECT_TRUE(sampler->resampling(2, array, 3));
// Boundary: with size 3, key cache_size-3 is the last accepted key; the
// expectations below reject every key for which key + size exceeds
// cache_size.
EXPECT_TRUE(sampler->resampling(cache_size-3, array, 3));
EXPECT_FALSE(sampler->resampling(cache_size-2, array, 3));
EXPECT_FALSE(sampler->resampling(cache_size-1, array, 3));
EXPECT_FALSE(sampler->resampling(cache_size, array, 3));
EXPECT_FALSE(sampler->resampling(cache_size-3, array, cache_size-1));
}
} // namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include <torch/torch.h>
#include <glog/logging.h>
#include <omp.h>
#include "evo_kit/gaussian_sampling.h"
#include "evo_kit/es_agent.h"
#include "torch_demo_model.h"
#include <memory>
#include <vector>
#include <random>
#include <math.h>
namespace evo_kit {
// Fixture that trains an ESAgent<Model> to fit y = sin(x) and exposes loss
// helpers used by the TorchDemoTest cases.
class TorchDemoTest : public ::testing::Test {
protected:
    /**
     * @brief Negative mean squared error of `agent` on the first `size` samples.
     *
     * Each x_list[i] is fed to agent->predict() one at a time and element
     * [0][0] of the prediction is compared with y_list[i].
     *
     * @param x_list inputs
     * @param y_list expected outputs, indexed like x_list
     * @param size   number of leading samples to evaluate
     * @param agent  agent whose predict() is evaluated
     * @return -(sum of squared errors) / size, i.e. higher is better
     */
    float evaluate(const std::vector<float>& x_list, const std::vector<float>& y_list, int size,
                   std::shared_ptr<ESAgent<Model>> agent) {
        float total_loss = 0.0;
        for (int i = 0; i < size; ++i) {
            torch::Tensor x_input = torch::tensor(x_list[i], torch::dtype(torch::kFloat32));
            torch::Tensor predict_y = agent->predict(x_input);
            auto pred_y = predict_y.accessor<float,2>();
            float loss = pow((pred_y[0][0] - y_list[i]), 2);
            total_loss += loss;
        }
        return -total_loss / float(size);
    }

    // Mean squared error of `agent` on the training set (sign of evaluate() flipped).
    float train_loss() {
        return -1.0 * evaluate(x_list, y_list, train_data_size, agent);
    }

    // Mean squared error of `agent` on the test set (sign of evaluate() flipped).
    float test_loss() {
        return -1.0 * evaluate(test_x_list, test_y_list, test_data_size, agent);
    }

    // Absolute difference between the training loss and the test loss.
    float train_test_gap() {
        float train_lo = train_loss();
        float test_lo = test_loss();
        if (train_lo > test_lo) {
            return train_lo - test_lo;
        } else {
            return test_lo - train_lo;
        }
    }

    // Replaces `agent` with a fresh ESAgent wrapping a Model of the given
    // dimensions, configured from ../prototxt/torch_sin_config.prototxt.
    void init_agent(const int in_dim, const int out_dim, const int h1_size, const int h2_size) {
        std::shared_ptr<Model> model = std::make_shared<Model>(in_dim, out_dim, h1_size, h2_size);
        agent = std::make_shared<ESAgent<Model>>(model, "../prototxt/torch_sin_config.prototxt");
    }

    /**
     * @brief Generates the sin(x) data sets and trains `agent` for 1001 epochs.
     *
     * In every epoch each of the `iter` cloned agents adds noise to its
     * parameters and is scored on the training set; the original agent is
     * then updated from the collected keys and rewards.
     *
     * @param config_path path of the prototxt config handed to ESAgent
     */
    void train_agent(std::string config_path) {
        std::default_random_engine generator(0); // fix seed
        std::uniform_real_distribution<float> uniform(-3.0, 9.0);
        std::normal_distribution<float> norm;
        for (int i = 0; i < train_data_size; ++i) {
            float x_i = uniform(generator); // generate data in [-3, 9)
            float y_i = sin(x_i) + norm(generator) * 0.05; // label noise std 0.05
            x_list.push_back(x_i);
            y_list.push_back(y_i);
        }
        // Test labels are noise-free.
        for (int i = 0; i < test_data_size; ++i) {
            float x_i = uniform(generator);
            float y_i = sin(x_i);
            test_x_list.push_back(x_i);
            test_y_list.push_back(y_i);
        }

        std::shared_ptr<Model> model = std::make_shared<Model>(1, 1, 10, 5);
        agent = std::make_shared<ESAgent<Model>>(model, config_path);

        // Clone agents to sample (explore).
        std::vector<std::shared_ptr<ESAgent<Model>>> sampling_agents;
        for (int i = 0; i < iter; ++i) {
            sampling_agents.push_back(agent->clone());
        }

        std::vector<SamplingInfo> noisy_keys;
        std::vector<float> noisy_rewards(iter, 0.0f);
        noisy_keys.resize(iter);

        LOG(INFO) << "start training...";
        for (int epoch = 0; epoch < 1001; ++epoch) {
            #pragma omp parallel for schedule(dynamic, 1)
            for (int i = 0; i < iter; ++i) {
                auto sampling_agent = sampling_agents[i];
                SamplingInfo key;
                // A failed add_noise() would leave `key` without a sampling
                // key, so surface it as a test failure instead of ignoring it.
                EXPECT_TRUE(sampling_agent->add_noise(key));
                float reward = evaluate(x_list, y_list, train_data_size, sampling_agent);
                noisy_keys[i] = key;
                noisy_rewards[i] = reward;
            }
            // A rejected update means no training happened this epoch.
            EXPECT_TRUE(agent->update(noisy_keys, noisy_rewards));

            if (epoch % 100 == 0) {
                float reward = evaluate(test_x_list, test_y_list, test_data_size, agent);
                float train_reward = evaluate(x_list, y_list, train_data_size, agent);
                LOG(INFO) << "Epoch:" << epoch << " Loss: " << -reward << ", Train loss" << -train_reward;
            }
        }
    }

    // Class members declared here can be used by all tests in the test suite
    int train_data_size = 300;  // number of training samples generated by train_agent()
    int test_data_size = 100;   // number of test samples generated by train_agent()
    int iter = 10;              // number of cloned sampling agents evaluated per epoch
    std::vector<float> x_list;       // training inputs
    std::vector<float> y_list;       // training labels
    std::vector<float> test_x_list;  // test inputs
    std::vector<float> test_y_list;  // test labels
    std::shared_ptr<ESAgent<Model>> agent;  // agent under test
};
// Trains with torch_sin_config.prototxt and bounds the resulting train loss,
// test loss and the gap between them.
TEST_F(TorchDemoTest, TrainingEffectUseNormalSampling) {
    const std::string config_path = "../prototxt/torch_sin_config.prototxt";
    train_agent(config_path);

    EXPECT_LT(train_loss(), 0.05);
    EXPECT_LT(test_loss(), 0.05);
    EXPECT_LT(train_test_gap(), 0.03);
}
// Trains with torch_sin_cached_config.prototxt and bounds the resulting train
// loss, test loss and the gap between them.
TEST_F(TorchDemoTest, TrainingEffectTestUseTableSampling) {
    const std::string config_path = "../prototxt/torch_sin_cached_config.prototxt";
    train_agent(config_path);

    EXPECT_LT(train_loss(), 0.05);
    EXPECT_LT(test_loss(), 0.05);
    EXPECT_LT(train_test_gap(), 0.03);
}
// param_size() must report the expected parameter count for several model shapes.
TEST_F(TorchDemoTest,ParamSizeTest) {
    // Columns: in_dim, out_dim, h1_size, h2_size, expected parameter count.
    const int cases[][5] = {
        {1, 1, 10, 5, 81},
        {2, 3, 10, 5, 103},
        {1, 1, 1, 1, 6},
        {100, 2, 256, 64, 42434},
    };
    for (const auto& c : cases) {
        init_agent(c[0], c[1], c[2], c[3]);
        EXPECT_EQ(agent->param_size(), c[4]);
    }
}
} // namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
#include <vector>
#include "evo_kit/utils.h"
namespace evo_kit {
// Checks that compute_centered_ranks() returns true for a five-element,
// strictly decreasing reward vector.
TEST(UtilsTest, Method_compute_centered_ranks) {
    std::vector<float> reward_vec = {9.0f, 8.0f, 7.0f, 6.0f, 5.0f};
    EXPECT_EQ(compute_centered_ranks(reward_vec), true);
}
} // namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "gtest/gtest.h"
// Test runner entry point: hands the command line to GoogleTest and returns
// the result of RUN_ALL_TESTS() as the process exit code.
int main(int argc, char **argv) {
    ::testing::InitGoogleTest(&argc, argv);
    const int exit_code = RUN_ALL_TESTS();
    return exit_code;
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef TORCH_ESAGENT_H
#define TORCH_ESAGENT_H
#include <memory>
#include <string>
#include <vector>
#include "evo_kit/optimizer_factory.h"
#include "evo_kit/sampling_factory.h"
#include "evo_kit/utils.h"
#include "evo_kit/evo_kit.pb.h"
namespace evo_kit{
/**
* @brief DeepES agent for Torch.
*
* Our implemtation is flexible to support any model that subclass torch::nn::Module.
* That is, we can instantiate an agent by: es_agent = ESAgent<Model>(model);
* After that, users can clone an agent for multi-thread processing, add parametric noise for exploration,
* and update the parameteres, according to the evaluation resutls of noisy parameters.
*/
template <class T>
class ESAgent{
public:
ESAgent() {}
~ESAgent() {
delete[] _noise;
if (!_is_sampling_agent)
delete[] _neg_gradients;
}
ESAgent(std::shared_ptr<T> model, std::string config_path): _model(model) {
_is_sampling_agent = false;
_config = std::make_shared<EvoKitConfig>();
load_proto_conf(config_path, *_config);
_sampling_method = create_sampling_method(*_config);
_optimizer = create_optimizer(_config->optimizer());
// Origin agent can't be used to sample, so keep it same with _model for evaluating.
_sampling_model = model;
_param_size = _calculate_param_size();
_noise = new float [_param_size];
_neg_gradients = new float [_param_size];
}
/**
* @breif Clone a sampling agent
*
* Only cloned ESAgent can call `add_noise` function.
* Each cloned ESAgent will have a copy of original parameters.
* (support sampling in multi-thread way)
*/
std::shared_ptr<ESAgent> clone() {
std::shared_ptr<ESAgent> new_agent = std::make_shared<ESAgent>();
new_agent->_model = _model;
std::shared_ptr<T> new_model = _model->clone();
new_agent->_sampling_model = new_model;
new_agent->_is_sampling_agent = true;
new_agent->_sampling_method = _sampling_method;
new_agent->_param_size = _param_size;
float* new_noise = new float [_param_size];
new_agent->_noise = new_noise;
return new_agent;
}
/**
* @brief Use the model to predict.
*
* if _is_sampling_agent is true, will use the sampling model with added noise;
* if _is_sampling_agent is false, will use the original model without added noise.
*/
torch::Tensor predict(const torch::Tensor& x) {
return _sampling_model->forward(x);
}
/**
* @brief Update parameters of model based on ES algorithm.
*
* Only not cloned ESAgent can call `update` function.
* Parameters of cloned agents will also be updated.
*/
bool update(std::vector<SamplingInfo>& noisy_info, std::vector<float>& noisy_rewards) {
if (_is_sampling_agent) {
LOG(ERROR) << "[DeepES] Cloned ESAgent cannot call update function, please use original ESAgent.";
return false;
}
compute_centered_ranks(noisy_rewards);
memset(_neg_gradients, 0, _param_size * sizeof(float));
for (int i = 0; i < noisy_info.size(); ++i) {
int key = noisy_info[i].key(0);
float reward = noisy_rewards[i];
bool success = _sampling_method->resampling(key, _noise, _param_size);
CHECK(success) << "[DeepES] resampling error occurs at sample: " << i;
for (int64_t j = 0; j < _param_size; ++j) {
_neg_gradients[j] += _noise[j] * reward;
}
}
for (int64_t j = 0; j < _param_size; ++j) {
_neg_gradients[j] /= -1.0 * noisy_info.size();
}
//update
auto params = _model->named_parameters();
int64_t counter = 0;
for (auto& param: params) {
torch::Tensor tensor = param.value().view({-1});
auto tensor_a = tensor.accessor<float,1>();
_optimizer->update(tensor_a, _neg_gradients+counter, tensor.size(0), param.key());
counter += tensor.size(0);
}
return true;
}
// copied parameters = original parameters + noise
bool add_noise(SamplingInfo& sampling_info) {
bool success = true;
if (!_is_sampling_agent) {
LOG(ERROR) << "[DeepES] Original ESAgent cannot call add_noise function, please use cloned ESAgent.";
success = false;
return success;
}
auto sampling_params = _sampling_model->named_parameters();
auto params = _model->named_parameters();
int key = 0;
success = _sampling_method->sampling(&key, _noise, _param_size);
CHECK(success) << "[EvoKit] sampling error occurs while add_noise.";
sampling_info.add_key(key);
int64_t counter = 0;
for (auto& param: sampling_params) {
torch::Tensor sampling_tensor = param.value().view({-1});
std::string param_name = param.key();
torch::Tensor tensor = params.find(param_name)->view({-1});
auto sampling_tensor_a = sampling_tensor.accessor<float,1>();
auto tensor_a = tensor.accessor<float,1>();
for (int64_t j = 0; j < tensor.size(0); ++j) {
sampling_tensor_a[j] = tensor_a[j] + _noise[counter + j];
}
counter += tensor.size(0);
}
return success;
}
// get param size of model
int64_t param_size() {
return _param_size;
}
private:
int64_t _calculate_param_size() {
_param_size = 0;
auto params = _model->named_parameters();
for (auto& param: params) {
torch::Tensor tensor = param.value().view({-1});
_param_size += tensor.size(0);
}
return _param_size;
}
std::shared_ptr<T> _model;
std::shared_ptr<T> _sampling_model;
bool _is_sampling_agent;
std::shared_ptr<SamplingMethod> _sampling_method;
std::shared_ptr<Optimizer> _optimizer;
std::shared_ptr<EvoKitConfig> _config;
int64_t _param_size;
// malloc memory of noise and neg_gradients in advance.
float* _noise;
float* _neg_gradients;
};
}
#endif /* TORCH_ESAGENT_H */
...@@ -20,7 +20,7 @@ Performance of A2C on various envrionments ...@@ -20,7 +20,7 @@ Performance of A2C on various envrionments
## How to use ## How to use
### Dependencies ### Dependencies
+ [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) + [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL) + [parl>=1.2.1](https://github.com/PaddlePaddle/PARL)
+ gym==0.12.1 + gym==0.12.1
+ atari-py==0.1.7 + atari-py==0.1.7
......
...@@ -71,7 +71,10 @@ class AtariAgent(parl.Agent): ...@@ -71,7 +71,10 @@ class AtariAgent(parl.Agent):
lr = layers.data( lr = layers.data(
name='lr', shape=[1], dtype='float32', append_batch_size=False) name='lr', shape=[1], dtype='float32', append_batch_size=False)
entropy_coeff = layers.data( entropy_coeff = layers.data(
name='entropy_coeff', shape=[], dtype='float32') name='entropy_coeff',
shape=[1],
dtype='float32',
append_batch_size=False)
total_loss, pi_loss, vf_loss, entropy = self.alg.learn( total_loss, pi_loss, vf_loss, entropy = self.alg.learn(
obs, actions, advantages, target_values, lr, entropy_coeff) obs, actions, advantages, target_values, lr, entropy_coeff)
......
...@@ -25,7 +25,7 @@ from atari_agent import AtariAgent ...@@ -25,7 +25,7 @@ from atari_agent import AtariAgent
from collections import defaultdict from collections import defaultdict
from parl.env.atari_wrappers import wrap_deepmind from parl.env.atari_wrappers import wrap_deepmind
from parl.utils import logger, get_gpu_count, tensorboard from parl.utils import logger, get_gpu_count, summary
from parl.utils.scheduler import PiecewiseScheduler from parl.utils.scheduler import PiecewiseScheduler
from parl.utils.time_stat import TimeStat from parl.utils.time_stat import TimeStat
from parl.utils.window_stat import WindowStat from parl.utils.window_stat import WindowStat
...@@ -55,11 +55,6 @@ class Learner(object): ...@@ -55,11 +55,6 @@ class Learner(object):
assert get_gpu_count() == 1, 'Only support training in single GPU,\ assert get_gpu_count() == 1, 'Only support training in single GPU,\
Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]` .' Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]` .'
else:
cpu_num = os.environ.get('CPU_NUM')
assert cpu_num is not None and cpu_num == '1', 'Only support training in single CPU,\
Please set environment variable: `export CPU_NUM=1`.'
#========== Learner ========== #========== Learner ==========
self.total_loss_stat = WindowStat(100) self.total_loss_stat = WindowStat(100)
...@@ -191,7 +186,7 @@ class Learner(object): ...@@ -191,7 +186,7 @@ class Learner(object):
min_episode_steps = np.min(np.array(episode_steps).flatten()) min_episode_steps = np.min(np.array(episode_steps).flatten())
metric = { metric = {
'Sample steps': self.sample_total_steps, 'sample_steps': self.sample_total_steps,
'max_episode_rewards': max_episode_rewards, 'max_episode_rewards': max_episode_rewards,
'mean_episode_rewards': mean_episode_rewards, 'mean_episode_rewards': mean_episode_rewards,
'min_episode_rewards': min_episode_rewards, 'min_episode_rewards': min_episode_rewards,
...@@ -210,7 +205,7 @@ class Learner(object): ...@@ -210,7 +205,7 @@ class Learner(object):
for key, value in metric.items(): for key, value in metric.items():
if value is not None: if value is not None:
tensorboard.add_scalar(key, value, self.sample_total_steps) summary.add_scalar(key, value, self.sample_total_steps)
logger.info(metric) logger.info(metric)
......
...@@ -55,6 +55,7 @@ class MujocoAgent(parl.Agent): ...@@ -55,6 +55,7 @@ class MujocoAgent(parl.Agent):
act = self.fluid_executor.run( act = self.fluid_executor.run(
self.pred_program, feed={'obs': obs}, self.pred_program, feed={'obs': obs},
fetch_list=[self.pred_act])[0] fetch_list=[self.pred_act])[0]
act = np.squeeze(act)
return act return act
def learn(self, obs, act, reward, next_obs, terminal): def learn(self, obs, act, reward, next_obs, terminal):
......
...@@ -45,7 +45,6 @@ class ActorModel(parl.Model): ...@@ -45,7 +45,6 @@ class ActorModel(parl.Model):
hid1 = self.fc1(obs) hid1 = self.fc1(obs)
hid2 = self.fc2(hid1) hid2 = self.fc2(hid1)
means = self.fc3(hid2) means = self.fc3(hid2)
means = means
return means return means
......
...@@ -21,14 +21,12 @@ from mujoco_agent import MujocoAgent ...@@ -21,14 +21,12 @@ from mujoco_agent import MujocoAgent
from mujoco_model import MujocoModel from mujoco_model import MujocoModel
from parl.utils import logger, action_mapping, ReplayMemory from parl.utils import logger, action_mapping, ReplayMemory
MAX_EPISODES = 5000
TEST_EVERY_EPISODES = 20
ACTOR_LR = 1e-4 ACTOR_LR = 1e-4
CRITIC_LR = 1e-3 CRITIC_LR = 1e-3
GAMMA = 0.99 GAMMA = 0.99
TAU = 0.001 TAU = 0.001
MEMORY_SIZE = int(1e6) MEMORY_SIZE = int(1e6)
MIN_LEARN_SIZE = 1e4 MEMORY_WARMUP_SIZE = 1e4
BATCH_SIZE = 128 BATCH_SIZE = 128
REWARD_SCALE = 0.1 REWARD_SCALE = 0.1
ENV_SEED = 1 ENV_SEED = 1
...@@ -37,12 +35,9 @@ ENV_SEED = 1 ...@@ -37,12 +35,9 @@ ENV_SEED = 1
def run_train_episode(env, agent, rpm): def run_train_episode(env, agent, rpm):
obs = env.reset() obs = env.reset()
total_reward = 0 total_reward = 0
steps = 0
while True: while True:
steps += 1
batch_obs = np.expand_dims(obs, axis=0) batch_obs = np.expand_dims(obs, axis=0)
action = agent.predict(batch_obs.astype('float32')) action = agent.predict(batch_obs.astype('float32'))
action = np.squeeze(action)
# Add exploration noise, and clip to [-1.0, 1.0] # Add exploration noise, and clip to [-1.0, 1.0]
action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0) action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0)
...@@ -53,7 +48,7 @@ def run_train_episode(env, agent, rpm): ...@@ -53,7 +48,7 @@ def run_train_episode(env, agent, rpm):
rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done) rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done)
if rpm.size() > MIN_LEARN_SIZE: if rpm.size() > MEMORY_WARMUP_SIZE:
batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch( batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch(
BATCH_SIZE) BATCH_SIZE)
agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs,
...@@ -64,7 +59,7 @@ def run_train_episode(env, agent, rpm): ...@@ -64,7 +59,7 @@ def run_train_episode(env, agent, rpm):
if done: if done:
break break
return total_reward, steps return total_reward
def run_evaluate_episode(env, agent): def run_evaluate_episode(env, agent):
...@@ -73,7 +68,6 @@ def run_evaluate_episode(env, agent): ...@@ -73,7 +68,6 @@ def run_evaluate_episode(env, agent):
while True: while True:
batch_obs = np.expand_dims(obs, axis=0) batch_obs = np.expand_dims(obs, axis=0)
action = agent.predict(batch_obs.astype('float32')) action = agent.predict(batch_obs.astype('float32'))
action = np.squeeze(action)
action = action_mapping(action, env.action_space.low[0], action = action_mapping(action, env.action_space.low[0],
env.action_space.high[0]) env.action_space.high[0])
...@@ -101,19 +95,19 @@ def main(): ...@@ -101,19 +95,19 @@ def main():
rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim) rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim)
test_flag = 0 while rpm.size() < MEMORY_WARMUP_SIZE:
total_steps = 0 run_train_episode(env, agent, rpm)
while total_steps < args.train_total_steps:
train_reward, steps = run_train_episode(env, agent, rpm) episode = 0
total_steps += steps while episode < args.train_total_episode:
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) for i in range(50):
train_reward = run_train_episode(env, agent, rpm)
episode += 1
logger.info('Episode: {} Reward: {}'.format(episode, train_reward))
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
test_flag += 1
evaluate_reward = run_evaluate_episode(env, agent) evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format( logger.info('Episode {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward)) episode, evaluate_reward))
if __name__ == '__main__': if __name__ == '__main__':
...@@ -121,15 +115,10 @@ if __name__ == '__main__': ...@@ -121,15 +115,10 @@ if __name__ == '__main__':
parser.add_argument( parser.add_argument(
'--env', help='Mujoco environment name', default='HalfCheetah-v2') '--env', help='Mujoco environment name', default='HalfCheetah-v2')
parser.add_argument( parser.add_argument(
'--train_total_steps', '--train_total_episode',
type=int,
default=int(1e7),
help='maximum training steps')
parser.add_argument(
'--test_every_steps',
type=int, type=int,
default=int(1e4), default=int(1e4),
help='the step interval between two consecutive evaluations') help='maximum training episodes')
args = parser.parse_args() args = parser.parse_args()
......
## Reproduce DQN with PARL ## Reproduce DQN with PARL
Based on PARL, the DQN algorithm of deep reinforcement learning has been reproduced, reaching the same level of indicators as the paper in Atari benchmarks. Based on PARL, we provide a simple demonstration of DQN.
+ DQN in + DQN in
[Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html) [Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html)
### Atari games introduction ### Result
Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari games.
### Benchmark result Performance of DQN playing CartPole-v0
Mean episode rewards for 10 million training steps. <p align="left">
<img src="../QuickStart/performance.gif" alt="result" height="175"/>
<img src=".benchmark/merge.png" width = "1150" height ="230" alt="pong" /> <img src="cartpole.jpg" alt="result" height="175"/>
Performance of DQN on various environments
<p align="center">
<img src=".benchmark/table.png" alt="result" width="700"/>
</p> </p>
## How to use ## How to use
...@@ -25,13 +19,14 @@ Performance of DQN on various environments ...@@ -25,13 +19,14 @@ Performance of DQN on various environments
+ [parl](https://github.com/PaddlePaddle/PARL) + [parl](https://github.com/PaddlePaddle/PARL)
+ gym + gym
+ tqdm + tqdm
+ atari-py
+ [ale_python_interface](https://github.com/mgbellemare/Arcade-Learning-Environment)
### Start Training: ### Start Training:
``` ```
# To train a model for Pong game # To train a model for CartPole-v0 game
python train.py --rom ./rom_files/pong.bin python train.py
``` ```
> To train more games, you can install more rom files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms).
## DQN-Variants
For DQN variants such as Double DQN and Dueling DQN, please check [here](https://github.com/PaddlePaddle/PARL/tree/develop/examples/DQN_variant)
...@@ -12,47 +12,30 @@ ...@@ -12,47 +12,30 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import numpy as np import numpy as np
import numpy.random as random
import paddle.fluid as fluid import paddle.fluid as fluid
import parl
from parl import layers from parl import layers
from parl import Agent
from parl.utils import get_gpu_count, machine_info
class ElevatorAgent(Agent): class CartpoleAgent(parl.Agent):
def __init__(self, algorithm, obs_dim, action_dim): def __init__(self,
self._action_dim = action_dim algorithm,
self._obs_dim = obs_dim obs_dim,
self._update_target_steps = 1000 act_dim,
e_greed=0.1,
e_greed_decrement=0):
assert isinstance(obs_dim, int)
assert isinstance(act_dim, int)
self.obs_dim = obs_dim
self.act_dim = act_dim
super(CartpoleAgent, self).__init__(algorithm)
self._global_step = 0 self.global_step = 0
self.exploration_ratio = 0.9 self.update_target_steps = 200
self.exploration_decre = 1e-7
self.exploration_min = 0.1
super(ElevatorAgent, self).__init__(algorithm)
use_cuda = machine_info.is_gpu_available() self.e_greed = e_greed
if self.gpu_id >= 0: self.e_greed_decrement = e_greed_decrement
assert get_gpu_count() == 1, 'Only support training in single GPU,\
Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_YOU_WANT_TO_USE]` .'
else:
os.environ['CPU_NUM'] = str(1)
exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 1
exec_strategy.num_iteration_per_drop_scope = 10
build_strategy = fluid.BuildStrategy()
build_strategy.remove_unnecessary_lock = False
self.learn_pe = fluid.ParallelExecutor(
use_cuda=use_cuda,
main_program=self.learn_program,
build_strategy=build_strategy,
exec_strategy=exec_strategy,
)
def build_program(self): def build_program(self):
self.pred_program = fluid.Program() self.pred_program = fluid.Program()
...@@ -60,52 +43,51 @@ class ElevatorAgent(Agent): ...@@ -60,52 +43,51 @@ class ElevatorAgent(Agent):
with fluid.program_guard(self.pred_program): with fluid.program_guard(self.pred_program):
obs = layers.data( obs = layers.data(
name='obs', shape=[self._obs_dim], dtype='float32') name='obs', shape=[self.obs_dim], dtype='float32')
self._value = self.alg.define_predict(obs) self.value = self.alg.predict(obs)
with fluid.program_guard(self.learn_program): with fluid.program_guard(self.learn_program):
obs = layers.data( obs = layers.data(
name='obs', shape=[self._obs_dim], dtype='float32') name='obs', shape=[self.obs_dim], dtype='float32')
action = layers.data(name='act', shape=[1], dtype='int32') action = layers.data(name='act', shape=[1], dtype='int32')
reward = layers.data(name='reward', shape=[], dtype='float32') reward = layers.data(name='reward', shape=[], dtype='float32')
next_obs = layers.data( next_obs = layers.data(
name='next_obs', shape=[self._obs_dim], dtype='float32') name='next_obs', shape=[self.obs_dim], dtype='float32')
terminal = layers.data(name='terminal', shape=[], dtype='bool') terminal = layers.data(name='terminal', shape=[], dtype='bool')
self._cost = self.alg.define_learn(obs, action, reward, next_obs, self.cost = self.alg.learn(obs, action, reward, next_obs, terminal)
terminal)
def sample(self, obs): def sample(self, obs):
if self.exploration_ratio > self.exploration_min: sample = np.random.rand()
self.exploration_ratio -= self.exploration_decre if sample < self.e_greed:
q_values = self.predict(obs) act = np.random.randint(self.act_dim)
ret_actions = list()
for i in range(len(q_values)): # number of elevators
if (random.random() < self.exploration_ratio):
action = random.randint(0, self._action_dim)
else: else:
action = np.argmax(q_values[i]) act = self.predict(obs)
ret_actions.append(int(action)) self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement)
return ret_actions return act
def predict(self, obs): def predict(self, obs):
obs = np.expand_dims(obs, axis=0)
pred_Q = self.fluid_executor.run( pred_Q = self.fluid_executor.run(
self.pred_program, self.pred_program,
feed={'obs': obs.astype('float32')}, feed={'obs': obs.astype('float32')},
fetch_list=[self._value]) fetch_list=[self.value])[0]
return pred_Q[0] pred_Q = np.squeeze(pred_Q, axis=0)
act = np.argmax(pred_Q)
return act
def learn(self, obs, act, reward, next_obs, terminal): def learn(self, obs, act, reward, next_obs, terminal):
self._global_step += 1 if self.global_step % self.update_target_steps == 0:
if self._global_step % self._update_target_steps == 0: self.alg.sync_target()
self.alg.sync_target(self.gpu_id) self.global_step += 1
act = np.expand_dims(act, -1)
feed = { feed = {
'obs': obs.astype('float32'), 'obs': obs.astype('float32'),
'act': act.astype('int32'), 'act': act.astype('int32'),
'reward': reward, 'reward': reward,
'next_obs': next_obs.astype('float32'), 'next_obs': next_obs.astype('float32'),
'terminal': terminal 'terminal': terminal,
} }
cost = self.learn_pe.run(feed=feed, fetch_list=[self._cost.name])[0] cost = self.fluid_executor.run(
self.learn_program, feed=feed, fetch_list=[self.cost])[0]
return cost return cost
...@@ -12,24 +12,21 @@ ...@@ -12,24 +12,21 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
import paddle.fluid as fluid import paddle.fluid as fluid
from parl import layers
import numpy as np
import parl import parl
from parl import layers
class RLDispatcherModel(parl.Model): class CartpoleModel(parl.Model):
def __init__(self, act_dim): def __init__(self, act_dim):
self._act_dim = act_dim hid1_size = 128
self._fc_1 = layers.fc(size=512, act='relu') hid2_size = 128
self._fc_2 = layers.fc(size=256, act='relu') self.fc1 = layers.fc(size=hid1_size, act='relu')
self._fc_3 = layers.fc(size=128, act='tanh') self.fc2 = layers.fc(size=hid2_size, act='relu')
self._output = layers.fc(size=act_dim) self.fc3 = layers.fc(size=act_dim, act=None)
def value(self, obs): def value(self, obs):
_h_1 = self._fc_1(obs) h1 = self.fc1(obs)
_h_2 = self._fc_2(_h_1) h2 = self.fc2(h1)
_h_3 = self._fc_3(_h_2) Q = self.fc3(h2)
self._pred = self._output(_h_3) return Q
return self._pred
...@@ -12,103 +12,35 @@ ...@@ -12,103 +12,35 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import numpy as np # Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py
import copy
from collections import deque, namedtuple
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) import random
import collections
import numpy as np
class ReplayMemory(object): class ReplayMemory(object):
def __init__(self, max_size, state_shape, context_len): def __init__(self, max_size):
self.max_size = int(max_size) self.buffer = collections.deque(maxlen=max_size)
self.state_shape = state_shape
self.context_len = int(context_len)
self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8')
self.action = np.zeros((self.max_size, ), dtype='int32')
self.reward = np.zeros((self.max_size, ), dtype='float32')
self.isOver = np.zeros((self.max_size, ), dtype='bool')
self._curr_size = 0
self._curr_pos = 0
self._context = deque(maxlen=context_len - 1)
def append(self, exp): def append(self, exp):
"""append a new experience into replay memory self.buffer.append(exp)
"""
if self._curr_size < self.max_size:
self._assign(self._curr_pos, exp)
self._curr_size += 1
else:
self._assign(self._curr_pos, exp)
self._curr_pos = (self._curr_pos + 1) % self.max_size
if exp.isOver:
self._context.clear()
else:
self._context.append(exp)
def recent_state(self):
""" maintain recent state for training"""
lst = list(self._context)
states = [np.zeros(self.state_shape, dtype='uint8')] * \
(self._context.maxlen - len(lst))
states.extend([k.state for k in lst])
return states
def sample(self, idx): def sample(self, batch_size):
""" return state, action, reward, isOver, mini_batch = random.sample(self.buffer, batch_size)
note that some frames in state may be generated from last episode, obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], []
they should be removed from state
"""
state = np.zeros(
(self.context_len + 1, ) + self.state_shape, dtype=np.uint8)
state_idx = np.arange(idx,
idx + self.context_len + 1) % self._curr_size
# confirm that no frame was generated from last episode for experience in mini_batch:
has_last_episode = False s, a, r, s_p, done = experience
for k in range(self.context_len - 2, -1, -1): obs_batch.append(s)
to_check_idx = state_idx[k] action_batch.append(a)
if self.isOver[to_check_idx]: reward_batch.append(r)
has_last_episode = True next_obs_batch.append(s_p)
state_idx = state_idx[k + 1:] done_batch.append(done)
state[k + 1:] = self.state[state_idx]
break
if not has_last_episode: return np.array(obs_batch).astype('float32'), \
state = self.state[state_idx] np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\
np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32')
real_idx = (idx + self.context_len - 1) % self._curr_size
action = self.action[real_idx]
reward = self.reward[real_idx]
isOver = self.isOver[real_idx]
return state, reward, action, isOver
def __len__(self): def __len__(self):
return self._curr_size return len(self.buffer)
def size(self):
return self._curr_size
def _assign(self, pos, exp):
self.state[pos] = exp.state
self.reward[pos] = exp.reward
self.action[pos] = exp.action
self.isOver[pos] = exp.isOver
def sample_batch(self, batch_size):
"""sample a batch from replay memory for training
"""
batch_idx = np.random.randint(
self._curr_size - self.context_len - 1, size=batch_size)
batch_idx = (self._curr_pos + batch_idx) % self._curr_size
batch_exp = [self.sample(i) for i in batch_idx]
return self._process_batch(batch_exp)
def _process_batch(self, batch_exp):
state = np.asarray([e[0] for e in batch_exp], dtype='uint8')
reward = np.asarray([e[1] for e in batch_exp], dtype='float32')
action = np.asarray([e[2] for e in batch_exp], dtype='int8')
isOver = np.asarray([e[3] for e in batch_exp], dtype='bool')
return [state, action, reward, isOver]
...@@ -12,160 +12,100 @@ ...@@ -12,160 +12,100 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import argparse
import gym import gym
import paddle.fluid as fluid
import numpy as np import numpy as np
import os
import parl import parl
from atari_agent import AtariAgent from parl.utils import logger
from atari_model import AtariModel
from datetime import datetime
from replay_memory import ReplayMemory, Experience
from parl.utils import tensorboard, logger
from tqdm import tqdm
from utils import get_player
MEMORY_SIZE = 1e6
MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20
IMAGE_SIZE = (84, 84)
CONTEXT_LEN = 4
FRAME_SKIP = 4
UPDATE_FREQ = 4
GAMMA = 0.99
LEARNING_RATE = 3e-4
def run_train_episode(env, agent, rpm):
total_reward = 0
all_cost = []
state = env.reset()
steps = 0
while True:
steps += 1
context = rpm.recent_state()
context.append(state)
context = np.stack(context, axis=0)
action = agent.sample(context)
next_state, reward, isOver, _ = env.step(action)
rpm.append(Experience(state, action, reward, isOver))
# start training
if rpm.size() > MEMORY_WARMUP_SIZE:
if steps % UPDATE_FREQ == 0:
batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
args.batch_size)
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
batch_next_state = batch_all_state[:, 1:, :, :]
cost = agent.learn(batch_state, batch_action, batch_reward,
batch_next_state, batch_isOver)
all_cost.append(float(cost))
total_reward += reward
state = next_state
if isOver:
break
if all_cost:
logger.info('[Train]total_reward: {}, mean_cost: {}'.format(
total_reward, np.mean(all_cost)))
return total_reward, steps, np.mean(all_cost)
from cartpole_model import CartpoleModel
from cartpole_agent import CartpoleAgent
def run_evaluate_episode(env, agent): from replay_memory import ReplayMemory
state = env.reset()
LEARN_FREQ = 5 # update parameters every 5 steps
MEMORY_SIZE = 20000 # replay memory size
MEMORY_WARMUP_SIZE = 200 # store some experiences in the replay memory in advance
BATCH_SIZE = 32
LEARNING_RATE = 0.0005
GAMMA = 0.99 # discount factor of reward
def run_episode(agent, env, rpm):
total_reward = 0 total_reward = 0
obs = env.reset()
step = 0
while True: while True:
action = agent.predict(state) step += 1
state, reward, isOver, info = env.step(action) action = agent.sample(obs)
next_obs, reward, isOver, _ = env.step(action)
rpm.append((obs, action, reward, next_obs, isOver))
# train model
if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0):
(batch_obs, batch_action, batch_reward, batch_next_obs,
batch_isOver) = rpm.sample(BATCH_SIZE)
train_loss = agent.learn(batch_obs, batch_action, batch_reward,
batch_next_obs, batch_isOver)
total_reward += reward total_reward += reward
obs = next_obs
if isOver: if isOver:
break break
return total_reward return total_reward
def evaluate(agent, env, render=False):
# test part, run 5 episodes and average
eval_reward = []
for i in range(5):
obs = env.reset()
episode_reward = 0
isOver = False
while not isOver:
action = agent.predict(obs)
if render:
env.render()
obs, reward, isOver, _ = env.step(action)
episode_reward += reward
eval_reward.append(episode_reward)
return np.mean(eval_reward)
def main(): def main():
env = get_player( env = gym.make('CartPole-v0')
args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP) action_dim = env.action_space.n
test_env = get_player( obs_shape = env.observation_space.shape
args.rom,
image_size=IMAGE_SIZE, rpm = ReplayMemory(MEMORY_SIZE)
frame_skip=FRAME_SKIP,
context_len=CONTEXT_LEN) model = CartpoleModel(act_dim=action_dim)
rpm = ReplayMemory(MEMORY_SIZE, IMAGE_SIZE, CONTEXT_LEN) algorithm = parl.algorithms.DQN(
act_dim = env.action_space.n model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = CartpoleAgent(
model = AtariModel(act_dim, args.algo)
if args.algo == 'Double':
algorithm = parl.algorithms.DDQN(model, act_dim=act_dim, gamma=GAMMA)
elif args.algo in ['DQN', 'Dueling']:
algorithm = parl.algorithms.DQN(model, act_dim=act_dim, gamma=GAMMA)
agent = AtariAgent(
algorithm, algorithm,
act_dim=act_dim, obs_dim=obs_shape[0],
start_lr=LEARNING_RATE, act_dim=action_dim,
total_step=args.train_total_steps, e_greed=0.1, # explore
update_freq=UPDATE_FREQ) e_greed_decrement=1e-6
) # probability of exploring is decreasing during training
with tqdm(
total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar: while len(rpm) < MEMORY_WARMUP_SIZE: # warm up replay memory
while rpm.size() < MEMORY_WARMUP_SIZE: run_episode(agent, env, rpm)
total_reward, steps, _ = run_train_episode(env, agent, rpm)
pbar.update(steps) max_episode = 2000
# train # start train
test_flag = 0 episode = 0
pbar = tqdm(total=args.train_total_steps) while episode < max_episode:
total_steps = 0 # train part
max_reward = None for i in range(0, 50):
while total_steps < args.train_total_steps: total_reward = run_episode(agent, env, rpm)
# start epoch episode += 1
total_reward, steps, loss = run_train_episode(env, agent, rpm)
total_steps += steps eval_reward = evaluate(agent, env)
pbar.set_description('[train]exploration:{}'.format(agent.exploration)) logger.info('episode:{} test_reward:{}'.format(
tensorboard.add_scalar('dqn/score', total_reward, total_steps) episode, eval_reward))
tensorboard.add_scalar('dqn/loss', loss,
total_steps) # mean of total loss
tensorboard.add_scalar('dqn/exploration', agent.exploration,
total_steps)
pbar.update(steps)
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
test_flag += 1
pbar.write("testing")
eval_rewards = []
for _ in tqdm(range(3), desc='eval agent'):
eval_reward = run_evaluate_episode(test_env, agent)
eval_rewards.append(eval_reward)
logger.info(
"eval_agent done, (steps, eval_reward): ({}, {})".format(
total_steps, np.mean(eval_rewards)))
eval_test = np.mean(eval_rewards)
tensorboard.add_scalar('dqn/eval', eval_test, total_steps)
pbar.close()
if __name__ == '__main__': if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--rom', help='path of the rom of the atari game', required=True)
parser.add_argument(
'--batch_size', type=int, default=64, help='batch size for training')
parser.add_argument(
'--algo',
default='DQN',
help=
'DQN/DDQN/Dueling, represent DQN, double DQN, and dueling DQN respectively',
)
parser.add_argument(
'--train_total_steps',
type=int,
default=int(1e7),
help='maximum environmental steps of games')
parser.add_argument(
'--test_every_steps',
type=int,
default=100000,
help='the step interval between two consecutive evaluations')
args = parser.parse_args()
main() main()
## Reproduce DQN with PARL
Based on PARL, we reproduce the DQN algorithm of deep reinforcement learning and reach the same level of performance as the paper on the Atari benchmarks.
+ DQN in
[Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html)
### Atari games introduction
Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari games.
### Benchmark result
Mean episode rewards for 10 million training steps.
<img src=".benchmark/merge.png" width = "1150" height ="230" alt="pong" />
Performance of DQN on various environments
<p align="center">
<img src=".benchmark/table.png" alt="result" width="700"/>
</p>
## How to use
### Dependencies:
+ [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL)
+ gym
+ tqdm
+ atari-py
+ [ale_python_interface](https://github.com/mgbellemare/Arcade-Learning-Environment)
### Start Training:
```
# To train a model for Pong game
python train.py --rom ./rom_files/pong.bin
```
> To train more games, you can install more rom files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms).
...@@ -106,7 +106,7 @@ class AtariAgent(parl.Agent): ...@@ -106,7 +106,7 @@ class AtariAgent(parl.Agent):
'reward': reward, 'reward': reward,
'next_obs': next_obs.astype('float32'), 'next_obs': next_obs.astype('float32'),
'terminal': terminal, 'terminal': terminal,
'lr': lr 'lr': np.float32(lr)
} }
cost = self.fluid_executor.run( cost = self.fluid_executor.run(
self.learn_program, feed=feed, fetch_list=[self.cost])[0] self.learn_program, feed=feed, fetch_list=[self.cost])[0]
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import copy
from collections import deque, namedtuple
Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver'])
class ReplayMemory(object):
    """Circular buffer of single-frame experiences with frame-context sampling.

    Every experience stores one observation frame together with the action,
    reward and episode-termination flag. Training samples are rebuilt on the
    fly by gathering ``context_len + 1`` consecutive frames from the buffer.

    Args:
        max_size: capacity of the buffer; coerced with ``int()``.
        obs_shape (tuple): shape of a single observation frame.
        context_len: number of frames forming one network input; coerced
            with ``int()``.
    """

    def __init__(self, max_size, obs_shape, context_len):
        self.max_size = int(max_size)
        self.obs_shape = obs_shape
        self.context_len = int(context_len)

        self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8')
        self.action = np.zeros((self.max_size, ), dtype='int32')
        self.reward = np.zeros((self.max_size, ), dtype='float32')
        self.isOver = np.zeros((self.max_size, ), dtype='bool')

        self._curr_size = 0
        self._curr_pos = 0
        # Use the coerced integer: ``deque`` rejects a float ``maxlen``, so a
        # float ``context_len`` (accepted everywhere else here) used to fail.
        self._context = deque(maxlen=self.context_len - 1)

    def append(self, exp):
        """Append a new experience, overwriting the oldest slot once full.

        Args:
            exp: object exposing ``obs``, ``action``, ``reward`` and
                ``isOver`` attributes.
        """
        self._assign(self._curr_pos, exp)
        if self._curr_size < self.max_size:
            self._curr_size += 1
        self._curr_pos = (self._curr_pos + 1) % self.max_size
        # The recent-frame context must never span two episodes.
        if exp.isOver:
            self._context.clear()
        else:
            self._context.append(exp)

    def recent_obs(self):
        """Return the latest ``context_len - 1`` frames, zero-padded in front.

        Returns:
            list: observation arrays, oldest first. Frames missing since the
            start of the current episode are replaced by all-zero frames.
        """
        recent = list(self._context)
        missing = self._context.maxlen - len(recent)
        obs = [np.zeros(self.obs_shape, dtype='uint8')] * missing
        obs.extend([k.obs for k in recent])
        return obs

    def sample(self, idx):
        """Build one training sample starting at buffer index ``idx``.

        Frames that belong to a previous episode are replaced by zeros.

        Returns:
            tuple: ``(obs, reward, action, isOver)`` where ``obs`` stacks
            ``context_len + 1`` frames. Note that reward precedes action.
        """
        obs = np.zeros(
            (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8)
        obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size

        # Scan backwards for an episode end inside the context window; frames
        # at or before it come from the previous episode and stay zero.
        has_last_episode = False
        for k in range(self.context_len - 2, -1, -1):
            to_check_idx = obs_idx[k]
            if self.isOver[to_check_idx]:
                has_last_episode = True
                obs_idx = obs_idx[k + 1:]
                obs[k + 1:] = self.obs[obs_idx]
                break

        if not has_last_episode:
            obs = self.obs[obs_idx]

        # Action/reward/termination belong to the last frame of the context.
        real_idx = (idx + self.context_len - 1) % self._curr_size
        action = self.action[real_idx]
        reward = self.reward[real_idx]
        isOver = self.isOver[real_idx]
        return obs, reward, action, isOver

    def __len__(self):
        """Return the number of experiences currently stored."""
        return self._curr_size

    def size(self):
        """Return the number of experiences currently stored."""
        return self._curr_size

    def _assign(self, pos, exp):
        """Write the fields of ``exp`` into slot ``pos`` of every array."""
        self.obs[pos] = exp.obs
        self.reward[pos] = exp.reward
        self.action[pos] = exp.action
        self.isOver[pos] = exp.isOver

    def sample_batch(self, batch_size):
        """Sample ``batch_size`` training samples uniformly at random.

        The buffer must hold more than ``context_len + 1`` experiences,
        otherwise ``np.random.randint`` receives a non-positive bound.

        Returns:
            list: ``[obs, action, reward, isOver]`` batched arrays.
        """
        batch_idx = np.random.randint(
            self._curr_size - self.context_len - 1, size=batch_size)
        # Offset by the write position so a window never straddles it.
        batch_idx = (self._curr_pos + batch_idx) % self._curr_size
        batch_exp = [self.sample(i) for i in batch_idx]
        return self._process_batch(batch_exp)

    def _process_batch(self, batch_exp):
        """Stack ``sample`` tuples into ``[obs, action, reward, isOver]``."""
        obs = np.asarray([e[0] for e in batch_exp], dtype='uint8')
        reward = np.asarray([e[1] for e in batch_exp], dtype='float32')
        # NOTE(review): actions are stored as int32 but batched as int8;
        # presumably fine for small discrete action spaces - confirm.
        action = np.asarray([e[2] for e in batch_exp], dtype='int8')
        isOver = np.asarray([e[3] for e in batch_exp], dtype='bool')
        return [obs, action, reward, isOver]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import gym
import paddle.fluid as fluid
import numpy as np
import os
import parl
from atari_agent import AtariAgent
from atari_model import AtariModel
from datetime import datetime
from replay_memory import ReplayMemory, Experience
from parl.utils import summary, logger
from tqdm import tqdm
from utils import get_player
# Capacity handed to ReplayMemory as max_size (a float literal here).
MEMORY_SIZE = 1e6
# Number of stored experiences required before training updates start;
# main() keeps running episodes until the replay memory reaches this size.
MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20
# Passed to get_player as image_size and to ReplayMemory as the frame shape.
IMAGE_SIZE = (84, 84)
# Number of frames sliced out of a sampled batch as one network input.
CONTEXT_LEN = 4
# Passed to get_player as frame_skip.
FRAME_SKIP = 4
# The agent learns once every UPDATE_FREQ environment steps.
UPDATE_FREQ = 4
# Passed to the DQN/DDQN algorithm as gamma.
GAMMA = 0.99
# Passed to AtariAgent as start_lr.
LEARNING_RATE = 3e-4
def run_train_episode(env, agent, rpm):
    """Play one training episode, storing experiences and updating the agent.

    Reads the module-level ``args.batch_size`` as well as the
    ``MEMORY_WARMUP_SIZE``, ``UPDATE_FREQ`` and ``CONTEXT_LEN`` constants.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``.
        agent: agent exposing ``sample(context)`` and ``learn(...)``.
        rpm: replay memory exposing ``recent_obs``, ``append``, ``size`` and
            ``sample_batch``.

    Returns:
        tuple: ``(total_reward, steps, mean_cost)``. ``mean_cost`` is NaN when
        no learning update happened during the episode (e.g. while the
        replay memory is still warming up).
    """
    total_reward = 0
    all_cost = []
    obs = env.reset()
    steps = 0
    while True:
        steps += 1
        # Network input = recent frames kept by the memory + current frame.
        context = rpm.recent_obs()
        context.append(obs)
        context = np.stack(context, axis=0)

        action = agent.sample(context)
        next_obs, reward, isOver, _ = env.step(action)
        rpm.append(Experience(obs, action, reward, isOver))

        # Train only after warm-up, and only every UPDATE_FREQ steps.
        if rpm.size() > MEMORY_WARMUP_SIZE and steps % UPDATE_FREQ == 0:
            batch_all_obs, batch_action, batch_reward, batch_isOver = \
                rpm.sample_batch(args.batch_size)
            # Each sample holds CONTEXT_LEN + 1 frames: the first CONTEXT_LEN
            # form the current input, the last CONTEXT_LEN the next input.
            batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
            batch_next_obs = batch_all_obs[:, 1:, :, :]
            cost = agent.learn(batch_obs, batch_action, batch_reward,
                               batch_next_obs, batch_isOver)
            all_cost.append(float(cost))

        total_reward += reward
        obs = next_obs
        if isOver:
            break

    if not all_cost:
        # np.mean([]) would also yield NaN, but with a RuntimeWarning on
        # every warm-up episode; return the NaN explicitly instead.
        return total_reward, steps, np.nan

    mean_cost = np.mean(all_cost)
    logger.info('[Train]total_reward: {}, mean_cost: {}'.format(
        total_reward, mean_cost))
    return total_reward, steps, mean_cost
def run_evaluate_episode(env, agent):
    """Play one episode with ``agent.predict`` and return the summed reward.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``; ``step``
            must return a 4-tuple ``(obs, reward, isOver, info)``.
        agent: agent exposing ``predict(obs)``.

    Returns:
        The sum of all rewards collected until the episode is over.
    """
    episode_return = 0
    obs = env.reset()
    finished = False
    while not finished:
        obs, reward, finished, _info = env.step(agent.predict(obs))
        episode_return += reward
    return episode_return
def main():
    """Train a DQN-family agent on an Atari game and evaluate it periodically.

    Reads the parsed command-line options from the module-level ``args``
    (``rom``, ``algo``, ``train_total_steps``, ``test_every_steps``).

    Raises:
        ValueError: if ``args.algo`` is not 'DQN', 'Double' or 'Dueling'.
    """
    # Fail fast: previously an unknown algo left `algorithm` unbound and
    # crashed later with UnboundLocalError, after allocating the replay buffer.
    if args.algo not in ('DQN', 'Double', 'Dueling'):
        raise ValueError(
            "unsupported algo '{}', expected one of DQN/Double/Dueling".format(
                args.algo))

    env = get_player(
        args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP)
    test_env = get_player(
        args.rom,
        image_size=IMAGE_SIZE,
        frame_skip=FRAME_SKIP,
        context_len=CONTEXT_LEN)
    rpm = ReplayMemory(MEMORY_SIZE, IMAGE_SIZE, CONTEXT_LEN)
    act_dim = env.action_space.n

    model = AtariModel(act_dim, args.algo)
    if args.algo == 'Double':
        algorithm = parl.algorithms.DDQN(model, act_dim=act_dim, gamma=GAMMA)
    else:  # 'DQN' or 'Dueling'
        algorithm = parl.algorithms.DQN(model, act_dim=act_dim, gamma=GAMMA)
    agent = AtariAgent(
        algorithm,
        act_dim=act_dim,
        start_lr=LEARNING_RATE,
        total_step=args.train_total_steps,
        update_freq=UPDATE_FREQ)

    # Fill the replay memory before any evaluation/logging starts.
    with tqdm(
            total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar:
        while rpm.size() < MEMORY_WARMUP_SIZE:
            _, steps, _ = run_train_episode(env, agent, rpm)
            pbar.update(steps)

    # train
    test_flag = 0
    total_steps = 0
    # Context manager closes the progress bar even if training raises.
    with tqdm(total=args.train_total_steps) as pbar:
        while total_steps < args.train_total_steps:
            # start epoch
            total_reward, steps, loss = run_train_episode(env, agent, rpm)
            total_steps += steps
            pbar.set_description('[train]exploration:{}'.format(
                agent.exploration))
            summary.add_scalar('dqn/score', total_reward, total_steps)
            summary.add_scalar('dqn/loss', loss,
                               total_steps)  # mean of total loss
            summary.add_scalar('dqn/exploration', agent.exploration,
                               total_steps)
            pbar.update(steps)

            if total_steps // args.test_every_steps >= test_flag:
                # Skip every evaluation point this episode stepped over so
                # that only one evaluation is run.
                while total_steps // args.test_every_steps >= test_flag:
                    test_flag += 1
                pbar.write("testing")
                eval_rewards = []
                for _ in tqdm(range(3), desc='eval agent'):
                    eval_reward = run_evaluate_episode(test_env, agent)
                    eval_rewards.append(eval_reward)
                logger.info(
                    "eval_agent done, (steps, eval_reward): ({}, {})".format(
                        total_steps, np.mean(eval_rewards)))
                eval_test = np.mean(eval_rewards)
                summary.add_scalar('dqn/eval', eval_test, total_steps)
if __name__ == '__main__':
    # Command-line entry point. `args` is deliberately a module-level name:
    # main() and run_train_episode() read it as a global.
    parser = argparse.ArgumentParser()
    parser.add_argument(
        '--rom', help='path of the rom of the atari game', required=True)
    parser.add_argument(
        '--batch_size', type=int, default=64, help='batch size for training')
    # main() selects double DQN with the value 'Double' (not 'DDQN'), so the
    # help text and the accepted choices are aligned with that.
    parser.add_argument(
        '--algo',
        default='DQN',
        choices=['DQN', 'Double', 'Dueling'],
        help=
        'DQN/Double/Dueling, represent DQN, double DQN, and dueling DQN respectively',
    )
    parser.add_argument(
        '--train_total_steps',
        type=int,
        default=int(1e7),
        help='maximum environmental steps of games')
    parser.add_argument(
        '--test_every_steps',
        type=int,
        default=100000,
        help='the step interval between two consecutive evaluations')

    args = parser.parse_args()
    main()
...@@ -34,7 +34,7 @@ Then we can start the distributed training by running: ...@@ -34,7 +34,7 @@ Then we can start the distributed training by running:
python train.py python train.py
``` ```
Training result will be saved in `train_log` with training curve that can be visualized in tensorboard data. Training result will be saved in `train_log` with training curve.
### Reference ### Reference
+ [Ray](https://github.com/ray-project/ray) + [Ray](https://github.com/ray-project/ray)
......
...@@ -23,7 +23,7 @@ from obs_filter import MeanStdFilter ...@@ -23,7 +23,7 @@ from obs_filter import MeanStdFilter
from mujoco_agent import MujocoAgent from mujoco_agent import MujocoAgent
from mujoco_model import MujocoModel from mujoco_model import MujocoModel
from noise import SharedNoiseTable from noise import SharedNoiseTable
from parl.utils import logger, tensorboard from parl.utils import logger, summary
from parl.utils.window_stat import WindowStat from parl.utils.window_stat import WindowStat
from six.moves import queue from six.moves import queue
from actor import Actor from actor import Actor
...@@ -202,7 +202,7 @@ class Learner(object): ...@@ -202,7 +202,7 @@ class Learner(object):
logger.info(metrics) logger.info(metrics)
for k, v in metrics.items(): for k, v in metrics.items():
if v is not None: if v is not None:
tensorboard.add_scalar(k, v, self.sample_total_steps) summary.add_scalar(k, v, self.sample_total_steps)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -24,7 +24,7 @@ from atari_model import AtariModel ...@@ -24,7 +24,7 @@ from atari_model import AtariModel
from atari_agent import AtariAgent from atari_agent import AtariAgent
from collections import defaultdict from collections import defaultdict
from parl.env.atari_wrappers import wrap_deepmind from parl.env.atari_wrappers import wrap_deepmind
from parl.utils import logger, get_gpu_count, tensorboard from parl.utils import logger, get_gpu_count, summary
from parl.utils.scheduler import PiecewiseScheduler from parl.utils.scheduler import PiecewiseScheduler
from parl.utils.time_stat import TimeStat from parl.utils.time_stat import TimeStat
from parl.utils.window_stat import WindowStat from parl.utils.window_stat import WindowStat
...@@ -313,7 +313,7 @@ class Learner(object): ...@@ -313,7 +313,7 @@ class Learner(object):
for key, value in metric.items(): for key, value in metric.items():
if value is not None: if value is not None:
tensorboard.add_scalar(key, value, self.sample_total_steps) summary.add_scalar(key, value, self.sample_total_steps)
logger.info(metric) logger.info(metric)
......
...@@ -58,7 +58,10 @@ class AtariAgent(parl.Agent): ...@@ -58,7 +58,10 @@ class AtariAgent(parl.Agent):
lr = layers.data( lr = layers.data(
name='lr', shape=[1], dtype='float32', append_batch_size=False) name='lr', shape=[1], dtype='float32', append_batch_size=False)
entropy_coeff = layers.data( entropy_coeff = layers.data(
name='entropy_coeff', shape=[], dtype='float32') name='entropy_coeff',
shape=[1],
dtype='float32',
append_batch_size=False)
self.learn_reader = fluid.layers.create_py_reader_by_data( self.learn_reader = fluid.layers.create_py_reader_by_data(
capacity=32, capacity=32,
......
...@@ -22,7 +22,7 @@ import parl ...@@ -22,7 +22,7 @@ import parl
from atari_model import AtariModel from atari_model import AtariModel
from atari_agent import AtariAgent from atari_agent import AtariAgent
from parl.env.atari_wrappers import wrap_deepmind from parl.env.atari_wrappers import wrap_deepmind
from parl.utils import logger, tensorboard, get_gpu_count from parl.utils import logger, summary, get_gpu_count
from parl.utils.scheduler import PiecewiseScheduler from parl.utils.scheduler import PiecewiseScheduler
from parl.utils.time_stat import TimeStat from parl.utils.time_stat import TimeStat
from parl.utils.window_stat import WindowStat from parl.utils.window_stat import WindowStat
...@@ -121,7 +121,9 @@ class Learner(object): ...@@ -121,7 +121,9 @@ class Learner(object):
yield [ yield [
obs_np, actions_np, behaviour_logits_np, rewards_np, obs_np, actions_np, behaviour_logits_np, rewards_np,
dones_np, self.lr, self.entropy_coeff dones_np,
np.float32(self.lr),
np.array([self.entropy_coeff], dtype='float32')
] ]
def run_learn(self): def run_learn(self):
...@@ -219,7 +221,7 @@ class Learner(object): ...@@ -219,7 +221,7 @@ class Learner(object):
min_episode_steps = np.min(np.array(episode_steps).flatten()) min_episode_steps = np.min(np.array(episode_steps).flatten())
metric = { metric = {
'Sample steps': self.sample_total_steps, 'sample_steps': self.sample_total_steps,
'max_episode_rewards': max_episode_rewards, 'max_episode_rewards': max_episode_rewards,
'mean_episode_rewards': mean_episode_rewards, 'mean_episode_rewards': mean_episode_rewards,
'min_episode_rewards': min_episode_rewards, 'min_episode_rewards': min_episode_rewards,
...@@ -242,7 +244,7 @@ class Learner(object): ...@@ -242,7 +244,7 @@ class Learner(object):
for key, value in metric.items(): for key, value in metric.items():
if value is not None: if value is not None:
tensorboard.add_scalar(key, value, self.sample_total_steps) summary.add_scalar(key, value, self.sample_total_steps)
logger.info(metric) logger.info(metric)
......
# LiftSim基线
## 简介
基于PARL库实现Deep Q-network算法,应用于[RLSchool][rlschool]库中的电梯调度模拟环境[LiftSim][liftsim]
## 依赖库
- paddlepaddle >= 1.5.1
- parl >= 1.1.2
- rlschool >= 0.0.1
Windows版本仅支持Python3.5及以上版本。
## 运行
```python
python demo.py
```
## Benchmark
<img src="rl_10.png" width="400"/>
Accumulated Reward:每3600 steps内reward的总和,可体现电梯调度在单位时间(模拟环境0.5小时)内的效率。
[rlschool]: https://github.com/PaddlePaddle/RLSchool
[liftsim]: https://github.com/PaddlePaddle/RLSchool/tree/master/rlschool/liftsim
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import parl
import numpy as np
import numpy.random as random
from copy import deepcopy
from collections import deque
from rlschool import EPSILON, HUGE
from rl_benchmark.model import RLDispatcherModel
from rl_benchmark.agent import ElevatorAgent
from parl.algorithms import DQN
from parl.utils import ReplayMemory
MEMORY_SIZE = 1000000
BATCH_SIZE = 64
class RL_dispatcher():
    """
    An RL benchmark for elevator system.

    Wires a replay memory, an RLDispatcherModel, a DQN algorithm and an
    ElevatorAgent together and trains the agent online on ``env``.

    NOTE(review): the indentation of this class was reconstructed from a
    flattened listing; the nesting below is the only one under which the code
    runs sensibly, but the exact placement of the final ``save`` call in
    ``run_episode`` should be confirmed against the repository.
    """

    def __init__(self, env, max_episode):
        """Build the model, algorithm, agent and replay memory.

        Args:
            env: environment exposing ``observation_space``, ``action_space``,
                ``elevator_num``, ``state``, ``step``, ``reset``,
                ``log_notice`` and ``statistics``.
            max_episode: upper bound compared against the global *step*
                counter in ``run_episode`` (despite the name, it is a number
                of steps, not of episodes).
        """
        self.env = env
        self._obs_dim = env.observation_space
        self._act_dim = env.action_space
        self._global_step = 0
        self.max_episode = max_episode
        # presumably (max_size, obs_dim, act_dim) of parl.utils.ReplayMemory
        # - TODO confirm
        self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1)
        self._model = RLDispatcherModel(self._act_dim)
        hyperparas = {
            'action_dim': self._act_dim,
            'lr': 5.0e-4,
            'gamma': 0.998
        }
        self._algorithm = DQN(self._model, hyperparas)
        self._agent = ElevatorAgent(self._algorithm, self._obs_dim,
                                    self._act_dim)
        # Learning starts once the step counter / memory size exceed this.
        self._warm_up_size = 2000
        # Window length of the loss average, and its reporting period (steps).
        self._statistic_freq = 1000
        self._loss_queue = deque()

    def run_episode(self):
        """Interact with the environment until ``max_episode`` steps are done.

        Logs the reward accumulated over every 3600 steps together with the
        environment statistics, and saves the agent to ``./model.ckpt``.
        """
        self.env.reset()
        acc_reward = 0.0

        while self._global_step < self.max_episode:
            # self.env.render()
            state = self.env.state
            action = self._agent.sample(state)
            # NOTE(review): next state, done and info are not used here; the
            # next loop iteration re-reads self.env.state instead.
            state_, reward, done, info = self.env.step(action)
            output_info = self.learn_step(state, action, reward)
            acc_reward += reward
            if (isinstance(output_info, dict) and len(output_info) > 0):
                self.env.log_notice("%s", output_info)
            if (self._global_step % 3600 == 0):
                self.env.log_notice(
                    "Accumulated Reward: %f, Mansion Status: %s", acc_reward,
                    self.env.statistics)
                acc_reward = 0.0

        # NOTE(review): assumed to run once after the loop - confirm.
        self._agent.save('./model.ckpt')

    def learn_step(self, state, action, r):
        """Store the previous transition and run one learning update.

        Args:
            state: current observation, indexable per elevator.
            action: action taken in ``state``, indexable per elevator.
            r: reward received for ``action``.

        Returns:
            dict: empty, or holding the averaged recent loss under the key
            "Temporal Difference Error(Average)" every ``_statistic_freq``
            steps once learning has started.
        """
        self._global_step += 1
        # The transition (last_obs, last_action, last_reward, current obs) is
        # only complete one step later, hence the use of the cached values.
        if (self._global_step > self._warm_up_size):
            for i in range(self.env.elevator_num):
                self._rpm.append(self._last_observation_array[i],
                                 self._last_action[i], self._last_reward,
                                 deepcopy(state[i]), False)
        self._last_observation_array = deepcopy(state)
        self._last_action = deepcopy(action)
        self._last_reward = r

        ret_dict = {}
        if self._rpm.size() > self._warm_up_size:
            batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \
                self._rpm.sample_batch(BATCH_SIZE)
            cost = self._agent.learn(batch_obs, batch_action, batch_reward,
                                     batch_next_obs, batch_terminal)
            # Keep at most _statistic_freq most recent losses.
            self._loss_queue.appendleft(cost)
            if (len(self._loss_queue) > self._statistic_freq):
                self._loss_queue.pop()
            if (self._global_step % self._statistic_freq == 0):
                ret_dict["Temporal Difference Error(Average)"] = \
                    float(sum(self._loss_queue)) / float(len(self._loss_queue))

        return ret_dict
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# wrapper part modified from
# https://github.com/openai/gym/blob/master/gym/core.py
from rlschool import LiftSim
from wrapper_utils import obs_dim, act_dim, mansion_state_preprocessing
from wrapper_utils import action_idx_to_action
class Wrapper(LiftSim):
    """Base environment wrapper that forwards everything to ``env``.

    Caches the mansion object, its attributes, the elevator count and the
    observation/action dimensions derived from the mansion attributes.
    """

    def __init__(self, env):
        mansion = env._mansion
        attribute = mansion.attribute

        self.env = env
        self._mansion = mansion
        self.mansion_attr = attribute
        self.elevator_num = attribute.ElevatorNumber
        self.observation_space = obs_dim(attribute)
        self.action_space = act_dim(attribute)
        self.viewer = env.viewer

    def __getattr__(self, name):
        """Forward lookups of missing public attributes to the wrapped env.

        Raises:
            AttributeError: for missing names starting with an underscore,
                which are never forwarded.
        """
        if not name.startswith('_'):
            return getattr(self.env, name)
        raise AttributeError(
            "attempted to get missing private attribute '{}'".format(name))

    def seed(self, seed=None):
        """Delegate to ``env.seed``."""
        return self.env.seed(seed)

    def step(self, action):
        """Delegate to ``env.step``."""
        return self.env.step(action)

    def reset(self):
        """Delegate to ``env.reset``."""
        return self.env.reset()

    def render(self):
        """Delegate to ``env.render``."""
        return self.env.render()

    def close(self):
        """Delegate to ``env.close``."""
        return self.env.close()
class RewardWrapper(Wrapper):
    """Wrapper subclass that adds no behaviour of its own."""
class ActionWrapper(Wrapper):
    """Wrapper that converts action indices before stepping the wrapped env."""

    def reset(self):
        """Delegate to ``env.reset``."""
        return self.env.reset()

    def step(self, action):
        """Convert every index in ``action`` and step the wrapped env.

        The converted results of all indices are concatenated into one flat
        list, in order, before being passed on.
        """
        converted = [
            item for idx in action
            for item in self.action(idx, self.action_space)
        ]
        return self.env.step(converted)

    def action(self, action, action_space):
        """Map one action index through ``action_idx_to_action``."""
        return action_idx_to_action(action, action_space)
class ObservationWrapper(Wrapper):
    """Wrapper that preprocesses the mansion state into an observation."""

    def reset(self):
        """Reset the wrapped env and return the processed mansion state."""
        self.env.reset()
        raw_state = self._mansion.state
        return self.observation(raw_state)

    def step(self, action):
        """Step the wrapped env and preprocess the returned observation."""
        raw_obs, reward, done, info = self.env.step(action)
        processed = self.observation(raw_obs)
        return (processed, reward, done, info)

    def observation(self, observation):
        """Run ``mansion_state_preprocessing`` on ``observation``."""
        return mansion_state_preprocessing(observation)

    @property
    def state(self):
        """Processed view of the current mansion state."""
        raw_state = self._mansion.state
        return self.observation(raw_state)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import random
import numpy as np
from rlschool import ElevatorState, ElevatorAction
from rlschool import MansionAttribute, MansionState
from rlschool import EPSILON, HUGE
from rlschool import MansionConfig
from rlschool import MansionManager
def discretize(value, n_dim, min_val, max_val):
    """
    One-hot encode a value over n_dim evenly spaced positions.

    Values at or below min_val map to [1, 0, 0, ..., 0] and values at or
    above max_val map to [0, 0, ..., 0, 1].

    Args:
        value: the value that needs to be discretized into 1-hot format
        n_dim: number of dimensions, must be positive
        min_val: value represented by the first position
        max_val: value represented by the last position
    Returns:
        a list of length n_dim holding 1.0 at the active position and 0
        elsewhere; the single-element list [1] when n_dim is 1
    """
    assert n_dim > 0
    if n_dim == 1:
        return [1]
    step = (max_val - min_val) / float(n_dim - 1)
    # Adding 0.5 before truncating selects the nearest position.
    nearest = int((value - min_val) / step + 0.5)
    # Clamp out-of-range values onto the first or last position.
    hot_index = max(0, min(n_dim - 1, nearest))
    one_hot = [0] * n_dim
    one_hot[hot_index] = 1.0
    return one_hot
def linear_discretize(value, n_dim, min_val, max_val):
    """
    Encode a value as a linear blend of two neighbouring positions.

    Values at or below min_val map to [1, 0, 0, ..., 0] and values at or
    above max_val map to [0, 0, ..., 0, 1].
    e.g. if n_dim = 2, min_val = 1.0, max_val = 2.0
    if value = 1.5 returns [0.5, 0.5], if value = 1.8 returns [0.2, 0.8]

    Args:
        value: the value that needs to be discretized
        n_dim: number of dimensions, must be positive
        min_val: value represented by the first position
        max_val: value represented by the last position
    Returns:
        a list of length n_dim in which two adjacent entries hold the
        interpolation weights; the single-element list [1] when n_dim is 1
    """
    assert n_dim > 0
    if n_dim == 1:
        return [1]
    step = (max_val - min_val) / float(n_dim - 1)
    lower = int((value - min_val) / step + 0.5)
    # The lower index is capped at n_dim - 2 so lower + 1 stays in range.
    lower = max(0, min(n_dim - 2, lower))
    anchor = lower * step + min_val
    # The nearest position may lie above the value; step back one position
    # so the anchor is the lower neighbour (never below the first position).
    if anchor > value and anchor > min_val + 0.5 * step:
        anchor -= step
        lower -= 1
    upper_weight = min(1.0, max(0.0, (value - anchor) / step))
    blended = [0] * n_dim
    blended[lower] = 1.0 - upper_weight
    blended[lower + 1] = upper_weight
    return blended
def ele_state_preprocessing(ele_state):
    """Flatten one elevator state into a feature list usable by a network

    Args:
        ele_state: ElevatorState, namedtuple, defined in rlschool/liftsim/environment/mansion/utils.py
    Returns:
        ele_feature: list of elevator features, in this order: floor,
            velocity, door, direction, load weight, reserved target floors,
            dispatch target, dispatch target direction
    """
    n_floors = ele_state.MaximumFloor

    # floor position, blended over one slot per floor
    floor_part = linear_discretize(ele_state.Floor, n_floors, 1.0, n_floors)
    # velocity, blended over 21 slots spanning [-MaximumSpeed, MaximumSpeed]
    velocity_part = linear_discretize(ele_state.Velocity, 21,
                                      -ele_state.MaximumSpeed,
                                      ele_state.MaximumSpeed)
    # door information
    door_part = [
        ele_state.DoorState,
        float(ele_state.DoorIsOpening),
        float(ele_state.DoorIsClosing),
    ]
    # direction, one-hot over the three values -1, 0, 1
    direction_part = discretize(ele_state.Direction, 3, -1, 1)
    # load as a fraction of the maximum load, blended over 5 slots
    load_part = linear_discretize(
        ele_state.LoadWeight / ele_state.MaximumLoad, 5, 0.0, 1.0)

    # reserved target floors are 1-based, hence the "- 1"
    reserved_part = [0.0] * n_floors
    for target_floor in ele_state.ReservedTargetFloors:
        reserved_part[target_floor - 1] = 1.0

    # the dispatch target indexes a list with one extra slot
    dispatch_part = [0.0] * (n_floors + 1)
    dispatch_part[ele_state.CurrentDispatchTarget] = 1.0

    return (floor_part + velocity_part + door_part + direction_part +
            load_part + reserved_part + dispatch_part +
            [ele_state.DispatchTargetDirection])
def obs_dim(mansion_attr):
    """Calculate the observation dimension

    Args:
        mansion_attr: MansionAttribute, attribute of mansion_manager
    Returns:
        observation dimension
    """
    assert isinstance(mansion_attr, MansionAttribute)
    n_floor = mansion_attr.NumberOfFloor
    n_elevator = mansion_attr.ElevatorNumber
    # NOTE(review): 3 * floors + 34 appears to match the feature layout
    # produced by ele_state_preprocessing -- keep the two in sync.
    per_elevator_dim = n_floor * 3 + 34
    return (per_elevator_dim + 1) * n_elevator + n_floor * 2
def act_dim(mansion_attr):
    """Calculate the action dimension, which is number of floors times 2 plus 2.

    The two extra actions cover special cases: a dispatch_target of 0 stops
    the elevator at once, and a dispatch_target of -1 leaves the current
    dispatch_target unchanged. See action_idx_to_action for the mapping.

    Args:
        mansion_attr: MansionAttribute, attribute of mansion_manager
    Returns:
        action dimension
    """
    assert isinstance(mansion_attr, MansionAttribute)
    n_special_actions = 2
    return mansion_attr.NumberOfFloor * 2 + n_special_actions
def mansion_state_preprocessing(mansion_state):
    """Convert a mansion state into a float32 numpy array for networks

    One row is produced per elevator. A row holds that elevator's features,
    then the features of every other elevator in index order, then a one-hot
    elevator id, then the upward and downward hall-call flags per floor.

    Args:
        mansion_state: namedtuple of mansion state,
            defined in rlschool/liftsim/environment/mansion/utils.py
    Returns:
        the converted numpy array
    """
    elevator_states = mansion_state.ElevatorStates

    per_elevator = []
    for ele_state in elevator_states:
        per_elevator.append(ele_state_preprocessing(ele_state))
        # the last elevator's floor count sizes the hall-call vectors
        max_floor = ele_state.MaximumFloor

    # hall calls use 1-based floor numbers, hence the "- 1"
    calls_up = [0.0] * max_floor
    calls_down = [0.0] * max_floor
    for floor in mansion_state.RequiringUpwardFloors:
        calls_up[floor - 1] = 1.0
    for floor in mansion_state.RequiringDownwardFloors:
        calls_down[floor - 1] = 1.0
    hall_calls = calls_up + calls_down

    n_elevator = len(elevator_states)
    rows = []
    for idx in range(n_elevator):
        elevator_id_vec = discretize(idx + 1, n_elevator, 1, n_elevator)
        # start from a copy so the per-elevator feature lists stay untouched
        row = list(per_elevator[idx])
        for other_idx in range(n_elevator):
            if other_idx != idx:
                row.extend(per_elevator[other_idx])
        rows.append(row + elevator_id_vec + hall_calls)
    return np.asarray(rows, dtype='float32')
def action_idx_to_action(action_idx, act_dim):
    """Convert action_idx to an ElevatorAction

    With ``realdim = act_dim - 2`` the indices are laid out as:
        [0, realdim / 2)        -> floor ``action_idx + 1``, direction 1
        [realdim / 2, realdim)  -> floor ``action_idx - realdim / 2 + 1``,
                                   direction -1
        realdim                 -> ElevatorAction(0, 1)
        realdim + 1             -> ElevatorAction(-1, 1)

    Args:
        action_idx: the index needed to be converted
        act_dim: action dimension
    Returns:
        the converted namedtuple
    """
    assert isinstance(action_idx, int)
    assert isinstance(act_dim, int)
    realdim = act_dim - 2
    if action_idx == realdim:
        return ElevatorAction(0, 1)
    if action_idx == realdim + 1:
        return ElevatorAction(-1, 1)
    if action_idx < realdim / 2:
        direction = 1
        target_floor = action_idx + 1
    else:
        direction = -1
        target_floor = action_idx - int(realdim / 2) + 1
    # Return the namedtuple on every path (this branch used to return a bare
    # list), so the result matches the docstring and is accepted by
    # action_to_action_idx. It still iterates and indexes like the old list.
    return ElevatorAction(target_floor, direction)
def action_to_action_idx(action, act_dim):
    """Convert an action to its index according to act_dim.

    Args:
        action: namedtuple defined in rlschool/liftsim/environment/mansion/utils.py
        act_dim: action dimension
    Returns:
        action_idx: the result index
    """
    assert isinstance(action, ElevatorAction)
    assert isinstance(act_dim, int)
    realdim = act_dim - 2
    target_floor = action.TargetFloor
    # the two special actions occupy the last two indices
    if target_floor == 0:
        return realdim
    if target_floor < 0:
        return realdim + 1
    # negative directions use the second half of the regular index range
    offset = int(realdim / 2) if action.DirectionIndicator < 0 else 0
    return offset + target_floor - 1
...@@ -98,7 +98,7 @@ simple_world_comm<br> ...@@ -98,7 +98,7 @@ simple_world_comm<br>
+ [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) + [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle)
+ [parl](https://github.com/PaddlePaddle/PARL) + [parl](https://github.com/PaddlePaddle/PARL)
+ [multiagent-particle-envs](https://github.com/openai/multiagent-particle-envs) + [multiagent-particle-envs](https://github.com/openai/multiagent-particle-envs)
+ gym + gym==0.10.5
### Start Training: ### Start Training:
``` ```
......
...@@ -20,7 +20,7 @@ from simple_model import MAModel ...@@ -20,7 +20,7 @@ from simple_model import MAModel
from simple_agent import MAAgent from simple_agent import MAAgent
import parl import parl
from parl.env.multiagent_simple_env import MAenv from parl.env.multiagent_simple_env import MAenv
from parl.utils import logger, tensorboard from parl.utils import logger, summary
def run_episode(env, agents): def run_episode(env, agents):
...@@ -62,7 +62,7 @@ def run_episode(env, agents): ...@@ -62,7 +62,7 @@ def run_episode(env, agents):
# learn policy # learn policy
for i, agent in enumerate(agents): for i, agent in enumerate(agents):
critic_loss = agent.learn(agents) critic_loss = agent.learn(agents)
tensorboard.add_scalar('critic_loss_%d' % i, critic_loss, summary.add_scalar('critic_loss_%d' % i, critic_loss,
agent.global_train_step) agent.global_train_step)
return total_reward, agents_reward, steps return total_reward, agents_reward, steps
...@@ -155,11 +155,11 @@ def train_agent(): ...@@ -155,11 +155,11 @@ def train_agent():
format(total_steps, total_episodes, mean_episode_reward, format(total_steps, total_episodes, mean_episode_reward,
use_time)) use_time))
t_start = time.time() t_start = time.time()
tensorboard.add_scalar('mean_episode_reward/episode', summary.add_scalar('mean_episode_reward/episode',
mean_episode_reward, total_episodes) mean_episode_reward, total_episodes)
tensorboard.add_scalar('mean_episode_reward/steps', summary.add_scalar('mean_episode_reward/steps',
mean_episode_reward, total_steps) mean_episode_reward, total_steps)
tensorboard.add_scalar('use_time/1000episode', use_time, summary.add_scalar('use_time/1000episode', use_time,
total_episodes) total_episodes)
# save model # save model
......
...@@ -22,7 +22,7 @@ import numpy as np ...@@ -22,7 +22,7 @@ import numpy as np
from actor import Actor from actor import Actor
from opensim_model import OpenSimModel from opensim_model import OpenSimModel
from opensim_agent import OpenSimAgent from opensim_agent import OpenSimAgent
from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count from parl.utils import logger, ReplayMemory, summary, get_gpu_count
from parl.utils.window_stat import WindowStat from parl.utils.window_stat import WindowStat
from parl.remote.client import get_global_client from parl.remote.client import get_global_client
from parl.utils import machine_info from parl.utils import machine_info
......
...@@ -22,7 +22,7 @@ import numpy as np ...@@ -22,7 +22,7 @@ import numpy as np
from actor import Actor from actor import Actor
from opensim_model import OpenSimModel from opensim_model import OpenSimModel
from opensim_agent import OpenSimAgent from opensim_agent import OpenSimAgent
from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count from parl.utils import logger, ReplayMemory, summary, get_gpu_count
from parl.utils.window_stat import WindowStat from parl.utils.window_stat import WindowStat
from parl.remote.client import get_global_client from parl.remote.client import get_global_client
from parl.utils import machine_info from parl.utils import machine_info
...@@ -97,7 +97,7 @@ class Learner(object): ...@@ -97,7 +97,7 @@ class Learner(object):
# add lock between training and predicting # add lock between training and predicting
self.model_lock = threading.Lock() self.model_lock = threading.Lock()
# add lock when appending data to rpm or writing scalars to tensorboard # add lock when appending data to rpm or writing scalars to summary
self.memory_lock = threading.Lock() self.memory_lock = threading.Lock()
self.ready_actor_queue = queue.Queue() self.ready_actor_queue = queue.Queue()
...@@ -246,23 +246,23 @@ class Learner(object): ...@@ -246,23 +246,23 @@ class Learner(object):
episode_env_reward) episode_env_reward)
if self.env_reward_stat.count > 500: if self.env_reward_stat.count > 500:
tensorboard.add_scalar('recent_env_reward', summary.add_scalar('recent_env_reward',
self.env_reward_stat.mean, self.env_reward_stat.mean,
self.total_steps) self.total_steps)
tensorboard.add_scalar('recent_shaping_reward', summary.add_scalar('recent_shaping_reward',
self.shaping_reward_stat.mean, self.shaping_reward_stat.mean,
self.total_steps) self.total_steps)
if self.critic_loss_stat.count > 500: if self.critic_loss_stat.count > 500:
tensorboard.add_scalar('recent_critic_loss', summary.add_scalar('recent_critic_loss',
self.critic_loss_stat.mean, self.critic_loss_stat.mean,
self.total_steps) self.total_steps)
tensorboard.add_scalar('episode_length', n, self.total_steps) summary.add_scalar('episode_length', n, self.total_steps)
tensorboard.add_scalar('max_env_reward', self.max_env_reward, summary.add_scalar('max_env_reward', self.max_env_reward,
self.total_steps) self.total_steps)
tensorboard.add_scalar('ready_actor_num', summary.add_scalar('ready_actor_num',
self.ready_actor_queue.qsize(), self.ready_actor_queue.qsize(),
self.total_steps) self.total_steps)
tensorboard.add_scalar('episode_time', episode_time, summary.add_scalar('episode_time', episode_time,
self.total_steps) self.total_steps)
self.noiselevel = self.noiselevel * NOISE_DECAY self.noiselevel = self.noiselevel * NOISE_DECAY
......
...@@ -21,7 +21,7 @@ import time ...@@ -21,7 +21,7 @@ import time
import parl import parl
from mujoco_agent import MujocoAgent from mujoco_agent import MujocoAgent
from mujoco_model import ActorModel, CriticModel from mujoco_model import ActorModel, CriticModel
from parl.utils import logger, tensorboard, action_mapping, ReplayMemory from parl.utils import logger, summary, action_mapping, ReplayMemory
ACTOR_LR = 1e-3 ACTOR_LR = 1e-3
CRITIC_LR = 1e-3 CRITIC_LR = 1e-3
...@@ -111,8 +111,7 @@ def main(): ...@@ -111,8 +111,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm) train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
tensorboard.add_scalar('train/episode_reward', train_reward, summary.add_scalar('train/episode_reward', train_reward, total_steps)
total_steps)
if total_steps // args.test_every_steps >= test_flag: if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag:
...@@ -120,7 +119,7 @@ def main(): ...@@ -120,7 +119,7 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent) evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format( logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward)) total_steps, evaluate_reward))
tensorboard.add_scalar('eval/episode_reward', evaluate_reward, summary.add_scalar('eval/episode_reward', evaluate_reward,
total_steps) total_steps)
......
...@@ -19,7 +19,7 @@ import time ...@@ -19,7 +19,7 @@ import time
import parl import parl
from mujoco_agent import MujocoAgent from mujoco_agent import MujocoAgent
from mujoco_model import MujocoModel from mujoco_model import MujocoModel
from parl.utils import logger, tensorboard, action_mapping, ReplayMemory from parl.utils import logger, summary, action_mapping, ReplayMemory
MAX_EPISODES = 5000 MAX_EPISODES = 5000
ACTOR_LR = 3e-4 ACTOR_LR = 3e-4
...@@ -117,8 +117,7 @@ def main(): ...@@ -117,8 +117,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm) train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
tensorboard.add_scalar('train/episode_reward', train_reward, summary.add_scalar('train/episode_reward', train_reward, total_steps)
total_steps)
if total_steps // args.test_every_steps >= test_flag: if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag:
...@@ -126,7 +125,7 @@ def main(): ...@@ -126,7 +125,7 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent) evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format( logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward)) total_steps, evaluate_reward))
tensorboard.add_scalar('eval/episode_reward', evaluate_reward, summary.add_scalar('eval/episode_reward', evaluate_reward,
total_steps) total_steps)
......
../DQN/atari.py ../DQN_variant/atari.py
\ No newline at end of file \ No newline at end of file
../DQN/atari_wrapper.py ../DQN_variant/atari_wrapper.py
\ No newline at end of file \ No newline at end of file
...@@ -19,23 +19,16 @@ import copy ...@@ -19,23 +19,16 @@ import copy
import paddle.fluid as fluid import paddle.fluid as fluid
from parl.core.fluid.algorithm import Algorithm from parl.core.fluid.algorithm import Algorithm
from parl.core.fluid import layers from parl.core.fluid import layers
from parl.utils.deprecation import deprecated
__all__ = ['DQN'] __all__ = ['DQN']
class DQN(Algorithm): class DQN(Algorithm):
def __init__(self, def __init__(self, model, act_dim=None, gamma=None, lr=None):
model,
hyperparas=None,
act_dim=None,
gamma=None,
lr=None):
""" DQN algorithm """ DQN algorithm
Args: Args:
model (parl.Model): model defining forward network of Q function model (parl.Model): model defining forward network of Q function
hyperparas (dict): (deprecated) dict of hyper parameters.
act_dim (int): dimension of the action space act_dim (int): dimension of the action space
gamma (float): discounted factor for reward computation. gamma (float): discounted factor for reward computation.
lr (float): learning rate. lr (float): learning rate.
...@@ -43,14 +36,6 @@ class DQN(Algorithm): ...@@ -43,14 +36,6 @@ class DQN(Algorithm):
self.model = model self.model = model
self.target_model = copy.deepcopy(model) self.target_model = copy.deepcopy(model)
if hyperparas is not None:
warnings.warn(
"the `hyperparas` argument of `__init__` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.act_dim = hyperparas['action_dim']
self.gamma = hyperparas['gamma']
else:
assert isinstance(act_dim, int) assert isinstance(act_dim, int)
assert isinstance(gamma, float) assert isinstance(gamma, float)
assert isinstance(lr, float) assert isinstance(lr, float)
...@@ -100,12 +85,7 @@ class DQN(Algorithm): ...@@ -100,12 +85,7 @@ class DQN(Algorithm):
cost = layers.reduce_mean(cost) cost = layers.reduce_mean(cost)
return cost return cost
def sync_target(self, gpu_id=None): def sync_target(self):
""" sync weights of self.model to self.target_model """ sync weights of self.model to self.target_model
""" """
if gpu_id is not None:
warnings.warn(
"the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.model.sync_weights_to(self.target_model) self.model.sync_weights_to(self.target_model)
...@@ -22,7 +22,7 @@ from tqdm import tqdm ...@@ -22,7 +22,7 @@ from tqdm import tqdm
import parl import parl
import paddle.fluid as fluid import paddle.fluid as fluid
from parl.utils import get_gpu_count from parl.utils import get_gpu_count
from parl.utils import tensorboard, logger from parl.utils import summary, logger
from dqn import DQN # slight changes from parl.algorithms.DQN from dqn import DQN # slight changes from parl.algorithms.DQN
from atari_agent import AtariAgent from atari_agent import AtariAgent
...@@ -45,21 +45,21 @@ gpu_num = get_gpu_count() ...@@ -45,21 +45,21 @@ gpu_num = get_gpu_count()
def run_train_step(agent, rpm): def run_train_step(agent, rpm):
for step in range(args.train_total_steps): for step in range(args.train_total_steps):
# use the first 80% data to train # use the first 80% data to train
batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
args.batch_size * gpu_num) args.batch_size * gpu_num)
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
batch_next_state = batch_all_state[:, 1:, :, :] batch_next_obs = batch_all_obs[:, 1:, :, :]
cost = agent.learn(batch_state, batch_action, batch_reward, cost = agent.learn(batch_obs, batch_action, batch_reward,
batch_next_state, batch_isOver) batch_next_obs, batch_isOver)
if step % 100 == 0: if step % 100 == 0:
# use the last 20% data to evaluate # use the last 20% data to evaluate
batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch( batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch(
args.batch_size) args.batch_size)
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
batch_next_state = batch_all_state[:, 1:, :, :] batch_next_obs = batch_all_obs[:, 1:, :, :]
eval_cost = agent.supervised_eval(batch_state, batch_action, eval_cost = agent.supervised_eval(batch_obs, batch_action,
batch_reward, batch_next_state, batch_reward, batch_next_obs,
batch_isOver) batch_isOver)
logger.info( logger.info(
"train step {}, train costs are {}, eval cost is {}.".format( "train step {}, train costs are {}, eval cost is {}.".format(
...@@ -67,17 +67,17 @@ def run_train_step(agent, rpm): ...@@ -67,17 +67,17 @@ def run_train_step(agent, rpm):
def collect_exp(env, rpm, agent): def collect_exp(env, rpm, agent):
state = env.reset() obs = env.reset()
# collect data to fulfill replay memory # collect data to fulfill replay memory
for i in tqdm(range(MEMORY_SIZE)): for i in tqdm(range(MEMORY_SIZE)):
context = rpm.recent_state() context = rpm.recent_obs()
context.append(state) context.append(obs)
context = np.stack(context, axis=0) context = np.stack(context, axis=0)
action = agent.sample(context) action = agent.sample(context)
next_state, reward, isOver, _ = env.step(action) next_obs, reward, isOver, _ = env.step(action)
rpm.append(Experience(state, action, reward, isOver)) rpm.append(Experience(obs, action, reward, isOver))
state = next_state obs = next_obs
def main(): def main():
......
...@@ -18,18 +18,18 @@ import os ...@@ -18,18 +18,18 @@ import os
from collections import deque, namedtuple from collections import deque, namedtuple
from parl.utils import logger from parl.utils import logger
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver'])
class ReplayMemory(object): class ReplayMemory(object):
def __init__(self, def __init__(self,
max_size, max_size,
state_shape, obs_shape,
context_len, context_len,
load_file=False, load_file=False,
file_path=None): file_path=None):
self.max_size = int(max_size) self.max_size = int(max_size)
self.state_shape = state_shape self.obs_shape = obs_shape
self.context_len = int(context_len) self.context_len = int(context_len)
self.file_path = file_path self.file_path = file_path
...@@ -38,8 +38,7 @@ class ReplayMemory(object): ...@@ -38,8 +38,7 @@ class ReplayMemory(object):
self.load_memory() self.load_memory()
logger.info("memory size is {}".format(self._curr_size)) logger.info("memory size is {}".format(self._curr_size))
else: else:
self.state = np.zeros( self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8')
(self.max_size, ) + state_shape, dtype='uint8')
self.action = np.zeros((self.max_size, ), dtype='int32') self.action = np.zeros((self.max_size, ), dtype='int32')
self.reward = np.zeros((self.max_size, ), dtype='float32') self.reward = np.zeros((self.max_size, ), dtype='float32')
self.isOver = np.zeros((self.max_size, ), dtype='bool') self.isOver = np.zeros((self.max_size, ), dtype='bool')
...@@ -62,42 +61,41 @@ class ReplayMemory(object): ...@@ -62,42 +61,41 @@ class ReplayMemory(object):
else: else:
self._context.append(exp) self._context.append(exp)
def recent_state(self): def recent_obs(self):
""" maintain recent state for training""" """ maintain recent obs for training"""
lst = list(self._context) lst = list(self._context)
states = [np.zeros(self.state_shape, dtype='uint8')] * \ obs = [np.zeros(self.obs_shape, dtype='uint8')] * \
(self._context.maxlen - len(lst)) (self._context.maxlen - len(lst))
states.extend([k.state for k in lst]) obs.extend([k.obs for k in lst])
return states return obs
def sample(self, idx): def sample(self, idx):
""" return state, action, reward, isOver, """ return obs, action, reward, isOver,
note that some frames in state may be generated from last episode, note that some frames in obs may be generated from last episode,
they should be removed from state they should be removed from obs
""" """
state = np.zeros( obs = np.zeros(
(self.context_len + 1, ) + self.state_shape, dtype=np.uint8) (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8)
state_idx = np.arange(idx, obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size
idx + self.context_len + 1) % self._curr_size
# confirm that no frame was generated from last episode # confirm that no frame was generated from last episode
has_last_episode = False has_last_episode = False
for k in range(self.context_len - 2, -1, -1): for k in range(self.context_len - 2, -1, -1):
to_check_idx = state_idx[k] to_check_idx = obs_idx[k]
if self.isOver[to_check_idx]: if self.isOver[to_check_idx]:
has_last_episode = True has_last_episode = True
state_idx = state_idx[k + 1:] obs_idx = obs_idx[k + 1:]
state[k + 1:] = self.state[state_idx] obs[k + 1:] = self.obs[obs_idx]
break break
if not has_last_episode: if not has_last_episode:
state = self.state[state_idx] obs = self.obs[obs_idx]
real_idx = (idx + self.context_len - 1) % self._curr_size real_idx = (idx + self.context_len - 1) % self._curr_size
action = self.action[real_idx] action = self.action[real_idx]
reward = self.reward[real_idx] reward = self.reward[real_idx]
isOver = self.isOver[real_idx] isOver = self.isOver[real_idx]
return state, reward, action, isOver return obs, reward, action, isOver
def __len__(self): def __len__(self):
return self._curr_size return self._curr_size
...@@ -106,7 +104,7 @@ class ReplayMemory(object): ...@@ -106,7 +104,7 @@ class ReplayMemory(object):
return self._curr_size return self._curr_size
def _assign(self, pos, exp): def _assign(self, pos, exp):
self.state[pos] = exp.state self.obs[pos] = exp.obs
self.reward[pos] = exp.reward self.reward[pos] = exp.reward
self.action[pos] = exp.action self.action[pos] = exp.action
self.isOver[pos] = exp.isOver self.isOver[pos] = exp.isOver
...@@ -129,15 +127,15 @@ class ReplayMemory(object): ...@@ -129,15 +127,15 @@ class ReplayMemory(object):
return self._process_batch(batch_exp) return self._process_batch(batch_exp)
def _process_batch(self, batch_exp): def _process_batch(self, batch_exp):
state = np.asarray([e[0] for e in batch_exp], dtype='uint8') obs = np.asarray([e[0] for e in batch_exp], dtype='uint8')
reward = np.asarray([e[1] for e in batch_exp], dtype='float32') reward = np.asarray([e[1] for e in batch_exp], dtype='float32')
action = np.asarray([e[2] for e in batch_exp], dtype='int8') action = np.asarray([e[2] for e in batch_exp], dtype='int8')
isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool')
return [state, action, reward, isOver] return [obs, action, reward, isOver]
def save_memory(self): def save_memory(self):
save_data = [ save_data = [
self.state, self.reward, self.action, self.isOver, self._curr_size, self.obs, self.reward, self.action, self.isOver, self._curr_size,
self._curr_pos, self._context self._curr_pos, self._context
] ]
np.savez(self.file_path, *save_data) np.savez(self.file_path, *save_data)
...@@ -145,7 +143,7 @@ class ReplayMemory(object): ...@@ -145,7 +143,7 @@ class ReplayMemory(object):
def load_memory(self): def load_memory(self):
container = np.load(self.file_path, allow_pickle=True) container = np.load(self.file_path, allow_pickle=True)
[ [
self.state, self.reward, self.action, self.isOver, self._curr_size, self.obs, self.reward, self.action, self.isOver, self._curr_size,
self._curr_pos, self._context self._curr_pos, self._context
] = [container[key] for key in container] ] = [container[key] for key in container]
self._curr_size = self._curr_size.astype(int) self._curr_size = self._curr_size.astype(int)
......
../DQN/rom_files/ ../DQN_variant/rom_files
\ No newline at end of file \ No newline at end of file
../DQN/utils.py ../DQN_variant/utils.py
\ No newline at end of file \ No newline at end of file
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import gym
import numpy as np
class CartpoleAgent(object):
    """Linear policy whose weights are updated from reward-weighted noise.

    Each action is scored as ``w . obs + b`` and the highest-scoring action
    is chosen.
    """

    def __init__(self, obs_dim, act_dim, learning_rate):
        """Create the policy with small random weights and zero biases.

        Args:
            obs_dim: length of an observation vector
            act_dim: number of discrete actions
            learning_rate: step size applied to the estimated gradient
        """
        self.learning_rate = learning_rate
        # init weights
        self.w = np.random.random((act_dim, obs_dim)) * 0.1
        self.b = np.zeros(act_dim)
        self.weights_total_size = self.w.size + self.b.size

    def predict(self, obs):
        """Return the index of the highest-scoring action for ``obs``."""
        out = np.dot(self.w, obs) + self.b
        action = np.argmax(out)
        return action

    def learn(self, rewards, noises):
        """Move the weights along the reward-weighted average of the noises.

        Args:
            rewards: one reward per noise sample (list or numpy array)
            noises: one flat noise vector per reward, each of length
                ``weights_total_size``
        """
        # Convert once and take the size from the converted array, so plain
        # Python lists are accepted as well as numpy arrays.
        rewards = np.asarray(rewards, dtype=np.float32)
        noises = np.asarray(noises, dtype=np.float32)
        gradient = np.dot(rewards, noises)
        gradient /= rewards.size
        flat_weights = self.get_flat_weights()
        # Compute the new weights.
        new_weights = flat_weights + self.learning_rate * gradient
        self.set_flat_weights(new_weights)

    def set_flat_weights(self, flat_weights):
        """Load ``w`` and ``b`` from one flat vector (``w`` first, then ``b``)."""
        self.w = flat_weights[:self.w.size].reshape(self.w.shape)
        self.b = flat_weights[self.w.size:]

    def get_flat_weights(self):
        """Return ``w`` (flattened) followed by ``b`` as one 1-D array."""
        flat_weights = np.concatenate(([self.w.ravel(), self.b]), axis=0)
        return flat_weights
def evaluate(env, agent):
    """Run one episode with ``agent`` and return the summed reward.

    Args:
        env: environment exposing ``reset()`` and ``step(action)``, where
            ``step`` returns ``(obs, reward, done, info)``
        agent: object exposing ``predict(obs)``

    Returns:
        the sum of the rewards collected until ``done`` is truthy
    """
    total_reward = 0
    obs = env.reset()
    done = False
    while not done:
        obs, reward, done, _ = env.step(agent.predict(obs))
        total_reward += reward
    return total_reward
def reward_normalize(reward):
    """Rescale rewards linearly into the range [-0.5, 0.5].

    Args:
        reward: sequence of rewards

    Returns:
        numpy array of the same shape; all zeros when every reward is equal
    """
    reward = np.asarray(reward)
    lowest = np.min(reward)
    highest = np.max(reward)
    if highest == lowest:
        # no spread to rescale; also avoids dividing by zero
        return np.zeros(reward.shape)
    scaled = (reward - lowest) / (highest - lowest)
    scaled -= 0.5
    return scaled
if __name__ == '__main__':
    # Train a linear CartPole policy by perturbing its weights with noise.
    env = gym.make('CartPole-v0')
    agent = CartpoleAgent(obs_dim=4, act_dim=2, learning_rate=0.1)

    for epoch in range(100):
        base_weights = agent.get_flat_weights()
        sampled_rewards = []
        sampled_noises = []

        # Score 10 noisy copies of the current weights, one episode each.
        for _ in range(10):
            noise = np.random.randn(agent.weights_total_size)
            agent.set_flat_weights(base_weights + noise * 0.05)
            sampled_noises.append(noise)
            sampled_rewards.append(evaluate(env, agent))

        # Restore the unperturbed weights before applying the update.
        agent.set_flat_weights(base_weights)
        agent.learn(reward_normalize(sampled_rewards), sampled_noises)

        # evaluate
        if (epoch % 10) == 0:
            test_reward = evaluate(env, agent)
            print('Epcho {}, Test reward {}'.format(epoch, test_reward))
## 《PARL强化学习入门实践》课程示例
针对强化学习初学者,PARL提供了[入门课程](https://aistudio.baidu.com/aistudio/course/introduce/1335),展示最基础的5个强化学习算法代码示例。
## 课程大纲
+ 一、强化学习(RL)初印象
+ RL概述、入门路线
+ 实践:环境搭建([lesson1](lesson1/gridworld.py) 的代码提供了格子环境世界的渲染封装)
+ 二、基于表格型方法求解RL
+ MDP、状态价值、Q表格
+ 实践:[Sarsa](lesson2/sarsa)、[Q-learning](lesson2/q_learning)
+ 三、基于神经网络方法求解RL
+ 函数逼近方法
+ 实践:[DQN](lesson3/dqn)
+ 四、基于策略梯度求解RL
+ 策略近似、策略梯度
+ 实践:[Policy Gradient](lesson4/policy_gradient)
+ 五、连续动作空间上求解RL
+ 实战:[DDPG](lesson5/ddpg)
## 使用说明
### 安装依赖
+ [paddlepaddle==1.6.3](https://github.com/PaddlePaddle/Paddle)
+ [parl==1.3.1](https://github.com/PaddlePaddle/PARL)
+ gym
### 运行示例
进入每个示例对应的代码文件夹中,运行
```
python train.py
```
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
import turtle
import numpy as np
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
def GridWorld(gridmap=None, is_slippery=False):
    """Build a FrozenLake environment for a custom map, wrapped for drawing.

    Args:
        gridmap: list of row strings passed to gym as ``desc``
            (S = start, F = floor, H = hole, G = goal). When ``None`` a
            built-in 4x4 map is used.
        is_slippery: forwarded to the FrozenLake environment.

    Returns:
        A ``FrozenLakeWapper`` around the created environment.
    """
    if gridmap is None:
        gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
    # Forward the caller's choice: this used to be hard-coded to False, so
    # GridWorld(..., is_slippery=True) was silently ignored.
    env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=is_slippery)
    return FrozenLakeWapper(env)
class FrozenLakeWapper(gym.Wrapper):
    """Gym wrapper that draws a FrozenLake map with the ``turtle`` module.

    The wrapper reads ``desc`` (the map, indexed ``[row][col]`` and compared
    against single bytes) and ``s`` (the current state index) from the
    wrapped environment.
    """

    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.max_y = env.desc.shape[0]  # number of map rows
        self.max_x = env.desc.shape[1]  # number of map columns
        self.t = None  # turtle pen, created lazily by the first render()
        self.unit = 50  # side length of one cell in world coordinates

    def draw_box(self, x, y, fillcolor='', line_color='gray'):
        """Draw one filled cell whose lower-left corner is grid cell (x, y)."""
        self.t.up()
        self.t.goto(x * self.unit, y * self.unit)
        self.t.color(line_color)
        self.t.fillcolor(fillcolor)
        self.t.setheading(90)
        self.t.down()
        self.t.begin_fill()
        for _ in range(4):
            self.t.forward(self.unit)
            self.t.right(90)
        self.t.end_fill()

    def move_player(self, x, y):
        """Move the pen (pen up) to the centre of grid cell (x, y)."""
        self.t.up()
        self.t.setheading(90)
        self.t.fillcolor('red')
        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)

    def render(self):
        """Draw the map on the first call, then place the player marker."""
        if self.t is None:
            self.t = turtle.Turtle()
            self.wn = turtle.Screen()
            self.wn.setup(self.unit * self.max_x + 100,
                          self.unit * self.max_y + 100)
            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
                                        self.unit * self.max_y)
            self.t.shape('circle')
            self.t.width(2)
            self.t.speed(0)
            self.t.color('gray')
            for i in range(self.desc.shape[0]):
                for j in range(self.desc.shape[1]):
                    x = j
                    # Row 0 of the map is drawn at the top of the window.
                    y = self.max_y - 1 - i
                    cell = self.desc[i][j]
                    if cell == b'G':  # goal
                        self.draw_box(x, y, 'yellow')
                    elif cell == b'H':  # hole
                        self.draw_box(x, y, 'black')
                    else:  # start, frozen ice and any other cell
                        self.draw_box(x, y, 'white')
            self.t.shape('turtle')

        # The state index is row-major: s = row * max_x + column.
        x_pos = self.s % self.max_x
        y_pos = self.max_y - 1 - self.s // self.max_x
        self.move_player(x_pos, y_pos)
class CliffWalkingWapper(gym.Wrapper):
    """Gym wrapper that draws a fixed 12x4 grid with the ``turtle`` module.

    The wrapper reads ``s`` (the current state index) from the wrapped
    environment to position the player marker.
    """

    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.t = None  # turtle pen, created lazily by the first render()
        self.unit = 50  # side length of one cell in world coordinates
        self.max_x = 12  # grid width in cells
        self.max_y = 4  # grid height in cells

    def draw_x_line(self, y, x0, x1, color='gray'):
        """Draw a horizontal line at height y from x0 to x1 (x1 > x0)."""
        assert x1 > x0
        self.t.color(color)
        self.t.setheading(0)
        self.t.up()
        self.t.goto(x0, y)
        self.t.down()
        self.t.forward(x1 - x0)

    def draw_y_line(self, x, y0, y1, color='gray'):
        """Draw a vertical line at position x from y0 to y1 (y1 > y0)."""
        assert y1 > y0
        self.t.color(color)
        self.t.setheading(90)
        self.t.up()
        self.t.goto(x, y0)
        self.t.down()
        self.t.forward(y1 - y0)

    def draw_box(self, x, y, fillcolor='', line_color='gray'):
        """Draw one filled cell whose lower-left corner is grid cell (x, y)."""
        self.t.up()
        self.t.goto(x * self.unit, y * self.unit)
        self.t.color(line_color)
        self.t.fillcolor(fillcolor)
        self.t.setheading(90)
        self.t.down()
        self.t.begin_fill()
        for _ in range(4):
            self.t.forward(self.unit)
            self.t.right(90)
        self.t.end_fill()

    def move_player(self, x, y):
        """Move the pen (pen up) to the centre of grid cell (x, y)."""
        self.t.up()
        self.t.setheading(90)
        self.t.fillcolor('red')
        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)

    def render(self):
        """Draw the grid on the first call, then place the player marker."""
        if self.t is None:
            self.t = turtle.Turtle()
            self.wn = turtle.Screen()
            self.wn.setup(self.unit * self.max_x + 100,
                          self.unit * self.max_y + 100)
            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
                                        self.unit * self.max_y)
            self.t.shape('circle')
            self.t.width(2)
            self.t.speed(0)
            self.t.color('gray')
            # Outer border of the grid.
            for _ in range(2):
                self.t.forward(self.max_x * self.unit)
                self.t.left(90)
                self.t.forward(self.max_y * self.unit)
                self.t.left(90)
            # Inner grid lines.
            for i in range(1, self.max_y):
                self.draw_x_line(
                    y=i * self.unit, x0=0, x1=self.max_x * self.unit)
            for i in range(1, self.max_x):
                self.draw_y_line(
                    x=i * self.unit, y0=0, y1=self.max_y * self.unit)

            # Bottom row: black cells (presumably the cliff) between the
            # first and last column, and a yellow last cell (presumably the
            # goal).
            for i in range(1, self.max_x - 1):
                self.draw_box(i, 0, 'black')
            self.draw_box(self.max_x - 1, 0, 'yellow')
            self.t.shape('turtle')

        # The state index is row-major: s = row * max_x + column.
        x_pos = self.s % self.max_x
        y_pos = self.max_y - 1 - self.s // self.max_x
        self.move_player(x_pos, y_pos)
if __name__ == '__main__':
    # Demo: take ten random actions and render each frame.

    # Environment 1: FrozenLake; is_slippery selects whether the ice is
    # slippery. Actions: 0 left, 1 down, 2 right, 3 up
    env = gym.make("FrozenLake-v0", is_slippery=False)
    env = FrozenLakeWapper(env)

    # Environment 2: CliffWalking
    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    # env = CliffWalkingWapper(env)

    # Environment 3: custom grid world with a configurable map.
    # S = start, F = floor, H = hole, G = goal
    # gridmap = [
    #         'SFFF',
    #         'FHFF',
    #         'FFFF',
    #         'HFGF' ]
    # env = GridWorld(gridmap)

    env.reset()
    for step in range(10):
        action = np.random.randint(0, 4)
        obs, reward, done, info = env.step(action)
        print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(
            step, action, obs, reward, done, info))
        env.render()  # draw one frame
        if done:
            # gym leaves step() undefined once an episode has ended, so
            # start a new episode before the next random action.
            env.reset()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import numpy as np
class QLearningAgent(object):
    """Tabular Q-learning agent with epsilon-greedy exploration.

    Args:
        obs_n: number of discrete observations (rows of the Q table).
        act_n: number of discrete actions (columns of the Q table).
        learning_rate: step size of the Q update.
        gamma: reward discount factor.
        e_greed: probability of taking a uniformly random action in
            ``sample``.
    """

    def __init__(self,
                 obs_n,
                 act_n,
                 learning_rate=0.01,
                 gamma=0.9,
                 e_greed=0.1):
        self.act_n = act_n  # number of actions to choose from
        self.lr = learning_rate  # learning rate
        self.gamma = gamma  # reward discount factor
        self.epsilon = e_greed  # probability of a random action
        self.Q = np.zeros((obs_n, act_n))

    def sample(self, obs):
        """Return an action for ``obs`` with epsilon-greedy exploration."""
        if np.random.uniform(0, 1) < (1.0 - self.epsilon):
            # Exploit: follow the Q table.
            action = self.predict(obs)
        else:
            # Explore: pick any action uniformly at random.
            action = np.random.choice(self.act_n)
        return action

    def predict(self, obs):
        """Return a greedy action for ``obs``; ties are broken at random."""
        Q_list = self.Q[obs, :]
        maxQ = np.max(Q_list)
        # Several actions may share the maximum Q value.
        action_list = np.where(Q_list == maxQ)[0]
        action = np.random.choice(action_list)
        return action

    def learn(self, obs, action, reward, next_obs, done):
        """Apply one off-policy Q-learning update to the table.

        Args:
            obs: observation before the step, s_t.
            action: action taken in this step, a_t.
            reward: reward received for the action, r.
            next_obs: observation after the step, s_t+1.
            done: whether the episode ended with this step.
        """
        predict_Q = self.Q[obs, action]
        if done:
            target_Q = reward  # there is no next state to bootstrap from
        else:
            target_Q = reward + self.gamma * np.max(
                self.Q[next_obs, :])  # Q-learning target
        self.Q[obs, action] += self.lr * (target_Q - predict_Q)

    def save(self, npy_file='./q_table.npy'):
        """Write the Q table to ``npy_file`` with ``np.save``.

        The default path is the one ``restore`` reads by default.
        """
        np.save(npy_file, self.Q)
        print(npy_file + ' saved.')

    def restore(self, npy_file='./q_table.npy'):
        """Replace the Q table with the array stored in ``npy_file``."""
        self.Q = np.load(npy_file)
        print(npy_file + ' loaded.')
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
import turtle
import numpy as np
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
def GridWorld(gridmap=None, is_slippery=False):
    """Build a FrozenLake environment for a custom map, wrapped for drawing.

    Args:
        gridmap: list of row strings passed to gym as ``desc``
            (S = start, F = floor, H = hole, G = goal). When ``None`` a
            built-in 4x4 map is used.
        is_slippery: forwarded to the FrozenLake environment.

    Returns:
        A ``FrozenLakeWapper`` around the created environment.
    """
    if gridmap is None:
        gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
    # Forward the caller's choice: this used to be hard-coded to False, so
    # GridWorld(..., is_slippery=True) was silently ignored.
    env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=is_slippery)
    return FrozenLakeWapper(env)
class FrozenLakeWapper(gym.Wrapper):
    """Gym wrapper that draws a FrozenLake map with the ``turtle`` module.

    The wrapper reads ``desc`` (the map, indexed ``[row][col]`` and compared
    against single bytes) and ``s`` (the current state index) from the
    wrapped environment.
    """

    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.max_y = env.desc.shape[0]  # number of map rows
        self.max_x = env.desc.shape[1]  # number of map columns
        self.t = None  # turtle pen, created lazily by the first render()
        self.unit = 50  # side length of one cell in world coordinates

    def draw_box(self, x, y, fillcolor='', line_color='gray'):
        """Draw one filled cell whose lower-left corner is grid cell (x, y)."""
        self.t.up()
        self.t.goto(x * self.unit, y * self.unit)
        self.t.color(line_color)
        self.t.fillcolor(fillcolor)
        self.t.setheading(90)
        self.t.down()
        self.t.begin_fill()
        for _ in range(4):
            self.t.forward(self.unit)
            self.t.right(90)
        self.t.end_fill()

    def move_player(self, x, y):
        """Move the pen (pen up) to the centre of grid cell (x, y)."""
        self.t.up()
        self.t.setheading(90)
        self.t.fillcolor('red')
        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)

    def render(self):
        """Draw the map on the first call, then place the player marker."""
        if self.t is None:
            self.t = turtle.Turtle()
            self.wn = turtle.Screen()
            self.wn.setup(self.unit * self.max_x + 100,
                          self.unit * self.max_y + 100)
            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
                                        self.unit * self.max_y)
            self.t.shape('circle')
            self.t.width(2)
            self.t.speed(0)
            self.t.color('gray')
            for i in range(self.desc.shape[0]):
                for j in range(self.desc.shape[1]):
                    x = j
                    # Row 0 of the map is drawn at the top of the window.
                    y = self.max_y - 1 - i
                    cell = self.desc[i][j]
                    if cell == b'G':  # goal
                        self.draw_box(x, y, 'yellow')
                    elif cell == b'H':  # hole
                        self.draw_box(x, y, 'black')
                    else:  # start, frozen ice and any other cell
                        self.draw_box(x, y, 'white')
            self.t.shape('turtle')

        # The state index is row-major: s = row * max_x + column.
        x_pos = self.s % self.max_x
        y_pos = self.max_y - 1 - self.s // self.max_x
        self.move_player(x_pos, y_pos)
class CliffWalkingWapper(gym.Wrapper):
    """Gym wrapper that draws a fixed 12x4 grid with the ``turtle`` module.

    The wrapper reads ``s`` (the current state index) from the wrapped
    environment to position the player marker.
    """

    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.t = None  # turtle pen, created lazily by the first render()
        self.unit = 50  # side length of one cell in world coordinates
        self.max_x = 12  # grid width in cells
        self.max_y = 4  # grid height in cells

    def draw_x_line(self, y, x0, x1, color='gray'):
        """Draw a horizontal line at height y from x0 to x1 (x1 > x0)."""
        assert x1 > x0
        self.t.color(color)
        self.t.setheading(0)
        self.t.up()
        self.t.goto(x0, y)
        self.t.down()
        self.t.forward(x1 - x0)

    def draw_y_line(self, x, y0, y1, color='gray'):
        """Draw a vertical line at position x from y0 to y1 (y1 > y0)."""
        assert y1 > y0
        self.t.color(color)
        self.t.setheading(90)
        self.t.up()
        self.t.goto(x, y0)
        self.t.down()
        self.t.forward(y1 - y0)

    def draw_box(self, x, y, fillcolor='', line_color='gray'):
        """Draw one filled cell whose lower-left corner is grid cell (x, y)."""
        self.t.up()
        self.t.goto(x * self.unit, y * self.unit)
        self.t.color(line_color)
        self.t.fillcolor(fillcolor)
        self.t.setheading(90)
        self.t.down()
        self.t.begin_fill()
        for _ in range(4):
            self.t.forward(self.unit)
            self.t.right(90)
        self.t.end_fill()

    def move_player(self, x, y):
        """Move the pen (pen up) to the centre of grid cell (x, y)."""
        self.t.up()
        self.t.setheading(90)
        self.t.fillcolor('red')
        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)

    def render(self):
        """Draw the grid on the first call, then place the player marker."""
        if self.t is None:
            self.t = turtle.Turtle()
            self.wn = turtle.Screen()
            self.wn.setup(self.unit * self.max_x + 100,
                          self.unit * self.max_y + 100)
            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
                                        self.unit * self.max_y)
            self.t.shape('circle')
            self.t.width(2)
            self.t.speed(0)
            self.t.color('gray')
            # Outer border of the grid.
            for _ in range(2):
                self.t.forward(self.max_x * self.unit)
                self.t.left(90)
                self.t.forward(self.max_y * self.unit)
                self.t.left(90)
            # Inner grid lines.
            for i in range(1, self.max_y):
                self.draw_x_line(
                    y=i * self.unit, x0=0, x1=self.max_x * self.unit)
            for i in range(1, self.max_x):
                self.draw_y_line(
                    x=i * self.unit, y0=0, y1=self.max_y * self.unit)

            # Bottom row: black cells (presumably the cliff) between the
            # first and last column, and a yellow last cell (presumably the
            # goal).
            for i in range(1, self.max_x - 1):
                self.draw_box(i, 0, 'black')
            self.draw_box(self.max_x - 1, 0, 'yellow')
            self.t.shape('turtle')

        # The state index is row-major: s = row * max_x + column.
        x_pos = self.s % self.max_x
        y_pos = self.max_y - 1 - self.s // self.max_x
        self.move_player(x_pos, y_pos)
if __name__ == '__main__':
    # Demo: take ten random actions and print each transition.

    # Environment 1: FrozenLake; is_slippery selects whether the ice is
    # slippery. Actions: 0 left, 1 down, 2 right, 3 up
    env = gym.make("FrozenLake-v0", is_slippery=False)
    env = FrozenLakeWapper(env)

    # Environment 2: CliffWalking
    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    # env = CliffWalkingWapper(env)

    # Environment 3: custom grid world with a configurable map.
    # S = start, F = floor, H = hole, G = goal
    # gridmap = [
    #         'SFFF',
    #         'FHFF',
    #         'FFFF',
    #         'HFGF' ]
    # env = GridWorld(gridmap)

    env.reset()
    for step in range(10):
        action = np.random.randint(0, 4)
        obs, reward, done, info = env.step(action)
        print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(
            step, action, obs, reward, done, info))
        # env.render()  # draw one frame
        if done:
            # gym leaves step() undefined once an episode has ended, so
            # start a new episode before the next random action.
            env.reset()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
from gridworld import CliffWalkingWapper, FrozenLakeWapper
from agent import QLearningAgent
import time
def run_episode(env, agent, render=False):
    """Play one training episode, updating the agent after every step.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` and
            ``render()``.
        agent: agent exposing ``sample(obs)`` and
            ``learn(obs, action, reward, next_obs, done)``.
        render: when true, draw a frame after every step.

    Returns:
        Tuple ``(total_reward, total_steps)`` for the episode.
    """
    step_count = 0
    reward_sum = 0
    obs = env.reset()  # start a new episode
    finished = False
    while not finished:
        action = agent.sample(obs)  # exploratory action choice
        next_obs, reward, finished, _ = env.step(action)
        # Q-learning update from this single transition.
        agent.learn(obs, action, reward, next_obs, finished)

        obs = next_obs
        reward_sum += reward
        step_count += 1
        if render:
            env.render()
    return reward_sum, step_count
def test_episode(env, agent):
    """Play one greedy episode, rendering each step, and print its reward.

    Every step is followed by a 0.5 second pause and a rendered frame; the
    total reward is printed once the environment reports ``done``.
    """
    episode_return = 0
    obs = env.reset()
    finished = False
    while not finished:
        # Greedy action: no exploration during the test run.
        obs, reward, finished, _ = env.step(agent.predict(obs))
        episode_return += reward
        time.sleep(0.5)
        env.render()
    print('test reward = %.1f' % (episode_return))
def main():
    """Train a Q-learning agent on CliffWalking for 500 episodes, then test it."""
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)

    # Actions: 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(gym.make("CliffWalking-v0"))

    agent = QLearningAgent(
        obs_n=env.observation_space.n,
        act_n=env.action_space.n,
        learning_rate=0.1,
        gamma=0.9,
        e_greed=0.1)

    render_next = False
    for episode in range(500):
        ep_reward, ep_steps = run_episode(env, agent, render_next)
        print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
                                                          ep_reward))

        # Render once every 20 episodes. The flag is computed after the
        # episode has run, so it applies to the following episode.
        render_next = (episode % 20 == 0)

    # Training finished: watch the learned greedy policy.
    test_episode(env, agent)


if __name__ == "__main__":
    main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import numpy as np
class SarsaAgent(object):
    """Tabular Sarsa agent with epsilon-greedy exploration.

    Args:
        obs_n: number of discrete observations (rows of the Q table).
        act_n: number of discrete actions (columns of the Q table).
        learning_rate: step size of the Q update.
        gamma: reward discount factor.
        e_greed: probability of taking a uniformly random action in
            ``sample``.
    """

    def __init__(self,
                 obs_n,
                 act_n,
                 learning_rate=0.01,
                 gamma=0.9,
                 e_greed=0.1):
        self.act_n = act_n  # number of actions to choose from
        self.lr = learning_rate  # learning rate
        self.gamma = gamma  # reward discount factor
        self.epsilon = e_greed  # probability of a random action
        self.Q = np.zeros((obs_n, act_n))

    def sample(self, obs):
        """Return an action for ``obs`` with epsilon-greedy exploration."""
        if np.random.uniform(0, 1) < (1.0 - self.epsilon):
            # Exploit: follow the Q table.
            action = self.predict(obs)
        else:
            # Explore: pick any action uniformly at random.
            action = np.random.choice(self.act_n)
        return action

    def predict(self, obs):
        """Return a greedy action for ``obs``; ties are broken at random."""
        Q_list = self.Q[obs, :]
        maxQ = np.max(Q_list)
        # Several actions may share the maximum Q value.
        action_list = np.where(Q_list == maxQ)[0]
        action = np.random.choice(action_list)
        return action

    def learn(self, obs, action, reward, next_obs, next_action, done):
        """Apply one on-policy Sarsa update to the table.

        Args:
            obs: observation before the step, s_t.
            action: action taken in this step, a_t.
            reward: reward received for the action, r.
            next_obs: observation after the step, s_t+1.
            next_action: action chosen for ``next_obs``, a_t+1.
            done: whether the episode ended with this step.
        """
        predict_Q = self.Q[obs, action]
        if done:
            target_Q = reward  # there is no next state to bootstrap from
        else:
            target_Q = reward + self.gamma * self.Q[next_obs,
                                                    next_action]  # Sarsa target
        self.Q[obs, action] += self.lr * (target_Q - predict_Q)

    def save(self, npy_file='./q_table.npy'):
        """Write the Q table to ``npy_file`` with ``np.save``.

        The default path is the one ``restore`` reads by default.
        """
        np.save(npy_file, self.Q)
        print(npy_file + ' saved.')

    def restore(self, npy_file='./q_table.npy'):
        """Replace the Q table with the array stored in ``npy_file``."""
        self.Q = np.load(npy_file)
        print(npy_file + ' loaded.')
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
import turtle
import numpy as np
# turtle tutorial : https://docs.python.org/3.3/library/turtle.html
def GridWorld(gridmap=None, is_slippery=False):
    """Build a FrozenLake environment for a custom map, wrapped for drawing.

    Args:
        gridmap: list of row strings passed to gym as ``desc``
            (S = start, F = floor, H = hole, G = goal). When ``None`` a
            built-in 4x4 map is used.
        is_slippery: forwarded to the FrozenLake environment.

    Returns:
        A ``FrozenLakeWapper`` around the created environment.
    """
    if gridmap is None:
        gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG']
    # Forward the caller's choice: this used to be hard-coded to False, so
    # GridWorld(..., is_slippery=True) was silently ignored.
    env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=is_slippery)
    return FrozenLakeWapper(env)
class FrozenLakeWapper(gym.Wrapper):
    """Gym wrapper that draws a FrozenLake map with the ``turtle`` module.

    The wrapper reads ``desc`` (the map, indexed ``[row][col]`` and compared
    against single bytes) and ``s`` (the current state index) from the
    wrapped environment.
    """

    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.max_y = env.desc.shape[0]  # number of map rows
        self.max_x = env.desc.shape[1]  # number of map columns
        self.t = None  # turtle pen, created lazily by the first render()
        self.unit = 50  # side length of one cell in world coordinates

    def draw_box(self, x, y, fillcolor='', line_color='gray'):
        """Draw one filled cell whose lower-left corner is grid cell (x, y)."""
        self.t.up()
        self.t.goto(x * self.unit, y * self.unit)
        self.t.color(line_color)
        self.t.fillcolor(fillcolor)
        self.t.setheading(90)
        self.t.down()
        self.t.begin_fill()
        for _ in range(4):
            self.t.forward(self.unit)
            self.t.right(90)
        self.t.end_fill()

    def move_player(self, x, y):
        """Move the pen (pen up) to the centre of grid cell (x, y)."""
        self.t.up()
        self.t.setheading(90)
        self.t.fillcolor('red')
        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)

    def render(self):
        """Draw the map on the first call, then place the player marker."""
        if self.t is None:
            self.t = turtle.Turtle()
            self.wn = turtle.Screen()
            self.wn.setup(self.unit * self.max_x + 100,
                          self.unit * self.max_y + 100)
            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
                                        self.unit * self.max_y)
            self.t.shape('circle')
            self.t.width(2)
            self.t.speed(0)
            self.t.color('gray')
            for i in range(self.desc.shape[0]):
                for j in range(self.desc.shape[1]):
                    x = j
                    # Row 0 of the map is drawn at the top of the window.
                    y = self.max_y - 1 - i
                    cell = self.desc[i][j]
                    if cell == b'G':  # goal
                        self.draw_box(x, y, 'yellow')
                    elif cell == b'H':  # hole
                        self.draw_box(x, y, 'black')
                    else:  # start, frozen ice and any other cell
                        self.draw_box(x, y, 'white')
            self.t.shape('turtle')

        # The state index is row-major: s = row * max_x + column.
        x_pos = self.s % self.max_x
        y_pos = self.max_y - 1 - self.s // self.max_x
        self.move_player(x_pos, y_pos)
class CliffWalkingWapper(gym.Wrapper):
    """Gym wrapper that draws a fixed 12x4 grid with the ``turtle`` module.

    The wrapper reads ``s`` (the current state index) from the wrapped
    environment to position the player marker.
    """

    def __init__(self, env):
        gym.Wrapper.__init__(self, env)
        self.t = None  # turtle pen, created lazily by the first render()
        self.unit = 50  # side length of one cell in world coordinates
        self.max_x = 12  # grid width in cells
        self.max_y = 4  # grid height in cells

    def draw_x_line(self, y, x0, x1, color='gray'):
        """Draw a horizontal line at height y from x0 to x1 (x1 > x0)."""
        assert x1 > x0
        self.t.color(color)
        self.t.setheading(0)
        self.t.up()
        self.t.goto(x0, y)
        self.t.down()
        self.t.forward(x1 - x0)

    def draw_y_line(self, x, y0, y1, color='gray'):
        """Draw a vertical line at position x from y0 to y1 (y1 > y0)."""
        assert y1 > y0
        self.t.color(color)
        self.t.setheading(90)
        self.t.up()
        self.t.goto(x, y0)
        self.t.down()
        self.t.forward(y1 - y0)

    def draw_box(self, x, y, fillcolor='', line_color='gray'):
        """Draw one filled cell whose lower-left corner is grid cell (x, y)."""
        self.t.up()
        self.t.goto(x * self.unit, y * self.unit)
        self.t.color(line_color)
        self.t.fillcolor(fillcolor)
        self.t.setheading(90)
        self.t.down()
        self.t.begin_fill()
        for _ in range(4):
            self.t.forward(self.unit)
            self.t.right(90)
        self.t.end_fill()

    def move_player(self, x, y):
        """Move the pen (pen up) to the centre of grid cell (x, y)."""
        self.t.up()
        self.t.setheading(90)
        self.t.fillcolor('red')
        self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit)

    def render(self):
        """Draw the grid on the first call, then place the player marker."""
        if self.t is None:
            self.t = turtle.Turtle()
            self.wn = turtle.Screen()
            self.wn.setup(self.unit * self.max_x + 100,
                          self.unit * self.max_y + 100)
            self.wn.setworldcoordinates(0, 0, self.unit * self.max_x,
                                        self.unit * self.max_y)
            self.t.shape('circle')
            self.t.width(2)
            self.t.speed(0)
            self.t.color('gray')
            # Outer border of the grid.
            for _ in range(2):
                self.t.forward(self.max_x * self.unit)
                self.t.left(90)
                self.t.forward(self.max_y * self.unit)
                self.t.left(90)
            # Inner grid lines.
            for i in range(1, self.max_y):
                self.draw_x_line(
                    y=i * self.unit, x0=0, x1=self.max_x * self.unit)
            for i in range(1, self.max_x):
                self.draw_y_line(
                    x=i * self.unit, y0=0, y1=self.max_y * self.unit)

            # Bottom row: black cells (presumably the cliff) between the
            # first and last column, and a yellow last cell (presumably the
            # goal).
            for i in range(1, self.max_x - 1):
                self.draw_box(i, 0, 'black')
            self.draw_box(self.max_x - 1, 0, 'yellow')
            self.t.shape('turtle')

        # The state index is row-major: s = row * max_x + column.
        x_pos = self.s % self.max_x
        y_pos = self.max_y - 1 - self.s // self.max_x
        self.move_player(x_pos, y_pos)
if __name__ == '__main__':
    # Demo: take ten random actions and print each transition.

    # Environment 1: FrozenLake; is_slippery selects whether the ice is
    # slippery. Actions: 0 left, 1 down, 2 right, 3 up
    env = gym.make("FrozenLake-v0", is_slippery=False)
    env = FrozenLakeWapper(env)

    # Environment 2: CliffWalking
    # env = gym.make("CliffWalking-v0")  # 0 up, 1 right, 2 down, 3 left
    # env = CliffWalkingWapper(env)

    # Environment 3: custom grid world with a configurable map.
    # S = start, F = floor, H = hole, G = goal
    # gridmap = [
    #         'SFFF',
    #         'FHFF',
    #         'FFFF',
    #         'HFGF' ]
    # env = GridWorld(gridmap)

    env.reset()
    for step in range(10):
        action = np.random.randint(0, 4)
        obs, reward, done, info = env.step(action)
        print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(
            step, action, obs, reward, done, info))
        # env.render()  # draw one frame
        if done:
            # gym leaves step() undefined once an episode has ended, so
            # start a new episode before the next random action.
            env.reset()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# -*- coding: utf-8 -*-
import gym
from gridworld import CliffWalkingWapper, FrozenLakeWapper
from agent import SarsaAgent
import time
def run_episode(env, agent, render=False):
    """Play one training episode, updating the agent after every step.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` and
            ``render()``.
        agent: agent exposing ``sample(obs)`` and
            ``learn(obs, action, reward, next_obs, next_action, done)``.
        render: when true, draw a frame after every step.

    Returns:
        Tuple ``(total_reward, total_steps)`` for the episode.
    """
    step_count = 0
    reward_sum = 0
    obs = env.reset()  # start a new episode
    action = agent.sample(obs)  # first action of the episode
    finished = False
    while not finished:
        next_obs, reward, finished, _ = env.step(action)
        # Sarsa needs the action that will actually be taken next, so it is
        # sampled before the update and then reused for the next step.
        next_action = agent.sample(next_obs)
        agent.learn(obs, action, reward, next_obs, next_action, finished)

        obs, action = next_obs, next_action
        reward_sum += reward
        step_count += 1
        if render:
            env.render()
    return reward_sum, step_count
def test_episode(env, agent):
    """Play one greedy episode, rendering each step, and print its reward.

    Every step is followed by a 0.5 second pause and a rendered frame; the
    total reward is printed once the environment reports ``done``.
    """
    episode_return = 0
    obs = env.reset()
    finished = False
    while not finished:
        # Greedy action: no exploration during the test run.
        obs, reward, finished, _ = env.step(agent.predict(obs))
        episode_return += reward
        time.sleep(0.5)
        env.render()
    print('test reward = %.1f' % (episode_return))
def main():
    """Train a Sarsa agent on CliffWalking for 500 episodes, then test it."""
    # env = gym.make("FrozenLake-v0", is_slippery=False)  # 0 left, 1 down, 2 right, 3 up
    # env = FrozenLakeWapper(env)

    # Actions: 0 up, 1 right, 2 down, 3 left
    env = CliffWalkingWapper(gym.make("CliffWalking-v0"))

    agent = SarsaAgent(
        obs_n=env.observation_space.n,
        act_n=env.action_space.n,
        learning_rate=0.1,
        gamma=0.9,
        e_greed=0.1)

    render_next = False
    for episode in range(500):
        ep_reward, ep_steps = run_episode(env, agent, render_next)
        print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps,
                                                          ep_reward))

        # Render once every 20 episodes. The flag is computed after the
        # episode has run, so it applies to the following episode.
        render_next = (episode % 20 == 0)

    # Training finished: watch the learned greedy policy.
    test_episode(env, agent)


if __name__ == "__main__":
    main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import numpy as np
import paddle.fluid as fluid
import parl
from parl import layers
class Agent(parl.Agent):
    """DQN agent: epsilon-greedy acting plus periodic target-network sync.

    Args:
        algorithm: algorithm object handed to ``parl.Agent``; it must provide
            ``predict``, ``learn`` and ``sync_target``.
        obs_dim (int): size of one observation vector.
        act_dim (int): number of discrete actions.
        e_greed (float): probability of taking a random action in ``sample``.
        e_greed_decrement (float): amount subtracted from ``e_greed`` on
            every ``sample`` call (floored at 0.01).
    """

    def __init__(self,
                 algorithm,
                 obs_dim,
                 act_dim,
                 e_greed=0.1,
                 e_greed_decrement=0):
        assert isinstance(obs_dim, int)
        assert isinstance(act_dim, int)
        # NOTE(review): presumably parl.Agent.__init__ triggers
        # build_program(), which reads obs_dim -- keep these assignments
        # ahead of the super() call.
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(Agent, self).__init__(algorithm)

        self.global_step = 0
        # Copy the model weights into the target model every 200 learn() calls.
        self.update_target_steps = 200

        self.e_greed = e_greed  # exploration probability
        self.e_greed_decrement = e_greed_decrement  # decay applied per sample()

    def build_program(self):
        """Create the fluid programs used for prediction and for learning."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        # Prediction graph: observation in, Q values out.
        with fluid.program_guard(self.pred_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.value = self.alg.predict(obs)

        # Learning graph: one batch of transitions in, training cost out.
        with fluid.program_guard(self.learn_program):
            obs = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            action = layers.data(name='act', shape=[1], dtype='int32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs', shape=[self.obs_dim], dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            self.cost = self.alg.learn(obs, action, reward, next_obs, terminal)

    def sample(self, obs):
        """Return an epsilon-greedy action and decay the exploration rate."""
        if np.random.rand() < self.e_greed:
            # Explore: every action can be chosen.
            act = np.random.randint(self.act_dim)
        else:
            # Exploit: take the action with the highest predicted Q value.
            act = self.predict(obs)
        # Explore less as training proceeds, but never below 1%.
        self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement)
        return act

    def predict(self, obs):
        """Return the index of the action with the highest predicted Q value."""
        batched_obs = np.expand_dims(obs, axis=0)
        fetched = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batched_obs.astype('float32')},
            fetch_list=[self.value])
        q_values = np.squeeze(fetched[0], axis=0)
        return np.argmax(q_values)

    def learn(self, obs, act, reward, next_obs, terminal):
        """Run one training step on a batch and return the fetched cost."""
        # Sync model -> target model every `update_target_steps` calls,
        # including the very first one.
        if self.global_step % self.update_target_steps == 0:
            self.alg.sync_target()
        self.global_step += 1

        act = np.expand_dims(act, -1)
        feed = {
            'obs': obs.astype('float32'),
            'act': act.astype('int32'),
            'reward': reward,
            'next_obs': next_obs.astype('float32'),
            'terminal': terminal
        }
        fetched = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])
        return fetched[0]
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import copy
import paddle.fluid as fluid
import parl
from parl import layers
class DQN(parl.Algorithm):
    """DQN algorithm: Q network plus a deep-copied target network."""

    def __init__(self, model, act_dim=None, gamma=None, lr=None):
        """Set up the algorithm.

        Args:
            model (parl.Model): forward network defining the Q function.
            act_dim (int): number of discrete actions.
            gamma (float): reward discount factor.
            lr (float): learning rate.
        """
        self.model = model
        self.target_model = copy.deepcopy(model)

        assert isinstance(act_dim, int)
        assert isinstance(gamma, float)
        assert isinstance(lr, float)
        self.act_dim = act_dim
        self.gamma = gamma
        self.lr = lr

    def predict(self, obs):
        """Return the model's Q values [Q(s,a1), Q(s,a2), ...] for ``obs``."""
        return self.model.value(obs)

    def learn(self, obs, action, reward, next_obs, terminal):
        """Build the DQN update for ``self.model`` and return its cost."""
        # max Q' comes from the target model and is treated as a constant.
        next_q = self.target_model.value(next_obs)
        max_next_q = layers.reduce_max(next_q, dim=1)
        max_next_q.stop_gradient = True  # no gradient through the target

        terminal_f = layers.cast(terminal, dtype='float32')
        # Terminal transitions contribute only their immediate reward.
        target = reward + (1.0 - terminal_f) * self.gamma * max_next_q

        q_all = self.model.value(obs)  # predicted Q for every action
        # One-hot encode the action, e.g. 3 => [0,0,0,1,0].
        one_hot_action = layers.one_hot(action, self.act_dim)
        one_hot_action = layers.cast(one_hot_action, dtype='float32')
        # Element-wise product followed by a row sum picks out Q(s,a):
        #   q_all = [[2.3, 5.7, 1.2, 3.9, 1.4]], one_hot = [[0,0,0,1,0]]
        #   ==> q_taken = [[3.9]]
        q_taken = layers.reduce_sum(
            layers.elementwise_mul(one_hot_action, q_all), dim=1)

        # Mean squared error between Q(s,a) and the target.
        cost = layers.square_error_cost(q_taken, target)
        cost = layers.reduce_mean(cost)
        optimizer = fluid.optimizer.Adam(learning_rate=self.lr)
        optimizer.minimize(cost)
        return cost

    def sync_target(self):
        """Copy the parameters of ``self.model`` into ``self.target_model``."""
        self.model.sync_weights_to(self.target_model)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import parl
from parl import layers # 封装了 paddle.fluid.layers 的API
class Model(parl.Model):
    """Q network: three fully connected layers (128, 128, act_dim)."""

    def __init__(self, act_dim):
        hidden_sizes = (128, 128)
        # Two ReLU hidden layers followed by a linear output layer with one
        # unit per action.
        self.fc1 = layers.fc(size=hidden_sizes[0], act='relu')
        self.fc2 = layers.fc(size=hidden_sizes[1], act='relu')
        self.fc3 = layers.fc(size=act_dim, act=None)

    def value(self, obs):
        """Return the Q values the network predicts for ``obs``."""
        hidden = self.fc2(self.fc1(obs))
        return self.fc3(hidden)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py
import random
import collections
import numpy as np
class ReplayMemory(object):
    """Fixed-capacity experience buffer with uniform random sampling."""

    def __init__(self, max_size):
        # A bounded deque drops the oldest experience once it is full.
        self.buffer = collections.deque(maxlen=max_size)

    def append(self, exp):
        """Store one ``(obs, action, reward, next_obs, done)`` tuple."""
        self.buffer.append(exp)

    def sample(self, batch_size):
        """Draw ``batch_size`` experiences without replacement.

        Returns:
            Tuple of five float32 arrays: observations, actions, rewards,
            next observations and done flags, each with one entry per
            sampled experience.
        """
        mini_batch = random.sample(self.buffer, batch_size)
        columns = ([], [], [], [], [])
        for obs, action, reward, next_obs, done in mini_batch:
            fields = (obs, action, reward, next_obs, done)
            for column, value in zip(columns, fields):
                column.append(value)
        return tuple(np.array(column).astype('float32') for column in columns)

    def __len__(self):
        return len(self.buffer)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import os
import gym
import numpy as np
import parl
from parl.utils import logger # 日志打印工具
from model import Model
from algorithm import DQN # from parl.algorithms import DQN # parl >= 1.3.1
from agent import Agent
from replay_memory import ReplayMemory
LEARN_FREQ = 5 # 训练频率,不需要每一个step都learn,攒一些新增经验后再learn,提高效率
MEMORY_SIZE = 20000 # replay memory的大小,越大越占用内存
MEMORY_WARMUP_SIZE = 200 # replay_memory 里需要预存一些经验数据,再从里面sample一个batch的经验让agent去learn
BATCH_SIZE = 32 # 每次给agent learn的数据数量,从replay memory随机里sample一批数据出来
LEARNING_RATE = 0.001 # 学习率
GAMMA = 0.99 # reward 的衰减因子,一般取 0.9 到 0.999 不等
def run_episode(env, agent, rpm):
    """Play one training episode, storing transitions and learning in batches.

    Every transition is appended to ``rpm``. Once the buffer holds more than
    MEMORY_WARMUP_SIZE experiences, the agent learns from a batch of
    BATCH_SIZE samples on every LEARN_FREQ-th step of the episode.

    Returns:
        The sum of rewards collected in the episode.
    """
    episode_return = 0
    obs = env.reset()
    step = 0
    finished = False
    while not finished:
        step += 1
        action = agent.sample(obs)  # exploratory action choice
        next_obs, reward, finished, _ = env.step(action)
        rpm.append((obs, action, reward, next_obs, finished))

        # Train only after warm-up, and only every LEARN_FREQ steps.
        warmed_up = len(rpm) > MEMORY_WARMUP_SIZE
        if warmed_up and (step % LEARN_FREQ == 0):
            # Batch order: s, a, r, s', done
            (batch_obs, batch_action, batch_reward, batch_next_obs,
             batch_done) = rpm.sample(BATCH_SIZE)
            agent.learn(batch_obs, batch_action, batch_reward,
                        batch_next_obs, batch_done)

        episode_return += reward
        obs = next_obs
    return episode_return
def evaluate(env, agent, render=False, eval_episodes=5):
    """Run greedy evaluation episodes and return their mean total reward.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` and, when
            ``render`` is True, ``render()``.
        agent: object whose ``predict(obs)`` returns the action to take
            (the best action only, no exploration).
        render (bool): call ``env.render()`` after every step when True.
        eval_episodes (int): number of episodes to average over. The
            default of 5 reproduces the previously hard-coded behaviour.

    Returns:
        Mean of the per-episode reward sums, as computed by ``np.mean``.

    Raises:
        ValueError: if ``eval_episodes`` is smaller than 1 (the mean of
            zero episodes is undefined).
    """
    if eval_episodes < 1:
        raise ValueError(
            'eval_episodes must be >= 1, got {}'.format(eval_episodes))
    episode_rewards = []
    for _ in range(eval_episodes):
        obs = env.reset()
        episode_reward = 0
        done = False
        while not done:
            action = agent.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
        episode_rewards.append(episode_reward)
    return np.mean(episode_rewards)
def main():
    """Train a DQN agent on CartPole-v0, evaluating every 50 episodes."""
    # CartPole-v0: expected reward > 180; MountainCar-v0: expected reward > -120
    env = gym.make('CartPole-v0')
    n_actions = env.action_space.n  # CartPole-v0: 2
    obs_shape = env.observation_space.shape  # CartPole-v0: (4,)

    # Experience replay pool used by DQN.
    replay = ReplayMemory(MEMORY_SIZE)

    # Assemble model -> algorithm -> agent with the PARL framework.
    net = Model(act_dim=n_actions)
    dqn = DQN(net, act_dim=n_actions, gamma=GAMMA, lr=LEARNING_RATE)
    agent = Agent(
        dqn,
        obs_dim=obs_shape[0],
        act_dim=n_actions,
        e_greed=0.1,  # probability of taking a random action (exploration)
        e_greed_decrement=1e-6)  # exploration shrinks as training goes on

    # To resume from a checkpoint instead of training from scratch:
    # agent.restore('./dqn_model.ckpt')

    # Pre-fill the replay pool so the first batches are diverse enough.
    while len(replay) < MEMORY_WARMUP_SIZE:
        run_episode(env, agent, replay)

    max_episode = 2000
    episode = 0
    # Evaluation episodes are not counted towards max_episode.
    while episode < max_episode:
        # Training phase: 50 episodes.
        for _ in range(0, 50):
            run_episode(env, agent, replay)
            episode += 1

        # Test phase; render=True displays the environment.
        eval_reward = evaluate(env, agent, render=True)
        logger.info('episode:{}    e_greed:{}   Test reward:{}'.format(
            episode, agent.e_greed, eval_reward))

    # Training finished: save the model.
    agent.save('./dqn_model.ckpt')


if __name__ == '__main__':
    main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import numpy as np
import paddle.fluid as fluid
import parl
from parl import layers
class Agent(parl.Agent):
    """Policy-gradient agent holding one program for acting, one for learning."""

    def __init__(self, algorithm, obs_dim, act_dim):
        # Stored before the base constructor runs; presumably that is where
        # build_program() gets invoked -- TODO confirm against parl.Agent.
        self.obs_dim = obs_dim
        self.act_dim = act_dim
        super(Agent, self).__init__(algorithm)

    def build_program(self):
        """Define the prediction and learning programs and their inputs."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        # Graph that maps an observation to action probabilities.
        with fluid.program_guard(self.pred_program):
            obs_in = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.act_prob = self.alg.predict(obs_in)

        # Graph that updates the policy network.
        with fluid.program_guard(self.learn_program):
            obs_in = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act_in = layers.data(name='act', shape=[1], dtype='int64')
            reward_in = layers.data(name='reward', shape=[], dtype='float32')
            self.cost = self.alg.learn(obs_in, act_in, reward_in)

    def _action_probs(self, obs):
        """Run the prediction program on a single observation."""
        batch = np.expand_dims(obs, axis=0)  # add a leading batch axis
        fetched = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batch.astype('float32')},
            fetch_list=[self.act_prob])
        return np.squeeze(fetched[0], axis=0)  # drop the batch axis again

    def sample(self, obs):
        """Pick an action at random, weighted by the predicted probabilities."""
        probs = self._action_probs(obs)
        return np.random.choice(range(self.act_dim), p=probs)

    def predict(self, obs):
        """Pick the action with the highest predicted probability."""
        return np.argmax(self._action_probs(obs))

    def learn(self, obs, act, reward):
        """Run one update of the learn program and return the fetched cost."""
        act_column = np.expand_dims(act, axis=-1)
        inputs = {
            'obs': obs.astype('float32'),
            'act': act_column.astype('int64'),
            'reward': reward.astype('float32')
        }
        fetched = self.fluid_executor.run(
            self.learn_program, feed=inputs, fetch_list=[self.cost])
        return fetched[0]
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import paddle.fluid as fluid
import parl
from parl import layers
class PolicyGradient(parl.Algorithm):
    def __init__(self, model, lr=None):
        """Policy Gradient algorithm.

        Args:
            model (parl.Model): forward network of the policy.
            lr (float): learning rate; must be a float despite the ``None``
                default.
        """
        self.model = model
        assert isinstance(lr, float)
        self.lr = lr

    def predict(self, obs):
        """Return the policy model's action probabilities for ``obs``."""
        return self.model(obs)

    def learn(self, obs, action, reward):
        """Update the policy model with the policy-gradient loss.

        The loss is the mean over the batch of
        ``-log(prob of the taken action) * reward``.
        """
        probs = self.model(obs)  # action probabilities
        neg_log_probs = -1.0 * layers.log(probs)
        # One-hot mask keeps only the entry of the action actually taken.
        taken = layers.one_hot(action, probs.shape[1])
        neg_log_likelihood = layers.reduce_sum(neg_log_probs * taken, dim=1)
        cost = layers.reduce_mean(neg_log_likelihood * reward)

        fluid.optimizer.Adam(self.lr).minimize(cost)
        return cost
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -12,13 +12,21 @@ ...@@ -12,13 +12,21 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import warnings #-*- coding: utf-8 -*-
warnings.simplefilter('default') import parl
from parl import layers
warnings.warn(
"module `parl.framework.agent_base.Agent` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.Agent` instead.",
DeprecationWarning,
stacklevel=2)
from parl.core.fluid.agent import * class Model(parl.Model):
def __init__(self, act_dim):
    """Create the two fully connected layers of the policy network.

    Args:
        act_dim (int): size of the output layer; the hidden layer is
            ten times as wide.
    """
    hid1_size = act_dim * 10
    self.fc1 = layers.fc(size=hid1_size, act='tanh')
    self.fc2 = layers.fc(size=act_dim, act='softmax')
def forward(self, obs):
    """Map observations to the softmax layer's output.

    Can be invoked directly through the model object, e.g.
    ``model = Model(5); model(obs)``.
    """
    hidden = self.fc1(obs)
    return self.fc2(hidden)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import os
import gym
import numpy as np
import parl
from agent import Agent
from model import Model
from algorithm import PolicyGradient # from parl.algorithms import PolicyGradient
from parl.utils import logger
LEARNING_RATE = 1e-3  # learning rate passed to PolicyGradient in main()
def run_episode(env, agent):
    """Roll out one complete episode using sampled actions.

    Returns:
        tuple: three equally long lists ``(obs_list, action_list,
        reward_list)``; entry ``t`` holds the observation seen, the action
        taken and the reward received at step ``t``.
    """
    observations, actions, rewards = [], [], []
    current = env.reset()
    finished = False
    while not finished:
        observations.append(current)
        chosen = agent.sample(current)
        actions.append(chosen)
        current, reward, finished, _info = env.step(chosen)
        rewards.append(reward)
    return observations, actions, rewards
def evaluate(env, agent, render=False, eval_episodes=5):
    """Run greedy evaluation episodes and return their mean total reward.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` and, when
            ``render`` is True, ``render()``.
        agent: object whose ``predict(obs)`` returns the action to take.
        render (bool): call ``env.render()`` after every step when True.
        eval_episodes (int): number of episodes to average over. The
            default of 5 reproduces the previously hard-coded behaviour.

    Returns:
        Mean of the per-episode reward sums, as computed by ``np.mean``.

    Raises:
        ValueError: if ``eval_episodes`` is smaller than 1 (the mean of
            zero episodes is undefined).
    """
    if eval_episodes < 1:
        raise ValueError(
            'eval_episodes must be >= 1, got {}'.format(eval_episodes))
    episode_rewards = []
    for _ in range(eval_episodes):
        obs = env.reset()
        episode_reward = 0
        done = False
        while not done:
            action = agent.predict(obs)
            obs, reward, done, _ = env.step(action)
            episode_reward += reward
            if render:
                env.render()
        episode_rewards.append(episode_reward)
    return np.mean(episode_rewards)
def calc_reward_to_go(reward_list, gamma=1.0):
    """Convert per-step rewards into discounted reward-to-go values.

    Uses the backward recursion ``G_t = r_t + gamma * G_{t+1}``, with the
    last step's value equal to its own reward.

    Args:
        reward_list: sequence of per-step rewards of one episode. It is
            copied, so the caller's sequence is left unchanged.
        gamma (float): discount factor applied to later rewards.

    Returns:
        np.ndarray: one reward-to-go value per step, same length as
        ``reward_list``.
    """
    # Work on a copy: accumulating in place would silently overwrite the
    # caller's raw rewards.
    returns = list(reward_list)
    for i in range(len(returns) - 2, -1, -1):
        returns[i] += gamma * returns[i + 1]
    return np.array(returns)
def main():
    """Train a policy-gradient agent on CartPole-v0 for 1000 episodes."""
    env = gym.make('CartPole-v0')
    # env = env.unwrapped  # Cancel the minimum score limit
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.n
    logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim))

    # Assemble model -> algorithm -> agent with the PARL framework.
    policy_net = Model(act_dim=act_dim)
    pg = PolicyGradient(policy_net, lr=LEARNING_RATE)
    agent = Agent(pg, obs_dim=obs_dim, act_dim=act_dim)

    # To resume from a checkpoint, call agent.restore('./model.ckpt') here.

    for episode in range(1000):
        obs_list, action_list, reward_list = run_episode(env, agent)
        if episode % 10 == 0:
            logger.info("Episode {}, Reward Sum {}.".format(
                episode, sum(reward_list)))

        # One learning step per episode, on the whole trajectory.
        batch_obs = np.array(obs_list)
        batch_action = np.array(action_list)
        batch_reward = calc_reward_to_go(reward_list)
        agent.learn(batch_obs, batch_action, batch_reward)

        if (episode + 1) % 100 == 0:
            test_reward = evaluate(env, agent, render=True)
            logger.info('Test reward: {}'.format(test_reward))

    # save the parameters to ./model.ckpt
    agent.save('./model.ckpt')


if __name__ == '__main__':
    main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import numpy as np
import parl
from parl import layers
from paddle import fluid
class Agent(parl.Agent):
    """DDPG agent holding one program for acting and one for learning."""

    def __init__(self, algorithm, obs_dim, act_dim):
        for dim in (obs_dim, act_dim):
            assert isinstance(dim, int)
        self.obs_dim, self.act_dim = obs_dim, act_dim
        super(Agent, self).__init__(algorithm)

        # Note: synchronise model and target_model parameters up front.
        # decay=0 is presumably a full copy -- confirm in sync_weights_to.
        self.alg.sync_target(decay=0)

    def build_program(self):
        """Define the prediction and learning programs and their inputs."""
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            obs_in = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            self.pred_act = self.alg.predict(obs_in)

        with fluid.program_guard(self.learn_program):
            obs_in = layers.data(
                name='obs', shape=[self.obs_dim], dtype='float32')
            act_in = layers.data(
                name='act', shape=[self.act_dim], dtype='float32')
            reward_in = layers.data(name='reward', shape=[], dtype='float32')
            next_obs_in = layers.data(
                name='next_obs', shape=[self.obs_dim], dtype='float32')
            terminal_in = layers.data(name='terminal', shape=[], dtype='bool')
            # Only the critic cost is kept; the actor cost is discarded.
            _, self.critic_cost = self.alg.learn(
                obs_in, act_in, reward_in, next_obs_in, terminal_in)

    def predict(self, obs):
        """Run the prediction program on ``obs`` and return the action.

        A leading axis is added to ``obs`` before the run, and all
        size-one axes are squeezed out of the fetched action.
        """
        batched = np.expand_dims(obs, axis=0)
        fetched = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batched},
            fetch_list=[self.pred_act])
        return np.squeeze(fetched[0])

    def learn(self, obs, act, reward, next_obs, terminal):
        """Run one learn-program step, then update the target model.

        Returns:
            The fetched critic cost.
        """
        inputs = {
            'obs': obs,
            'act': act,
            'reward': reward,
            'next_obs': next_obs,
            'terminal': terminal
        }
        fetched = self.fluid_executor.run(
            self.learn_program, feed=inputs, fetch_list=[self.critic_cost])
        critic_cost = fetched[0]
        self.alg.sync_target()
        return critic_cost
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import parl
from parl import layers
from copy import deepcopy
from paddle import fluid
class DDPG(parl.Algorithm):
    def __init__(self,
                 model,
                 gamma=None,
                 tau=None,
                 actor_lr=None,
                 critic_lr=None):
        """DDPG algorithm.

        Args:
            model (parl.Model): forward network of actor and critic;
                it must implement get_actor_params().
            gamma (float): reward discount factor.
            tau (float): soft-update coefficient used when syncing
                self.target_model with self.model.
            actor_lr (float): learning rate of the actor.
            critic_lr (float): learning rate of the critic.
        """
        # All four hyper-parameters are required floats despite the
        # None defaults.
        for hyper_param in (gamma, tau, actor_lr, critic_lr):
            assert isinstance(hyper_param, float)
        self.gamma, self.tau = gamma, tau
        self.actor_lr, self.critic_lr = actor_lr, critic_lr

        self.model = model
        self.target_model = deepcopy(model)

    def predict(self, obs):
        """Predict the action with the actor part of self.model."""
        return self.model.policy(obs)

    def learn(self, obs, action, reward, next_obs, terminal):
        """Update actor and critic; return ``(actor_cost, critic_cost)``."""
        actor_cost = self._actor_learn(obs)
        critic_cost = self._critic_learn(obs, action, reward, next_obs,
                                         terminal)
        return actor_cost, critic_cost

    def _actor_learn(self, obs):
        """Minimise the negated mean Q of the actor's own actions."""
        proposed = self.model.policy(obs)
        q_value = self.model.value(obs, proposed)
        cost = layers.reduce_mean(-1.0 * q_value)
        actor_opt = fluid.optimizer.AdamOptimizer(self.actor_lr)
        # Restrict this optimizer to the actor's parameters.
        actor_opt.minimize(
            cost, parameter_list=self.model.get_actor_params())
        return cost

    def _critic_learn(self, obs, action, reward, next_obs, terminal):
        """Regress Q(obs, action) onto the target-network TD target."""
        next_action = self.target_model.policy(next_obs)
        next_q = self.target_model.value(next_obs, next_action)

        # Terminal transitions contribute no bootstrapped value.
        terminal = layers.cast(terminal, dtype='float32')
        td_target = reward + (1.0 - terminal) * self.gamma * next_q
        td_target.stop_gradient = True

        q_value = self.model.value(obs, action)
        cost = layers.reduce_mean(
            layers.square_error_cost(q_value, td_target))
        fluid.optimizer.AdamOptimizer(self.critic_lr).minimize(cost)
        return cost

    def sync_target(self, decay=None, share_vars_parallel_executor=None):
        """Copy parameters from self.model into self.target_model.

        When ``decay`` is None it defaults to ``1.0 - self.tau``.
        """
        effective_decay = 1.0 - self.tau if decay is None else decay
        self.model.sync_weights_to(
            self.target_model,
            decay=effective_decay,
            share_vars_parallel_executor=share_vars_parallel_executor)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
"""
Classic cart-pole system implemented by Rich Sutton et al.
Copied from http://incompleteideas.net/sutton/book/code/pole.c
permalink: https://perma.cc/C9ZM-652R
Continuous version by Ian Danforth
"""
import math
import gym
from gym import spaces, logger
from gym.utils import seeding
import numpy as np
class ContinuousCartPoleEnv(gym.Env):
    """Cart-pole balancing task with a continuous action in [-1, 1].

    The action is scaled by ``force_mag`` to give the horizontal force
    applied to the cart. The observation is the 4-vector
    ``(x, x_dot, theta, theta_dot)``. An episode is done once the cart
    leaves ``[-x_threshold, x_threshold]`` or the pole angle leaves
    ``[-theta_threshold_radians, theta_threshold_radians]``.
    """

    metadata = {
        'render.modes': ['human', 'rgb_array'],
        'video.frames_per_second': 50
    }

    def __init__(self):
        self.gravity = 9.8
        self.masscart = 1.0
        self.masspole = 0.1
        self.total_mass = (self.masspole + self.masscart)
        self.length = 0.5  # actually half the pole's length
        self.polemass_length = (self.masspole * self.length)
        self.force_mag = 30.0
        self.tau = 0.02  # seconds between state updates
        self.min_action = -1.0
        self.max_action = 1.0

        # Angle at which to fail the episode
        self.theta_threshold_radians = 12 * 2 * math.pi / 360
        self.x_threshold = 2.4

        # Angle limit set to 2 * theta_threshold_radians so failing observation
        # is still within bounds
        high = np.array([
            self.x_threshold * 2,
            np.finfo(np.float32).max, self.theta_threshold_radians * 2,
            np.finfo(np.float32).max
        ])

        self.action_space = spaces.Box(
            low=self.min_action, high=self.max_action, shape=(1, ))
        self.observation_space = spaces.Box(-high, high)

        self.seed()
        self.viewer = None
        self.state = None

        # None while the episode is running; counts steps taken after done.
        self.steps_beyond_done = None

    def seed(self, seed=None):
        """(Re)create the environment's random generator; return ``[seed]``."""
        self.np_random, seed = seeding.np_random(seed)
        return [seed]

    def stepPhysics(self, force):
        """Advance ``self.state`` by one explicit Euler step of ``self.tau``.

        Args:
            force (float): horizontal force applied to the cart.

        Returns:
            tuple: the new ``(x, x_dot, theta, theta_dot)``; ``self.state``
            itself is not modified here.
        """
        x, x_dot, theta, theta_dot = self.state
        costheta = math.cos(theta)
        sintheta = math.sin(theta)
        temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta
                ) / self.total_mass
        thetaacc = (self.gravity * sintheta - costheta * temp) / \
            (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass))
        xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass
        # Positions are advanced with the old velocities, velocities with
        # the accelerations computed from the old state.
        x = x + self.tau * x_dot
        x_dot = x_dot + self.tau * xacc
        theta = theta + self.tau * theta_dot
        theta_dot = theta_dot + self.tau * thetaacc
        return (x, x_dot, theta, theta_dot)

    def step(self, action):
        """Apply ``action`` for one time step.

        Args:
            action: scalar-like value; after a leading axis is added it
                must be contained in ``self.action_space``.

        Returns:
            tuple: ``(observation, reward, done, info)``. The reward is 1.0
            on every step up to and including the one that ends the
            episode, and 0.0 on any step taken after that.
        """
        action = np.expand_dims(action, 0)
        assert self.action_space.contains(action), \
            "%r (%s) invalid" % (action, type(action))
        # Cast action to float to strip np trappings. ``item()`` is used
        # because calling float() on an array with ndim > 0 is deprecated
        # since NumPy 1.25.
        force = self.force_mag * float(action.item())
        self.state = self.stepPhysics(force)
        x, x_dot, theta, theta_dot = self.state
        done = x < -self.x_threshold \
            or x > self.x_threshold \
            or theta < -self.theta_threshold_radians \
            or theta > self.theta_threshold_radians
        done = bool(done)

        if not done:
            reward = 1.0
        elif self.steps_beyond_done is None:
            # Pole just fell!
            self.steps_beyond_done = 0
            reward = 1.0
        else:
            # Warn only on the first step taken after the episode ended.
            if self.steps_beyond_done == 0:
                logger.warn("""
You are calling 'step()' even though this environment has already returned
done = True. You should always call 'reset()' once you receive 'done = True'
Any further steps are undefined behavior.
""")
            self.steps_beyond_done += 1
            reward = 0.0

        return np.array(self.state), reward, done, {}

    def reset(self):
        """Start a new episode from a small random state; return it."""
        self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4, ))
        self.steps_beyond_done = None
        return np.array(self.state)

    def render(self, mode='human'):
        """Draw the cart and pole; the viewer is created on first use.

        Returns:
            ``None`` when no state exists yet, otherwise whatever
            ``self.viewer.render`` returns (an RGB array is requested when
            ``mode == 'rgb_array'``).
        """
        screen_width = 600
        screen_height = 400

        world_width = self.x_threshold * 2
        scale = screen_width / world_width
        carty = 100  # TOP OF CART
        polewidth = 10.0
        polelen = scale * 1.0
        cartwidth = 50.0
        cartheight = 30.0

        if self.viewer is None:
            # Imported lazily so the module loads without a display.
            from gym.envs.classic_control import rendering
            self.viewer = rendering.Viewer(screen_width, screen_height)
            l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2
            axleoffset = cartheight / 4.0
            cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            self.carttrans = rendering.Transform()
            cart.add_attr(self.carttrans)
            self.viewer.add_geom(cart)
            l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2
            pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)])
            pole.set_color(.8, .6, .4)
            self.poletrans = rendering.Transform(translation=(0, axleoffset))
            pole.add_attr(self.poletrans)
            pole.add_attr(self.carttrans)
            self.viewer.add_geom(pole)
            self.axle = rendering.make_circle(polewidth / 2)
            self.axle.add_attr(self.poletrans)
            self.axle.add_attr(self.carttrans)
            self.axle.set_color(.5, .5, .8)
            self.viewer.add_geom(self.axle)
            self.track = rendering.Line((0, carty), (screen_width, carty))
            self.track.set_color(0, 0, 0)
            self.viewer.add_geom(self.track)

        if self.state is None:
            return None

        x = self.state
        cartx = x[0] * scale + screen_width / 2.0  # MIDDLE OF CART
        self.carttrans.set_translation(cartx, carty)
        self.poletrans.set_rotation(-x[2])

        return self.viewer.render(return_rgb_array=(mode == 'rgb_array'))

    def close(self):
        """Close the viewer, if one was created."""
        if self.viewer:
            self.viewer.close()
            # Drop the reference so a later render() builds a fresh viewer
            # instead of drawing into the closed one.
            self.viewer = None
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import paddle.fluid as fluid
import parl
from parl import layers
class Model(parl.Model):
    """Bundles an actor network and a critic network in one model object."""

    def __init__(self, act_dim):
        self.actor_model = ActorModel(act_dim)
        self.critic_model = CriticModel()

    def policy(self, obs):
        """Return the actor network's output for ``obs``."""
        return self.actor_model.policy(obs)

    def value(self, obs, act):
        """Return the critic network's output for the pair ``(obs, act)``."""
        return self.critic_model.value(obs, act)

    def get_actor_params(self):
        """Return the actor network's parameters only (critic excluded)."""
        return self.actor_model.parameters()
class ActorModel(parl.Model):
    """Two-layer actor: relu hidden layer of width 100, tanh output layer."""

    def __init__(self, act_dim):
        self.fc1 = layers.fc(size=100, act='relu')
        self.fc2 = layers.fc(size=act_dim, act='tanh')

    def policy(self, obs):
        """Return the tanh layer's output for ``obs`` (one value per action dim)."""
        return self.fc2(self.fc1(obs))
class CriticModel(parl.Model):
    """Two-layer critic: relu hidden layer of width 100, linear scalar output."""

    def __init__(self):
        self.fc1 = layers.fc(size=100, act='relu')
        self.fc2 = layers.fc(size=1, act=None)

    def value(self, obs, act):
        """Return Q for ``(obs, act)`` with the size-one axis 1 removed."""
        # Observation and action are joined along axis 1 before the layers.
        joint = layers.concat([obs, act], axis=1)
        q_column = self.fc2(self.fc1(joint))
        return layers.squeeze(q_column, axes=[1])
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py
import random
import collections
import numpy as np
class ReplayMemory(object):
    """Bounded FIFO store of transitions used for experience replay.

    Every stored item is unpacked as a 5-tuple
    ``(obs, action, reward, next_obs, done)`` when sampled. Once
    ``max_size`` items are held, appending another one drops the oldest.
    """

    def __init__(self, max_size):
        # A deque with maxlen evicts from the left automatically when full.
        self.buffer = collections.deque(maxlen=max_size)

    def append(self, exp):
        """Store one transition."""
        self.buffer.append(exp)

    def sample(self, batch_size):
        """Draw ``batch_size`` stored transitions at random (no repeats).

        Returns:
            tuple: five float32 arrays in the order
            (obs, action, reward, next_obs, done), each with the sampled
            transitions stacked along the first axis.
        """
        columns = ([], [], [], [], [])
        for transition in random.sample(self.buffer, batch_size):
            obs, action, reward, next_obs, done = transition
            fields = (obs, action, reward, next_obs, done)
            for column, value in zip(columns, fields):
                column.append(value)
        return tuple(
            np.array(column).astype('float32') for column in columns)

    def __len__(self):
        """Number of transitions currently stored."""
        return len(self.buffer)
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
#-*- coding: utf-8 -*-
import gym
import numpy as np
import parl
from parl.utils import logger
from agent import Agent
from model import Model
from algorithm import DDPG # from parl.algorithms import DDPG
from env import ContinuousCartPoleEnv
from replay_memory import ReplayMemory
ACTOR_LR = 1e-3  # learning rate of the actor network
CRITIC_LR = 1e-3  # learning rate of the critic network
GAMMA = 0.99  # reward discount factor
TAU = 0.001  # soft-update coefficient
MEMORY_SIZE = int(1e6)  # capacity of the replay memory
MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20  # transitions to collect before training starts
BATCH_SIZE = 128  # transitions sampled from the replay memory per learn call
REWARD_SCALE = 0.1  # factor applied to rewards before they are stored
NOISE = 0.05  # standard deviation of the Gaussian action noise (np.random.normal scale)
TRAIN_EPISODE = 6e3  # total number of training episodes
def run_episode(agent, env, rpm):
    """Play one training episode of at most 200 steps.

    Each step perturbs the agent's action with Gaussian noise, stores the
    transition (with the reward multiplied by ``REWARD_SCALE``) in ``rpm``
    and, once ``rpm`` holds more than ``MEMORY_WARMUP_SIZE`` transitions,
    learns from a sampled batch on every 5th step.

    Returns:
        Sum of the unscaled rewards received during the episode.
    """
    state = env.reset()
    episode_return = 0
    for step_count in range(1, 201):
        net_input = np.expand_dims(state, axis=0).astype('float32')
        action = agent.predict(net_input)

        # Add exploration noise and keep the result within [-1.0, 1.0].
        noisy = np.random.normal(action, NOISE)
        action = np.clip(noisy, -1.0, 1.0)

        successor, reward, done, _info = env.step(action)

        # Wrapped in a list for convenient storage in the replay memory.
        stored_action = [action]
        rpm.append(
            (state, stored_action, REWARD_SCALE * reward, successor, done))

        if len(rpm) > MEMORY_WARMUP_SIZE and (step_count % 5) == 0:
            (b_obs, b_action, b_reward, b_next_obs,
             b_done) = rpm.sample(BATCH_SIZE)
            agent.learn(b_obs, b_action, b_reward, b_next_obs, b_done)

        state = successor
        episode_return += reward

        if done:
            break
    return episode_return
def evaluate(env, agent, render=False, eval_episodes=5, max_steps=200):
    """Run noise-free evaluation episodes and return their mean total reward.

    Args:
        env: environment exposing ``reset()``, ``step(action)`` and, when
            ``render`` is True, ``render()``.
        agent: object whose ``predict(obs)`` returns the action; it is
            given the observation as float32 with a leading axis added.
        render (bool): call ``env.render()`` after every step when True.
        eval_episodes (int): number of episodes to average over. The
            default of 5 reproduces the previously hard-coded behaviour.
        max_steps (int): an episode is cut off after this many steps. The
            default of 200 reproduces the previously hard-coded limit.

    Returns:
        Mean of the per-episode reward sums, as computed by ``np.mean``.

    Raises:
        ValueError: if ``eval_episodes`` is smaller than 1 (the mean of
            zero episodes is undefined).
    """
    if eval_episodes < 1:
        raise ValueError(
            'eval_episodes must be >= 1, got {}'.format(eval_episodes))
    episode_rewards = []
    for _ in range(eval_episodes):
        obs = env.reset()
        total_reward = 0
        steps = 0
        while True:
            batch_obs = np.expand_dims(obs, axis=0)
            action = agent.predict(batch_obs.astype('float32'))
            # Keep the action inside the range the environment accepts.
            action = np.clip(action, -1.0, 1.0)

            steps += 1
            obs, reward, done, _info = env.step(action)
            total_reward += reward

            if render:
                env.render()
            if done or steps >= max_steps:
                break
        episode_rewards.append(total_reward)
    return np.mean(episode_rewards)
def main():
    """Train a DDPG agent on ContinuousCartPoleEnv, evaluating every 50 episodes."""
    env = ContinuousCartPoleEnv()
    obs_dim = env.observation_space.shape[0]
    act_dim = env.action_space.shape[0]

    # Assemble model -> algorithm -> agent with the PARL framework.
    net = Model(act_dim)
    ddpg = DDPG(
        net, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR)
    agent = Agent(ddpg, obs_dim, act_dim)

    # Create the replay memory and pre-fill it before training starts.
    replay = ReplayMemory(MEMORY_SIZE)
    while len(replay) < MEMORY_WARMUP_SIZE:
        run_episode(agent, env, replay)

    episode = 0
    while episode < TRAIN_EPISODE:
        # 50 training episodes between two evaluations.
        for _ in range(50):
            run_episode(agent, env, replay)
            episode += 1

        eval_reward = evaluate(env, agent, render=False)
        logger.info('episode:{}    Test reward:{}'.format(
            episode, eval_reward))


if __name__ == '__main__':
    main()
### Papers related to improved RL algorithms
1. **Proximal Distilled Evolutionary Reinforcement Learning** AAAI2020. [paper](https://arxiv.org/pdf/1906.09807.pdf)
*Cristian Bodnar, Ben Day, Pietro Liò*
2. **Uncertainty-Aware Action Advising for Deep Reinforcement Learning Agents** AAAI2020. [paper](https://aaai.org/Papers/AAAI/2020GB/AAAI-SilvaF.2159.pdf)
*Felipe Leno da Silva (University of Sao Paulo), Pablo Hernandez-Leal (Borealis AI), Bilal Kartal (Borealis AI), Matthew Taylor (Borealis AI)*
3. **Partner Selection for the Emergence of Cooperation in Multi-Agent Systems Using Reinforcement Learning** AAAI2020. [paper](https://aaai.org/Papers/AAAI/2020GB/AAAI-AnastassacosN.1598.pdf)
*Nicolas Anastassacos, Stephen Hailes, Mirco Musolesi*
4. **Reinforcement Learning with Perturbed Reward** AAAI2020. [paper](https://www.aaai.org/Papers/AAAI/2020GB/AAAI-WangJK.4139.pdf)
*Jingkang Wang, Yang Liu, Bo Li*
5. **Deep Model-Based Reinforcement Learning via Estimated Uncertainty and Conservative Policy Optimization** AAAI2020. [paper](https://arxiv.org/pdf/1911.12574.pdf)
*Qi Zhou, HouQiang Li, Jie Wang*
6. **Reinforcement Learning of Risk-Constrained Policies in Markov Decision Processes** AAAI2020. [paper](https://www.fi.muni.cz/~xnovot18/aaai20.pdf)
*Tomáš Brázdil, Krishnendu Chatterjee, Petr Novotný, Jiří Vahala*
7. **Exploratory Combinatorial Optimization with Reinforcement Learning** AAAI2020. [paper](https://arxiv.org/pdf/1909.04063.pdf)
*Thomas D. Barrett, William R. Clements, Jakob N. Foerster, Alex I. Lvovsky*
8. **Fixed-Horizon Temporal Difference Methods for Stable Reinforcement Learning** AAAI2020. [paper](https://arxiv.org/pdf/1909.03906.pdf)
*Kristopher De Asis, Alan Chan, Silviu Pitis, Richard S. Sutton, Daniel Graves*
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
__version__ = "1.2.1" __version__ = "1.3.1"
""" """
generates new PARL python API generates new PARL python API
""" """
......
...@@ -24,23 +24,15 @@ __all__ = ['A3C'] ...@@ -24,23 +24,15 @@ __all__ = ['A3C']
class A3C(Algorithm): class A3C(Algorithm):
def __init__(self, model, hyperparas=None, vf_loss_coeff=None): def __init__(self, model, vf_loss_coeff=None):
""" A3C/A2C algorithm """ A3C/A2C algorithm
Args: Args:
model (parl.Model): forward network of policy and value model (parl.Model): forward network of policy and value
hyperparas (dict): (deprecated) dict of hyper parameters.
vf_loss_coeff (float): coefficient of the value function loss vf_loss_coeff (float): coefficient of the value function loss
""" """
self.model = model self.model = model
if hyperparas is not None:
warnings.warn(
"the `hyperparas` argument of `__init__` function in `parl.Algorithms.A3C` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.vf_loss_coeff = hyperparas['vf_loss_coeff']
else:
assert isinstance(vf_loss_coeff, (int, float)) assert isinstance(vf_loss_coeff, (int, float))
self.vf_loss_coeff = vf_loss_coeff self.vf_loss_coeff = vf_loss_coeff
......
...@@ -19,7 +19,6 @@ from parl.core.fluid import layers ...@@ -19,7 +19,6 @@ from parl.core.fluid import layers
from copy import deepcopy from copy import deepcopy
from paddle import fluid from paddle import fluid
from parl.core.fluid.algorithm import Algorithm from parl.core.fluid.algorithm import Algorithm
from parl.utils.deprecation import deprecated
__all__ = ['DDPG'] __all__ = ['DDPG']
...@@ -27,7 +26,6 @@ __all__ = ['DDPG'] ...@@ -27,7 +26,6 @@ __all__ = ['DDPG']
class DDPG(Algorithm): class DDPG(Algorithm):
def __init__(self, def __init__(self,
model, model,
hyperparas=None,
gamma=None, gamma=None,
tau=None, tau=None,
actor_lr=None, actor_lr=None,
...@@ -37,22 +35,11 @@ class DDPG(Algorithm): ...@@ -37,22 +35,11 @@ class DDPG(Algorithm):
Args: Args:
model (parl.Model): forward network of actor and critic. model (parl.Model): forward network of actor and critic.
The function get_actor_params() of model should be implemented. The function get_actor_params() of model should be implemented.
hyperparas (dict): (deprecated) dict of hyper parameters.
gamma (float): discounted factor for reward computation. gamma (float): discounted factor for reward computation.
tau (float): decay coefficient when updating the weights of self.target_model with self.model tau (float): decay coefficient when updating the weights of self.target_model with self.model
actor_lr (float): learning rate of the actor model actor_lr (float): learning rate of the actor model
critic_lr (float): learning rate of the critic model critic_lr (float): learning rate of the critic model
""" """
if hyperparas is not None:
warnings.warn(
"the `hyperparas` argument of `__init__` function in `parl.Algorithms.DDPG` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.gamma = hyperparas['gamma']
self.tau = hyperparas['tau']
self.actor_lr = hyperparas['actor_lr']
self.critic_lr = hyperparas['critic_lr']
else:
assert isinstance(gamma, float) assert isinstance(gamma, float)
assert isinstance(tau, float) assert isinstance(tau, float)
assert isinstance(actor_lr, float) assert isinstance(actor_lr, float)
...@@ -65,25 +52,11 @@ class DDPG(Algorithm): ...@@ -65,25 +52,11 @@ class DDPG(Algorithm):
self.model = model self.model = model
self.target_model = deepcopy(model) self.target_model = deepcopy(model)
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='predict')
def define_predict(self, obs):
""" use actor model of self.model to predict the action
"""
return self.predict(obs)
def predict(self, obs): def predict(self, obs):
""" use actor model of self.model to predict the action """ use actor model of self.model to predict the action
""" """
return self.model.policy(obs) return self.model.policy(obs)
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='learn')
def define_learn(self, obs, action, reward, next_obs, terminal):
""" update actor and critic model with DDPG algorithm
"""
return self.learn(obs, action, reward, next_obs, terminal)
def learn(self, obs, action, reward, next_obs, terminal): def learn(self, obs, action, reward, next_obs, terminal):
""" update actor and critic model with DDPG algorithm """ update actor and critic model with DDPG algorithm
""" """
...@@ -115,15 +88,7 @@ class DDPG(Algorithm): ...@@ -115,15 +88,7 @@ class DDPG(Algorithm):
optimizer.minimize(cost) optimizer.minimize(cost)
return cost return cost
def sync_target(self, def sync_target(self, decay=None, share_vars_parallel_executor=None):
gpu_id=None,
decay=None,
share_vars_parallel_executor=None):
if gpu_id is not None:
warnings.warn(
"the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DDPG` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
if decay is None: if decay is None:
decay = 1.0 - self.tau decay = 1.0 - self.tau
self.model.sync_weights_to( self.model.sync_weights_to(
......
...@@ -21,19 +21,17 @@ import paddle.fluid as fluid ...@@ -21,19 +21,17 @@ import paddle.fluid as fluid
from parl.core.fluid.algorithm import Algorithm from parl.core.fluid.algorithm import Algorithm
from parl.core.fluid import layers from parl.core.fluid import layers
__all__ = ['DDQN']
class DDQN(Algorithm): class DDQN(Algorithm):
def __init__( def __init__(self, model, act_dim=None, gamma=None, lr=None):
self,
model,
act_dim=None,
gamma=None,
):
""" Double DQN algorithm """ Double DQN algorithm
Args: Args:
model (parl.Model): model defining forward network of Q function. model (parl.Model): model defining forward network of Q function
act_dim (int): dimension of the action space
gamma (float): discounted factor for reward computation. gamma (float): discounted factor for reward computation.
lr (float): learning rate.
""" """
self.model = model self.model = model
self.target_model = copy.deepcopy(model) self.target_model = copy.deepcopy(model)
...@@ -43,11 +41,29 @@ class DDQN(Algorithm): ...@@ -43,11 +41,29 @@ class DDQN(Algorithm):
self.act_dim = act_dim self.act_dim = act_dim
self.gamma = gamma self.gamma = gamma
self.lr = lr
def predict(self, obs): def predict(self, obs):
""" use value model self.model to predict the action value
"""
return self.model.value(obs) return self.model.value(obs)
def learn(self, obs, action, reward, next_obs, terminal, learning_rate): def learn(self,
obs,
action,
reward,
next_obs,
terminal,
learning_rate=None):
""" update value model self.model with DQN algorithm
"""
# Support the modification of learning_rate
if learning_rate is None:
assert isinstance(
self.lr,
float), "Please set the learning rate of DQN in initializaion."
learning_rate = self.lr
pred_value = self.model.value(obs) pred_value = self.model.value(obs)
action_onehot = layers.one_hot(action, self.act_dim) action_onehot = layers.one_hot(action, self.act_dim)
action_onehot = layers.cast(action_onehot, dtype='float32') action_onehot = layers.cast(action_onehot, dtype='float32')
...@@ -85,12 +101,7 @@ class DDQN(Algorithm): ...@@ -85,12 +101,7 @@ class DDQN(Algorithm):
optimizer.minimize(cost) optimizer.minimize(cost)
return cost return cost
def sync_target(self, gpu_id=None): def sync_target(self):
""" sync weights of self.model to self.target_model """ sync weights of self.model to self.target_model
""" """
if gpu_id is not None:
warnings.warn(
"the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.model.sync_weights_to(self.target_model) self.model.sync_weights_to(self.target_model)
...@@ -19,18 +19,16 @@ import copy ...@@ -19,18 +19,16 @@ import copy
import paddle.fluid as fluid import paddle.fluid as fluid
from parl.core.fluid.algorithm import Algorithm from parl.core.fluid.algorithm import Algorithm
from parl.core.fluid import layers from parl.core.fluid import layers
from parl.utils.deprecation import deprecated
__all__ = ['DQN'] __all__ = ['DQN']
class DQN(Algorithm): class DQN(Algorithm):
def __init__(self, model, hyperparas=None, act_dim=None, gamma=None): def __init__(self, model, act_dim=None, gamma=None, lr=None):
""" DQN algorithm """ DQN algorithm
Args: Args:
model (parl.Model): model defining forward network of Q function model (parl.Model): model defining forward network of Q function
hyperparas (dict): (deprecated) dict of hyper parameters.
act_dim (int): dimension of the action space act_dim (int): dimension of the action space
gamma (float): discounted factor for reward computation. gamma (float): discounted factor for reward computation.
lr (float): learning rate. lr (float): learning rate.
...@@ -38,41 +36,33 @@ class DQN(Algorithm): ...@@ -38,41 +36,33 @@ class DQN(Algorithm):
self.model = model self.model = model
self.target_model = copy.deepcopy(model) self.target_model = copy.deepcopy(model)
if hyperparas is not None:
warnings.warn(
"the `hyperparas` argument of `__init__` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.act_dim = hyperparas['action_dim']
self.gamma = hyperparas['gamma']
else:
assert isinstance(act_dim, int) assert isinstance(act_dim, int)
assert isinstance(gamma, float) assert isinstance(gamma, float)
self.act_dim = act_dim self.act_dim = act_dim
self.gamma = gamma self.gamma = gamma
self.lr = lr
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='predict')
def define_predict(self, obs):
""" use value model self.model to predict the action value
"""
return self.predict(obs)
def predict(self, obs): def predict(self, obs):
""" use value model self.model to predict the action value """ use value model self.model to predict the action value
""" """
return self.model.value(obs) return self.model.value(obs)
@deprecated( def learn(self,
deprecated_in='1.2', removed_in='1.3', replace_function='learn') obs,
def define_learn(self, obs, action, reward, next_obs, terminal, action,
learning_rate): reward,
return self.learn(obs, action, reward, next_obs, terminal, next_obs,
learning_rate) terminal,
learning_rate=None):
def learn(self, obs, action, reward, next_obs, terminal, learning_rate):
""" update value model self.model with DQN algorithm """ update value model self.model with DQN algorithm
""" """
# Support the modification of learning_rate
if learning_rate is None:
assert isinstance(
self.lr,
float), "Please set the learning rate of DQN in initializaion."
learning_rate = self.lr
pred_value = self.model.value(obs) pred_value = self.model.value(obs)
next_pred_value = self.target_model.value(next_obs) next_pred_value = self.target_model.value(next_obs)
...@@ -92,12 +82,7 @@ class DQN(Algorithm): ...@@ -92,12 +82,7 @@ class DQN(Algorithm):
optimizer.minimize(cost) optimizer.minimize(cost)
return cost return cost
def sync_target(self, gpu_id=None): def sync_target(self):
""" sync weights of self.model to self.target_model """ sync weights of self.model to self.target_model
""" """
if gpu_id is not None:
warnings.warn(
"the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.model.sync_weights_to(self.target_model) self.model.sync_weights_to(self.target_model)
...@@ -85,34 +85,21 @@ class VTraceLoss(object): ...@@ -85,34 +85,21 @@ class VTraceLoss(object):
class IMPALA(Algorithm): class IMPALA(Algorithm):
def __init__(self, def __init__(self,
model, model,
hyperparas=None,
sample_batch_steps=None, sample_batch_steps=None,
gamma=None, gamma=None,
vf_loss_coeff=None, vf_loss_coeff=None,
clip_rho_threshold=None, clip_rho_threshold=None,
clip_pg_rho_threshold=None): clip_pg_rho_threshold=None):
""" IMPALA algorithm r""" IMPALA algorithm
Args: Args:
model (parl.Model): forward network of policy and value model (parl.Model): forward network of policy and value
hyperparas (dict): (deprecated) dict of hyper parameters.
sample_batch_steps (int): steps of each environment sampling. sample_batch_steps (int): steps of each environment sampling.
gamma (float): discounted factor for reward computation. gamma (float): discounted factor for reward computation.
vf_loss_coeff (float): coefficient of the value function loss. vf_loss_coeff (float): coefficient of the value function loss.
clip_rho_threshold (float): clipping threshold for importance weights (rho). clip_rho_threshold (float): clipping threshold for importance weights (rho).
clip_pg_rho_threshold (float): clipping threshold on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). clip_pg_rho_threshold (float): clipping threshold on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)).
""" """
if hyperparas is not None:
warnings.warn(
"the `hyperparas` argument of `__init__` function in `parl.Algorithms.IMPALA` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.sample_batch_steps = hyperparas['sample_batch_steps']
self.gamma = hyperparas['gamma']
self.vf_loss_coeff = hyperparas['vf_loss_coeff']
self.clip_rho_threshold = hyperparas['clip_rho_threshold']
self.clip_pg_rho_threshold = hyperparas['clip_pg_rho_threshold']
else:
assert isinstance(sample_batch_steps, int) assert isinstance(sample_batch_steps, int)
assert isinstance(gamma, float) assert isinstance(gamma, float)
assert isinstance(vf_loss_coeff, float) assert isinstance(vf_loss_coeff, float)
......
...@@ -146,7 +146,7 @@ def from_importance_weights(behaviour_actions_log_probs, ...@@ -146,7 +146,7 @@ def from_importance_weights(behaviour_actions_log_probs,
def recursively_scan(discounts, cs, deltas): def recursively_scan(discounts, cs, deltas):
""" Recursively calculate vs_minus_v_xs according to following equation: r""" Recursively calculate vs_minus_v_xs according to following equation:
vs_minus_v_xs(t) = deltas(t) + discounts(t) * cs(t) * vs_minus_v_xs(t + 1) vs_minus_v_xs(t) = deltas(t) + discounts(t) * cs(t) * vs_minus_v_xs(t + 1)
Args: Args:
......
...@@ -27,10 +27,11 @@ from parl.core.fluid.policy_distribution import SoftMultiCategoricalDistribution ...@@ -27,10 +27,11 @@ from parl.core.fluid.policy_distribution import SoftMultiCategoricalDistribution
def SoftPDistribution(logits, act_space): def SoftPDistribution(logits, act_space):
"""input: """Args:
logits: the output of policy model logits: the output of policy model
act_space: action space, must be gym.spaces.Discrete or multiagent.multi_discrete.MultiDiscrete act_space: action space, must be gym.spaces.Discrete or multiagent.multi_discrete.MultiDiscrete
output:
Return:
instance of SoftCategoricalDistribution or SoftMultiCategoricalDistribution instance of SoftCategoricalDistribution or SoftMultiCategoricalDistribution
""" """
# is instance of gym.spaces.Discrete # is instance of gym.spaces.Discrete
......
...@@ -18,51 +18,28 @@ warnings.simplefilter('default') ...@@ -18,51 +18,28 @@ warnings.simplefilter('default')
import paddle.fluid as fluid import paddle.fluid as fluid
from parl.core.fluid.algorithm import Algorithm from parl.core.fluid.algorithm import Algorithm
from parl.core.fluid import layers from parl.core.fluid import layers
from parl.utils.deprecation import deprecated
__all__ = ['PolicyGradient'] __all__ = ['PolicyGradient']
class PolicyGradient(Algorithm): class PolicyGradient(Algorithm):
def __init__(self, model, hyperparas=None, lr=None): def __init__(self, model, lr=None):
""" Policy Gradient algorithm """ Policy Gradient algorithm
Args: Args:
model (parl.Model): forward network of the policy. model (parl.Model): forward network of the policy.
hyperparas (dict): (deprecated) dict of hyper parameters.
lr (float): learning rate of the policy model. lr (float): learning rate of the policy model.
""" """
self.model = model self.model = model
if hyperparas is not None:
warnings.warn(
"the `hyperparas` argument of `__init__` function in `parl.Algorithms.PolicyGradient` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.lr = hyperparas['lr']
else:
assert isinstance(lr, float) assert isinstance(lr, float)
self.lr = lr self.lr = lr
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='predict')
def define_predict(self, obs):
""" use policy model self.model to predict the action probability
"""
return self.predict(obs)
def predict(self, obs): def predict(self, obs):
""" use policy model self.model to predict the action probability """ use policy model self.model to predict the action probability
""" """
return self.model(obs) return self.model(obs)
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='learn')
def define_learn(self, obs, action, reward):
""" update policy model self.model with policy gradient algorithm
"""
return self.learn(obs, action, reward)
def learn(self, obs, action, reward): def learn(self, obs, action, reward):
""" update policy model self.model with policy gradient algorithm """ update policy model self.model with policy gradient algorithm
""" """
......
...@@ -20,7 +20,6 @@ from copy import deepcopy ...@@ -20,7 +20,6 @@ from copy import deepcopy
from paddle import fluid from paddle import fluid
from parl.core.fluid import layers from parl.core.fluid import layers
from parl.core.fluid.algorithm import Algorithm from parl.core.fluid.algorithm import Algorithm
from parl.utils.deprecation import deprecated
__all__ = ['PPO'] __all__ = ['PPO']
...@@ -28,7 +27,6 @@ __all__ = ['PPO'] ...@@ -28,7 +27,6 @@ __all__ = ['PPO']
class PPO(Algorithm): class PPO(Algorithm):
def __init__(self, def __init__(self,
model, model,
hyperparas=None,
act_dim=None, act_dim=None,
policy_lr=None, policy_lr=None,
value_lr=None, value_lr=None,
...@@ -37,7 +35,6 @@ class PPO(Algorithm): ...@@ -37,7 +35,6 @@ class PPO(Algorithm):
Args: Args:
model (parl.Model): model defining forward network of policy and value. model (parl.Model): model defining forward network of policy and value.
hyperparas (dict): (deprecated) dict of hyper parameters.
act_dim (float): dimension of the action space. act_dim (float): dimension of the action space.
policy_lr (float): learning rate of the policy model. policy_lr (float): learning rate of the policy model.
value_lr (float): learning rate of the value model. value_lr (float): learning rate of the value model.
...@@ -47,19 +44,6 @@ class PPO(Algorithm): ...@@ -47,19 +44,6 @@ class PPO(Algorithm):
# Used to calculate probability of action in old policy # Used to calculate probability of action in old policy
self.old_policy_model = deepcopy(model.policy_model) self.old_policy_model = deepcopy(model.policy_model)
if hyperparas is not None:
warnings.warn(
"the `hyperparas` argument of `__init__` function in `parl.Algorithms.PPO` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.act_dim = hyperparas['act_dim']
self.policy_lr = hyperparas['policy_lr']
self.value_lr = hyperparas['value_lr']
if 'epsilon' in hyperparas:
self.epsilon = hyperparas['epsilon']
else:
self.epsilon = 0.2 # default
else:
assert isinstance(act_dim, int) assert isinstance(act_dim, int)
assert isinstance(policy_lr, float) assert isinstance(policy_lr, float)
assert isinstance(value_lr, float) assert isinstance(value_lr, float)
...@@ -111,49 +95,18 @@ class PPO(Algorithm): ...@@ -111,49 +95,18 @@ class PPO(Algorithm):
log_det_cov_new - log_det_cov_old) + tr_old_new - self.act_dim) log_det_cov_new - log_det_cov_old) + tr_old_new - self.act_dim)
return kl return kl
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='predict')
def define_predict(self, obs):
""" Use policy model of self.model to predict means and logvars of actions
"""
return self.predict(obs)
def predict(self, obs): def predict(self, obs):
""" Use the policy model of self.model to predict means and logvars of actions """ Use the policy model of self.model to predict means and logvars of actions
""" """
means, logvars = self.model.policy(obs) means, logvars = self.model.policy(obs)
return means return means
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='sample')
def define_sample(self, obs):
""" Use the policy model of self.model to sample actions
"""
return self.sample(obs)
def sample(self, obs): def sample(self, obs):
""" Use the policy model of self.model to sample actions """ Use the policy model of self.model to sample actions
""" """
sampled_act = self.model.policy_sample(obs) sampled_act = self.model.policy_sample(obs)
return sampled_act return sampled_act
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='policy_learn')
def define_policy_learn(self, obs, actions, advantages, beta=None):
""" Learn policy model with:
1. CLIP loss: Clipped Surrogate Objective
2. KLPEN loss: Adaptive KL Penalty Objective
See: https://arxiv.org/pdf/1707.02286.pdf
Args:
obs: Tensor, (batch_size, obs_dim)
actions: Tensor, (batch_size, act_dim)
advantages: Tensor (batch_size, )
beta: Tensor (1) or None
if None, use CLIP Loss; else, use KLPEN loss.
"""
return self.policy_learn(obs, actions, advantages, beta)
def policy_learn(self, obs, actions, advantages, beta=None): def policy_learn(self, obs, actions, advantages, beta=None):
""" Learn policy model with: """ Learn policy model with:
1. CLIP loss: Clipped Surrogate Objective 1. CLIP loss: Clipped Surrogate Objective
...@@ -196,27 +149,11 @@ class PPO(Algorithm): ...@@ -196,27 +149,11 @@ class PPO(Algorithm):
optimizer.minimize(loss) optimizer.minimize(loss)
return loss, kl return loss, kl
@deprecated(
deprecated_in='1.2',
removed_in='1.3',
replace_function='value_predict')
def define_value_predict(self, obs):
""" Use value model of self.model to predict value of obs
"""
return self.value_predict(obs)
def value_predict(self, obs): def value_predict(self, obs):
""" Use value model of self.model to predict value of obs """ Use value model of self.model to predict value of obs
""" """
return self.model.value(obs) return self.model.value(obs)
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='value_learn')
def define_value_learn(self, obs, val):
""" Learn value model with square error cost
"""
return self.value_learn(obs, val)
def value_learn(self, obs, val): def value_learn(self, obs, val):
""" Learn the value model with square error cost """ Learn the value model with square error cost
""" """
...@@ -227,12 +164,7 @@ class PPO(Algorithm): ...@@ -227,12 +164,7 @@ class PPO(Algorithm):
optimizer.minimize(loss) optimizer.minimize(loss)
return loss return loss
def sync_old_policy(self, gpu_id=None): def sync_old_policy(self):
""" Synchronize weights of self.model.policy_model to self.old_policy_model """ Synchronize weights of self.model.policy_model to self.old_policy_model
""" """
if gpu_id is not None:
warnings.warn(
"the `gpu_id` argument of `sync_old_policy` function in `parl.Algorithms.PPO` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.model.policy_model.sync_weights_to(self.old_policy_model) self.model.policy_model.sync_weights_to(self.old_policy_model)
...@@ -102,11 +102,11 @@ class SAC(Algorithm): ...@@ -102,11 +102,11 @@ class SAC(Algorithm):
return cost return cost
def critic_learn(self, obs, action, reward, next_obs, terminal): def critic_learn(self, obs, action, reward, next_obs, terminal):
next_state_action, next_state_log_pi = self.sample(next_obs) next_obs_action, next_obs_log_pi = self.sample(next_obs)
qf1_next_target, qf2_next_target = self.target_critic.value( qf1_next_target, qf2_next_target = self.target_critic.value(
next_obs, next_state_action) next_obs, next_obs_action)
min_qf_next_target = layers.elementwise_min( min_qf_next_target = layers.elementwise_min(
qf1_next_target, qf2_next_target) - next_state_log_pi * self.alpha qf1_next_target, qf2_next_target) - next_obs_log_pi * self.alpha
terminal = layers.cast(terminal, dtype='float32') terminal = layers.cast(terminal, dtype='float32')
target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
import paddle.fluid as fluid
import parl
from parl import layers
class DQNModel(parl.Model):
    """Small fully connected value network: a 32-unit ReLU layer followed by
    a 2-unit output layer.
    """

    def __init__(self):
        self.fc1 = layers.fc(size=32, act='relu')
        self.fc2 = layers.fc(size=2)

    def value(self, obs):
        """Pass ``obs`` through ``fc1`` and then ``fc2`` and return the result.
        """
        hidden = self.fc1(obs)
        return self.fc2(hidden)
class DQNAgent(parl.Agent):
    """Agent that wires an algorithm's ``predict`` and ``learn`` into two
    separate fluid programs and runs them through ``self.fluid_executor``.
    """

    def __init__(self, algorithm):
        super(DQNAgent, self).__init__(algorithm)
        self.alg = algorithm

    def build_program(self):
        """Create ``pred_program`` and ``learn_program`` and define their
        inputs and outputs.
        """
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        # Prediction graph: a single 'obs' input, output stored in self.value.
        with fluid.program_guard(self.pred_program):
            pred_obs = layers.data(name='obs', shape=[4], dtype='float32')
            self.value = self.alg.predict(pred_obs)

        # Learning graph: transition inputs plus a feedable learning rate.
        with fluid.program_guard(self.learn_program):
            obs_in = layers.data(name='obs', shape=[4], dtype='float32')
            action_in = layers.data(name='act', shape=[1], dtype='int32')
            reward_in = layers.data(name='reward', shape=[], dtype='float32')
            next_obs_in = layers.data(
                name='next_obs', shape=[4], dtype='float32')
            lr_in = layers.data(
                name='lr', shape=[1], dtype='float32', append_batch_size=False)
            terminal_in = layers.data(name='terminal', shape=[], dtype='bool')
            self.cost = self.alg.learn(obs_in, action_in, reward_in,
                                       next_obs_in, terminal_in, lr_in)

    def predict(self, obs):
        """Run the prediction program on one observation and return the
        argmax index of the fetched values.
        """
        # Add a leading axis so the single observation is fed as a batch of 1.
        batched_obs = np.expand_dims(obs, axis=0)
        fetched = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batched_obs.astype('float32')},
            fetch_list=[self.value])
        pred_Q = np.squeeze(fetched[0], axis=0)
        return np.argmax(pred_Q)

    def learn(self, obs, act, reward, next_obs, terminal):
        """Run one step of the learning program with a fixed learning rate
        of 3e-4 and return the fetched cost.
        """
        learning_rate = 3e-4
        batched_obs = np.expand_dims(obs, axis=0)
        batched_next_obs = np.expand_dims(next_obs, axis=0)
        act_column = np.expand_dims(act, -1)

        feed = {
            'obs': batched_obs.astype('float32'),
            'act': act_column.astype('int32'),
            'reward': reward,
            'next_obs': batched_next_obs.astype('float32'),
            'terminal': terminal,
            'lr': np.float32(learning_rate)
        }
        fetched = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.cost])
        return fetched[0]
class A3CModel(parl.Model):
    """Network with a shared 32-unit ReLU layer feeding a 2-unit policy head
    and a 1-unit value head.
    """

    def __init__(self):
        self.fc = layers.fc(size=32, act='relu')
        self.policy_fc = layers.fc(size=2)
        self.value_fc = layers.fc(size=1)

    def policy(self, obs):
        """Return the policy head output for ``obs``."""
        shared = self.fc(obs)
        return self.policy_fc(shared)

    def value(self, obs):
        """Return the value head output for ``obs`` with axis 1 squeezed."""
        shared = self.fc(obs)
        raw_values = self.value_fc(shared)
        return layers.squeeze(raw_values, axes=[1])

    def policy_and_value(self, obs):
        """Return ``(policy_logits, values)`` computed from one pass through
        the shared layer.
        """
        shared = self.fc(obs)
        policy_logits = self.policy_fc(shared)
        raw_values = self.value_fc(shared)
        return policy_logits, layers.squeeze(raw_values, axes=[1])
class A3CAgent(parl.Agent):
    """Agent exposing the algorithm's ``predict``, ``value`` and ``learn``
    through three separate fluid programs.
    """

    def __init__(self, algorithm):
        super(A3CAgent, self).__init__(algorithm)
        self.alg = algorithm

    def build_program(self):
        """Create the predict, value and learn programs and define their
        inputs and outputs.
        """
        self.predict_program = fluid.Program()
        self.value_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.predict_program):
            predict_obs = layers.data(name='obs', shape=[4], dtype='float32')
            self.predict_actions = self.alg.predict(predict_obs)

        with fluid.program_guard(self.value_program):
            value_obs = layers.data(name='obs', shape=[4], dtype='float32')
            self.values = self.alg.value(value_obs)

        with fluid.program_guard(self.learn_program):
            learn_obs = layers.data(name='obs', shape=[4], dtype='float32')
            actions_in = layers.data(name='actions', shape=[], dtype='int64')
            advantages_in = layers.data(
                name='advantages', shape=[], dtype='float32')
            target_values_in = layers.data(
                name='target_values', shape=[], dtype='float32')
            # Learning rate and entropy coefficient are fed on every step.
            lr_in = layers.data(
                name='lr', shape=[1], dtype='float32', append_batch_size=False)
            entropy_coeff_in = layers.data(
                name='entropy_coeff',
                shape=[1],
                dtype='float32',
                append_batch_size=False)

            losses = self.alg.learn(learn_obs, actions_in, advantages_in,
                                    target_values_in, lr_in, entropy_coeff_in)
            total_loss, pi_loss, vf_loss, entropy = losses
            self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy]

    def predict(self, obs_np):
        """Run the predict program on ``obs_np`` (cast to float32) and return
        the fetched actions.
        """
        fetched = self.fluid_executor.run(
            self.predict_program,
            feed={'obs': obs_np.astype('float32')},
            fetch_list=[self.predict_actions])
        return fetched[0]

    def value(self, obs_np):
        """Run the value program on ``obs_np`` (cast to float32) and return
        the fetched values.
        """
        fetched = self.fluid_executor.run(
            self.value_program,
            feed={'obs': obs_np.astype('float32')},
            fetch_list=[self.values])
        return fetched[0]

    def learn(self, obs_np, actions_np, advantages_np, target_values_np):
        """Run one learning step with lr=3e-4 and entropy_coeff=0.

        Returns:
            (total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff)
        """
        lr = 3e-4
        entropy_coeff = 0.

        feed = {
            'obs': obs_np.astype('float32'),
            'actions': actions_np.astype('int64'),
            'advantages': advantages_np.astype('float32'),
            'target_values': target_values_np.astype('float32'),
            'lr': np.array([lr], dtype='float32'),
            'entropy_coeff': np.array([entropy_coeff], dtype='float32')
        }
        fetched = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=self.learn_outputs)
        total_loss, pi_loss, vf_loss, entropy = fetched
        return total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff
class IMPALAModel(parl.Model):
    """Network with a shared 32-unit ReLU layer feeding a 2-unit policy head
    and a 1-unit value head.
    """

    def __init__(self):
        self.fc = layers.fc(size=32, act='relu')
        self.policy_fc = layers.fc(size=2)
        self.value_fc = layers.fc(size=1)

    def policy(self, obs):
        """Return the policy head output for ``obs``."""
        shared = self.fc(obs)
        return self.policy_fc(shared)

    def value(self, obs):
        """Return the value head output for ``obs`` with axis 1 squeezed."""
        shared = self.fc(obs)
        raw_values = self.value_fc(shared)
        return layers.squeeze(raw_values, axes=[1])
class IMPALAAgent(parl.Agent):
    """Agent exposing the algorithm's ``predict`` and ``learn`` through two
    fluid programs.
    """

    def __init__(self, algorithm):
        super(IMPALAAgent, self).__init__(algorithm)
        self.alg = algorithm

    def build_program(self):
        """Create the predict and learn programs and define their inputs and
        outputs.
        """
        self.predict_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.predict_program):
            predict_obs = layers.data(name='obs', shape=[4], dtype='float32')
            self.predict_actions = self.alg.predict(predict_obs)

        with fluid.program_guard(self.learn_program):
            learn_obs = layers.data(name='obs', shape=[4], dtype='float32')
            actions_in = layers.data(name='actions', shape=[], dtype='int64')
            behaviour_logits_in = layers.data(
                name='behaviour_logits', shape=[2], dtype='float32')
            rewards_in = layers.data(name='rewards', shape=[], dtype='float32')
            dones_in = layers.data(name='dones', shape=[], dtype='float32')
            # Learning rate and entropy coefficient are fed on every step.
            lr_in = layers.data(
                name='lr', shape=[1], dtype='float32', append_batch_size=False)
            entropy_coeff_in = layers.data(
                name='entropy_coeff',
                shape=[1],
                dtype='float32',
                append_batch_size=False)

            vtrace_loss, kl = self.alg.learn(
                learn_obs, actions_in, behaviour_logits_in, rewards_in,
                dones_in, lr_in, entropy_coeff_in)
            self.learn_outputs = [
                vtrace_loss.total_loss, vtrace_loss.pi_loss,
                vtrace_loss.vf_loss, vtrace_loss.entropy, kl
            ]

    def predict(self, obs_np):
        """Run the predict program on ``obs_np`` (cast to float32) and return
        the fetched actions.
        """
        fetched = self.fluid_executor.run(
            self.predict_program,
            feed={'obs': obs_np.astype('float32')},
            fetch_list=[self.predict_actions])
        return fetched[0]

    def learn(self, obs, actions, behaviour_logits, rewards, dones, lr,
              entropy_coeff):
        """Run one learning step and return
        ``(total_loss, pi_loss, vf_loss, entropy, kl)``.

        ``lr`` and ``entropy_coeff`` are wrapped in one-element float32
        arrays; the other arguments are fed unchanged.
        """
        feed = {
            'obs': obs,
            'actions': actions,
            'behaviour_logits': behaviour_logits,
            'rewards': rewards,
            'dones': dones,
            'lr': np.array([lr], dtype='float32'),
            'entropy_coeff': np.array([entropy_coeff], dtype='float32')
        }
        fetched = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=self.learn_outputs)
        total_loss, pi_loss, vf_loss, entropy, kl = fetched
        return total_loss, pi_loss, vf_loss, entropy, kl
class SACActor(parl.Model):
    """Actor with two independent 1-unit linear heads applied to the
    observation: one for the mean, one for the log standard deviation.
    """

    def __init__(self):
        self.mean_linear = layers.fc(size=1)
        self.log_std_linear = layers.fc(size=1)

    def policy(self, obs):
        """Return ``(means, log_std)`` for ``obs``."""
        means = self.mean_linear(obs)
        return means, self.log_std_linear(obs)
class SACCritic(parl.Model):
    """Critic with two 1-unit linear heads over the concatenation of
    observation and action.
    """

    def __init__(self):
        self.fc1 = layers.fc(size=1)
        self.fc2 = layers.fc(size=1)

    def value(self, obs, act):
        """Return ``(Q1, Q2)``, each with axis 1 squeezed."""
        obs_act = layers.concat([obs, act], axis=1)
        q1_raw = self.fc1(obs_act)
        q2_raw = self.fc2(obs_act)
        q1 = layers.squeeze(q1_raw, axes=[1])
        q2 = layers.squeeze(q2_raw, axes=[1])
        return q1, q2
class SACAgent(parl.Agent):
    """Agent exposing the algorithm's ``predict``, ``sample`` and ``learn``
    through three fluid programs.
    """

    def __init__(self, algorithm):
        super(SACAgent, self).__init__(algorithm)
        # Synchronise the target weights once at construction with decay=0.
        self.alg.sync_target(decay=0)

    def build_program(self):
        """Create the predict, sample and learn programs and define their
        inputs and outputs.
        """
        self.pred_program = fluid.Program()
        self.sample_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            pred_obs = layers.data(name='obs', shape=[4], dtype='float32')
            self.pred_act = self.alg.predict(pred_obs)

        with fluid.program_guard(self.sample_program):
            sample_obs = layers.data(name='obs', shape=[4], dtype='float32')
            # Only the first element returned by sample() is kept.
            self.sample_act, _ = self.alg.sample(sample_obs)

        with fluid.program_guard(self.learn_program):
            obs_in = layers.data(name='obs', shape=[4], dtype='float32')
            act_in = layers.data(name='act', shape=[1], dtype='float32')
            reward_in = layers.data(name='reward', shape=[], dtype='float32')
            next_obs_in = layers.data(
                name='next_obs', shape=[4], dtype='float32')
            terminal_in = layers.data(name='terminal', shape=[], dtype='bool')
            self.critic_cost, self.actor_cost = self.alg.learn(
                obs_in, act_in, reward_in, next_obs_in, terminal_in)

    def predict(self, obs):
        """Run the predict program on one observation and return the fetched
        action.
        """
        batched_obs = np.expand_dims(obs, axis=0)
        fetched = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batched_obs},
            fetch_list=[self.pred_act])
        return fetched[0]

    def sample(self, obs):
        """Run the sample program on one observation and return the fetched
        action.
        """
        batched_obs = np.expand_dims(obs, axis=0)
        fetched = self.fluid_executor.run(
            self.sample_program,
            feed={'obs': batched_obs},
            fetch_list=[self.sample_act])
        return fetched[0]

    def learn(self, obs, act, reward, next_obs, terminal):
        """Run one learning step and return the first element of the fetched
        critic cost and actor cost.
        """
        feed = {
            'obs': obs,
            'act': act,
            'reward': reward,
            'next_obs': next_obs,
            'terminal': terminal
        }
        fetched = self.fluid_executor.run(
            self.learn_program,
            feed=feed,
            fetch_list=[self.critic_cost, self.actor_cost])
        [critic_cost, actor_cost] = fetched
        return critic_cost[0], actor_cost[0]
class DDPGModel(parl.Model):
    """Model with a 1-unit policy layer and a 1-unit value layer."""

    def __init__(self):
        self.policy_fc = layers.fc(size=1)
        self.value_fc = layers.fc(size=1)

    def policy(self, obs):
        """Return the policy layer output for ``obs``."""
        return self.policy_fc(obs)

    def value(self, obs, act):
        """Return the value layer output for the concatenated ``obs`` and
        ``act``, with axis 1 squeezed.
        """
        obs_act = layers.concat([obs, act], axis=1)
        q_raw = self.value_fc(obs_act)
        return layers.squeeze(q_raw, axes=[1])

    def get_actor_params(self):
        """Return the first two entries of ``self.parameters()``."""
        # NOTE(review): presumably these are the weight and bias of
        # policy_fc -- confirm against the ordering of parameters().
        all_params = self.parameters()
        return all_params[:2]
class DDPGAgent(parl.Agent):
    """Agent exposing the algorithm's ``predict`` and ``learn`` through two
    fluid programs.
    """

    def __init__(self, algorithm):
        super(DDPGAgent, self).__init__(algorithm)
        # Synchronise the target weights once at construction with decay=0.
        self.alg.sync_target(decay=0)

    def build_program(self):
        """Create the predict and learn programs and define their inputs and
        outputs.
        """
        self.pred_program = fluid.Program()
        self.learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            pred_obs = layers.data(name='obs', shape=[4], dtype='float32')
            self.pred_act = self.alg.predict(pred_obs)

        with fluid.program_guard(self.learn_program):
            obs_in = layers.data(name='obs', shape=[4], dtype='float32')
            act_in = layers.data(name='act', shape=[1], dtype='float32')
            reward_in = layers.data(name='reward', shape=[], dtype='float32')
            next_obs_in = layers.data(
                name='next_obs', shape=[4], dtype='float32')
            terminal_in = layers.data(name='terminal', shape=[], dtype='bool')
            # Only the second value returned by learn() is kept.
            _, self.critic_cost = self.alg.learn(obs_in, act_in, reward_in,
                                                 next_obs_in, terminal_in)

    def predict(self, obs):
        """Run the predict program on one observation and return the fetched
        action.
        """
        batched_obs = np.expand_dims(obs, axis=0)
        fetched = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batched_obs},
            fetch_list=[self.pred_act])
        return fetched[0]

    def learn(self, obs, act, reward, next_obs, terminal):
        """Run one learning step, then call ``sync_target()`` on the
        algorithm, and return the fetched critic cost.
        """
        feed = {
            'obs': obs,
            'act': act,
            'reward': reward,
            'next_obs': next_obs,
            'terminal': terminal
        }
        fetched = self.fluid_executor.run(
            self.learn_program, feed=feed, fetch_list=[self.critic_cost])
        critic_cost = fetched[0]
        self.alg.sync_target()
        return critic_cost
class TD3Model(parl.Model):
    """Tiny TD3 model for the API test: one actor fc and two Q fc heads."""

    def __init__(self):
        # Keep this creation order: get_actor_params() slices parameters().
        self.actor_fc = layers.fc(size=1)
        self.q1 = layers.fc(size=1)
        self.q2 = layers.fc(size=1)

    def policy(self, obs):
        """Return the actor fc layer applied to ``obs``."""
        action = self.actor_fc(obs)
        return action

    def value(self, obs, act):
        """Return both Q heads for ``(obs, act)``, each squeezed on axis 1."""
        critic_input = layers.concat([obs, act], axis=1)
        first_q = layers.squeeze(self.q1(critic_input), axes=[1])
        second_q = layers.squeeze(self.q2(critic_input), axes=[1])
        return first_q, second_q

    def Q1(self, obs, act):
        """Return only the first Q head for ``(obs, act)``."""
        critic_input = layers.concat([obs, act], axis=1)
        first_q = self.q1(critic_input)
        return layers.squeeze(first_q, axes=[1])

    def get_actor_params(self):
        """Return the first two entries of ``self.parameters()``.

        Presumably these are the weight and bias of ``actor_fc`` -- confirm
        against ``parl.Model.parameters`` ordering.
        """
        actor_params = self.parameters()[:2]
        return actor_params
class TD3Agent(parl.Agent):
    """Agent with separate prediction, actor and critic programs for TD3."""

    def __init__(self, algorithm):
        super(TD3Agent, self).__init__(algorithm)
        # Initial target sync with decay=0.
        self.alg.sync_target(decay=0)

    def build_program(self):
        """Create the three programs and record their output variables."""
        self.pred_program = fluid.Program()
        self.actor_learn_program = fluid.Program()
        self.critic_learn_program = fluid.Program()

        with fluid.program_guard(self.pred_program):
            pred_obs = layers.data(name='obs', shape=[4], dtype='float32')
            self.pred_act = self.alg.predict(pred_obs)

        with fluid.program_guard(self.actor_learn_program):
            actor_obs = layers.data(name='obs', shape=[4], dtype='float32')
            self.actor_cost = self.alg.actor_learn(actor_obs)

        with fluid.program_guard(self.critic_learn_program):
            obs = layers.data(name='obs', shape=[4], dtype='float32')
            act = layers.data(name='act', shape=[1], dtype='float32')
            reward = layers.data(name='reward', shape=[], dtype='float32')
            next_obs = layers.data(
                name='next_obs', shape=[4], dtype='float32')
            terminal = layers.data(name='terminal', shape=[], dtype='bool')
            self.critic_cost = self.alg.critic_learn(obs, act, reward,
                                                     next_obs, terminal)

    def predict(self, obs):
        """Add a leading axis to ``obs`` and return the fetched action."""
        batched_obs = np.expand_dims(obs, axis=0)
        fetched = self.fluid_executor.run(
            self.pred_program,
            feed={'obs': batched_obs},
            fetch_list=[self.pred_act])
        return fetched[0]

    def learn(self, obs, act, reward, next_obs, terminal):
        """Run the critic step, then the actor step, then sync the target.

        Returns:
            tuple: ``(actor_cost, critic_cost)`` -- note the actor cost is
            first even though the critic program runs first.
        """
        critic_feed = {
            'obs': obs,
            'act': act,
            'reward': reward,
            'next_obs': next_obs,
            'terminal': terminal
        }
        critic_fetched = self.fluid_executor.run(
            self.critic_learn_program,
            feed=critic_feed,
            fetch_list=[self.critic_cost])
        critic_cost = critic_fetched[0]

        actor_fetched = self.fluid_executor.run(
            self.actor_learn_program,
            feed={'obs': obs},
            fetch_list=[self.actor_cost])
        actor_cost = actor_fetched[0]

        self.alg.sync_target()
        return actor_cost, critic_cost
class PARLtest(unittest.TestCase):
    """API smoke tests for the DQN, A3C, IMPALA, SAC, DDPG and TD3 agents.

    Every test only checks that the agent call runs without raising; the
    returned values are bound to locals but not asserted on.
    """

    def setUp(self):
        """Build one agent per algorithm before every test."""
        # set up DQN test
        DQN_model = DQNModel()
        DQN_alg = parl.algorithms.DQN(DQN_model, act_dim=2, gamma=0.9)
        self.DQN_agent = DQNAgent(DQN_alg)

        # set up A3C test
        A3C_model = A3CModel()
        A3C_alg = parl.algorithms.A3C(A3C_model, vf_loss_coeff=0.)
        self.A3C_agent = A3CAgent(A3C_alg)

        # set up IMPALA test
        IMPALA_model = IMPALAModel()
        IMPALA_alg = parl.algorithms.IMPALA(
            IMPALA_model,
            sample_batch_steps=4,
            gamma=0.9,
            vf_loss_coeff=0.,
            clip_rho_threshold=1.,
            clip_pg_rho_threshold=1.)
        self.IMPALA_agent = IMPALAAgent(IMPALA_alg)

        # set up SAC test
        SAC_actor = SACActor()
        SAC_critic = SACCritic()
        SAC_alg = parl.algorithms.SAC(
            SAC_actor,
            SAC_critic,
            max_action=1.,
            gamma=0.99,
            tau=0.005,
            actor_lr=1e-3,
            critic_lr=1e-3)
        self.SAC_agent = SACAgent(SAC_alg)

        # set up DDPG test
        DDPG_model = DDPGModel()
        DDPG_alg = parl.algorithms.DDPG(
            DDPG_model, gamma=0.99, tau=0.001, actor_lr=3e-4, critic_lr=3e-4)
        self.DDPG_agent = DDPGAgent(DDPG_alg)

        # set up TD3 test
        TD3_model = TD3Model()
        TD3_alg = parl.algorithms.TD3(
            TD3_model,
            1.,
            gamma=0.99,
            tau=0.005,
            actor_lr=3e-4,
            critic_lr=3e-4)
        self.TD3_agent = TD3Agent(TD3_alg)

    def test_DQN_predict(self):
        """Test APIs in PARL DQN predict
        """
        obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496])
        act = self.DQN_agent.predict(obs)

    def test_DQN_learn(self):
        """Test APIs in PARL DQN learn
        """
        obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496])
        next_obs = np.array([-0.02332638, -0.16414229, 0.01142936, 0.29949173])
        terminal = np.array([False]).astype('bool')
        reward = np.array([1.0]).astype('float32')
        act = np.array([0]).astype('int32')
        cost = self.DQN_agent.learn(obs, act, reward, next_obs, terminal)

    def test_A3C_predict(self):
        """Test APIs in PARL A3C predict
        """
        obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496])
        obs = np.expand_dims(obs, axis=0)
        logits = self.A3C_agent.predict(obs)

    def test_A3C_value(self):
        """Test APIs in PARL A3C value
        """
        obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496])
        obs = np.expand_dims(obs, axis=0)
        values = self.A3C_agent.value(obs)

    def test_A3C_learn(self):
        """Test APIs in PARL A3C learn
        """
        obs = np.array([[-0.02394919, 0.03114079, 0.01136446, 0.00324496]])
        action = np.array([0])
        advantages = np.array([-0.02332638])
        target_values = np.array([1.])
        self.A3C_agent.learn(obs, action, advantages, target_values)

    def test_IMPALA_predict(self):
        """Test APIs in PARL IMPALA predict
        """
        obs = np.array([[-0.02394919, 0.03114079, 0.01136446, 0.00324496]])
        policy = self.IMPALA_agent.predict(obs)

    def test_IMPALA_learn(self):
        """Test APIs in PARL IMPALA learn
        """
        obs = np.array([[-0.02394919, 0.03114079, 0.01136446, 0.00324496],
                        [-0.02394919, 0.03114079, 0.01136446, 0.00324496],
                        [-0.02394919, 0.03114079, 0.01136446, 0.00324496],
                        [-0.02394919, 0.03114079, 0.01136446,
                         0.00324496]]).astype('float32')
        actions = np.array([1, 1, 1, 1]).astype('int32')
        behaviour_logits = np.array([[-1, 1], [-1, 1], [-1, 1],
                                     [-1, 1]]).astype('float32')
        rewards = np.array([0, 0, 0, 0]).astype('float32')
        dones = np.array([False, False, False, False]).astype('float32')
        lr = 3e-4
        entropy_coeff = 0.
        total_loss, pi_loss, vf_loss, entropy, kl = self.IMPALA_agent.learn(
            obs, actions, behaviour_logits, rewards, dones, lr, entropy_coeff)

    def test_SAC_predict(self):
        """Test APIs in PARL SAC predict
        """
        obs = np.array([[-0.02394919, 0.03114079, 0.01136446,
                         0.00324496]]).astype(np.float32)
        act = self.SAC_agent.predict(obs)

    def test_SAC_sample(self):
        """Test APIs in PARL SAC sample
        """
        obs = np.array([[-0.02394919, 0.03114079, 0.01136446,
                         0.00324496]]).astype(np.float32)
        act = self.SAC_agent.sample(obs)

    def test_SAC_learn(self):
        """Test APIs in PARL SAC learn
        """
        obs = np.array([[-0.02394919, 0.03114079, 0.01136446,
                         0.00324496]]).astype(np.float32)
        next_obs = np.array(
            [[-0.02332638, -0.16414229, 0.01142936,
              0.29949173]]).astype(np.float32)
        terminal = np.array([False]).astype('bool')
        reward = np.array([1.0]).astype('float32')
        act = np.array([[0.]]).astype('float32')
        critic_cost, actor_cost = self.SAC_agent.learn(obs, act, reward,
                                                       next_obs, terminal)

    def test_DDPG_predict(self):
        """Test APIs in PARL DDPG predict
        """
        obs = np.array([[-0.02394919, 0.03114079, 0.01136446,
                         0.00324496]]).astype(np.float32)
        act = self.DDPG_agent.predict(obs)

    def test_DDPG_learn(self):
        """Test APIs in PARL DDPG learn
        """
        obs = np.array([[-0.02394919, 0.03114079, 0.01136446,
                         0.00324496]]).astype(np.float32)
        next_obs = np.array(
            [[-0.02332638, -0.16414229, 0.01142936,
              0.29949173]]).astype(np.float32)
        terminal = np.array([False]).astype('bool')
        reward = np.array([1.0]).astype('float32')
        act = np.array([[0.]]).astype('float32')
        # Exercise the DDPG agent itself (previously the SAC agent was called
        # here by mistake). DDPGAgent.learn returns only the critic cost.
        critic_cost = self.DDPG_agent.learn(obs, act, reward, next_obs,
                                            terminal)

    def test_TD3_predict(self):
        """Test APIs in PARL TD3 predict
        """
        obs = np.array([[-0.02394919, 0.03114079, 0.01136446,
                         0.00324496]]).astype(np.float32)
        act = self.TD3_agent.predict(obs)

    def test_TD3_learn(self):
        """Test APIs in PARL TD3 learn
        """
        obs = np.array([[-0.02394919, 0.03114079, 0.01136446,
                         0.00324496]]).astype(np.float32)
        next_obs = np.array(
            [[-0.02332638, -0.16414229, 0.01142936,
              0.29949173]]).astype(np.float32)
        terminal = np.array([False]).astype('bool')
        reward = np.array([1.0]).astype('float32')
        act = np.array([[0.]]).astype('float32')
        # TD3Agent.learn returns (actor_cost, critic_cost), in that order.
        actor_cost, critic_cost = self.TD3_agent.learn(obs, act, reward,
                                                       next_obs, terminal)
# Run every TestCase in this module when the file is executed as a script.
if __name__ == '__main__':
    unittest.main()
...@@ -16,5 +16,5 @@ from parl.algorithms.torch.ddqn import * ...@@ -16,5 +16,5 @@ from parl.algorithms.torch.ddqn import *
from parl.algorithms.torch.dqn import * from parl.algorithms.torch.dqn import *
from parl.algorithms.torch.a2c import * from parl.algorithms.torch.a2c import *
from parl.algorithms.torch.td3 import * from parl.algorithms.torch.td3 import *
from parl.algorithms.torch.coma import * from parl.algorithms.torch.ppo import *
from parl.algorithms.torch.policy_gradient import * from parl.algorithms.torch.policy_gradient import *
...@@ -27,7 +27,7 @@ __all__ = ['A2C'] ...@@ -27,7 +27,7 @@ __all__ = ['A2C']
class A2C(parl.Algorithm): class A2C(parl.Algorithm):
def __init__(self, model, config, hyperparas=None): def __init__(self, model, config):
assert isinstance(config['vf_loss_coeff'], (int, float)) assert isinstance(config['vf_loss_coeff'], (int, float))
self.model = model self.model = model
self.vf_loss_coeff = config['vf_loss_coeff'] self.vf_loss_coeff = config['vf_loss_coeff']
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.distributions import Normal
__all__ = ['PPO']
class PPO(parl.Algorithm):
    """Clipped-surrogate PPO with a Gaussian policy, optimized with Adam."""

    def __init__(self,
                 model,
                 clip_param,
                 value_loss_coef,
                 entropy_coef,
                 initial_lr,
                 eps=None,
                 max_grad_norm=None,
                 use_clipped_value_loss=True):
        """
        Args:
            model: network exposing ``policy(obs) -> (mean, log_std)``,
                ``value(obs)`` and ``parameters()``.
            clip_param (float): clipping range for the probability ratio and
                for the clipped value loss.
            value_loss_coef (float): weight of the value loss in the total loss.
            entropy_coef (float): weight of the entropy bonus in the total loss.
            initial_lr (float): learning rate passed to Adam.
            eps (float|None): Adam epsilon; ``None`` keeps Adam's own default.
            max_grad_norm (float|None): gradient-norm clipping threshold;
                ``None`` disables gradient clipping.
            use_clipped_value_loss (bool): whether to use the clipped value loss.
        """
        self.model = model
        self.clip_param = clip_param
        self.value_loss_coef = value_loss_coef
        self.entropy_coef = entropy_coef
        self.max_grad_norm = max_grad_norm
        self.use_clipped_value_loss = use_clipped_value_loss

        # Adam validates ``eps`` numerically, so forwarding ``None`` raises.
        # Only pass it through when the caller actually supplied a value.
        adam_kwargs = {'lr': initial_lr}
        if eps is not None:
            adam_kwargs['eps'] = eps
        self.optimizer = optim.Adam(model.parameters(), **adam_kwargs)

    def learn(self, obs_batch, actions_batch, value_preds_batch, return_batch,
              old_action_log_probs_batch, adv_targ):
        """Perform one PPO gradient step on a batch.

        Args:
            obs_batch: observations fed to the model.
            actions_batch: actions whose log-probability is re-evaluated.
            value_preds_batch: earlier value predictions; the clipped value
                loss keeps the new values within ``clip_param`` of these.
            return_batch: regression targets for the value function.
            old_action_log_probs_batch: log-probabilities of ``actions_batch``
                used as the denominator of the probability ratio.
            adv_targ: advantages multiplying the probability ratio.

        Returns:
            tuple: ``(value_loss, action_loss, dist_entropy)`` as Python floats.
        """
        values = self.model.value(obs_batch)
        mean, log_std = self.model.policy(obs_batch)
        dist = Normal(mean, log_std.exp())

        # Sum per-dimension log-probs over the last (action) axis.
        action_log_probs = dist.log_prob(actions_batch).sum(-1, keepdim=True)
        dist_entropy = dist.entropy().sum(-1).mean()

        ratio = torch.exp(action_log_probs - old_action_log_probs_batch)
        surr1 = ratio * adv_targ
        surr2 = torch.clamp(ratio, 1.0 - self.clip_param,
                            1.0 + self.clip_param) * adv_targ
        action_loss = -torch.min(surr1, surr2).mean()

        if self.use_clipped_value_loss:
            value_pred_clipped = value_preds_batch + \
                (values - value_preds_batch).clamp(-self.clip_param, self.clip_param)
            value_losses = (values - return_batch).pow(2)
            value_losses_clipped = (value_pred_clipped - return_batch).pow(2)
            value_loss = 0.5 * torch.max(value_losses,
                                         value_losses_clipped).mean()
        else:
            value_loss = 0.5 * (return_batch - values).pow(2).mean()

        self.optimizer.zero_grad()
        (value_loss * self.value_loss_coef + action_loss -
         dist_entropy * self.entropy_coef).backward()
        # clip_grad_norm_ cannot handle ``None``; the default means "no clipping".
        if self.max_grad_norm is not None:
            nn.utils.clip_grad_norm_(self.model.parameters(),
                                     self.max_grad_norm)
        self.optimizer.step()

        return value_loss.item(), action_loss.item(), dist_entropy.item()

    def sample(self, obs):
        """Sample an action from the Gaussian policy.

        Returns:
            tuple: ``(value, action, action_log_probs)`` for ``obs``.
        """
        value = self.model.value(obs)
        mean, log_std = self.model.policy(obs)
        dist = Normal(mean, log_std.exp())
        action = dist.sample()
        action_log_probs = dist.log_prob(action).sum(-1, keepdim=True)
        return value, action, action_log_probs

    def predict(self, obs):
        """Return the policy mean for ``obs`` (no sampling)."""
        mean, _ = self.model.policy(obs)
        return mean

    def value(self, obs):
        """Return the model's value estimate for ``obs``."""
        return self.model.value(obs)
...@@ -15,9 +15,9 @@ ...@@ -15,9 +15,9 @@
import warnings import warnings
warnings.simplefilter('default') warnings.simplefilter('default')
import os
import paddle.fluid as fluid import paddle.fluid as fluid
from parl.core.fluid import layers from parl.core.fluid import layers
from parl.utils.deprecation import deprecated
from parl.core.agent_base import AgentBase from parl.core.agent_base import AgentBase
from parl.core.fluid.algorithm import Algorithm from parl.core.fluid.algorithm import Algorithm
from parl.utils import machine_info from parl.utils import machine_info
...@@ -46,7 +46,6 @@ class Agent(AgentBase): ...@@ -46,7 +46,6 @@ class Agent(AgentBase):
This class will initialize the neural network parameters automatically, and provides an executor for users to run the programs (self.fluid_executor). This class will initialize the neural network parameters automatically, and provides an executor for users to run the programs (self.fluid_executor).
Attributes: Attributes:
gpu_id (int): deprecated. specify which GPU to be used. -1 if to use the CPU.
fluid_executor (fluid.Executor): executor for running programs of the agent. fluid_executor (fluid.Executor): executor for running programs of the agent.
alg (parl.algorithm): algorithm of this agent. alg (parl.algorithm): algorithm of this agent.
...@@ -65,18 +64,12 @@ class Agent(AgentBase): ...@@ -65,18 +64,12 @@ class Agent(AgentBase):
""" """
def __init__(self, algorithm, gpu_id=None): def __init__(self, algorithm):
"""Build programs by calling the method ``self.build_program()`` and run initialization function of ``fluid.default_startup_program()``. """Build programs by calling the method ``self.build_program()`` and run initialization function of ``fluid.default_startup_program()``.
Args: Args:
algorithm (parl.Algorithm): an instance of `parl.Algorithm`. This algorithm is then passed to `self.alg`. algorithm (parl.Algorithm): an instance of `parl.Algorithm`. This algorithm is then passed to `self.alg`.
gpu_id (int): deprecated. specify which GPU to be used. -1 if to use the CPU.
""" """
if gpu_id is not None:
warnings.warn(
"the `gpu_id` argument of `__init__` function in `parl.Agent` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
assert isinstance(algorithm, Algorithm) assert isinstance(algorithm, Algorithm)
super(Agent, self).__init__(algorithm) super(Agent, self).__init__(algorithm)
...@@ -119,26 +112,6 @@ class Agent(AgentBase): ...@@ -119,26 +112,6 @@ class Agent(AgentBase):
""" """
raise NotImplementedError raise NotImplementedError
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='get_weights')
def get_params(self):
""" Returns a Python dictionary containing the whole parameters of self.alg.
Returns:
a Python List containing the parameters of self.alg.
"""
return self.algorithm.get_params()
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='set_weights')
def set_params(self, params):
"""Copy parameters from ``get_params()`` into this agent.
Args:
params(dict): a Python List containing the parameters of self.alg.
"""
self.algorithm.set_params(params)
def learn(self, *args, **kwargs): def learn(self, *args, **kwargs):
"""The training interface for ``Agent``. """The training interface for ``Agent``.
This function feeds the training data into the learn_program defined in ``build_program()``. This function feeds the training data into the learn_program defined in ``build_program()``.
...@@ -180,8 +153,8 @@ class Agent(AgentBase): ...@@ -180,8 +153,8 @@ class Agent(AgentBase):
""" """
if program is None: if program is None:
program = self.learn_program program = self.learn_program
dirname = '/'.join(save_path.split('/')[:-1]) dirname = os.sep.join(save_path.split(os.sep)[:-1])
filename = save_path.split('/')[-1] filename = save_path.split(os.sep)[-1]
fluid.io.save_params( fluid.io.save_params(
executor=self.fluid_executor, executor=self.fluid_executor,
dirname=dirname, dirname=dirname,
...@@ -214,8 +187,8 @@ class Agent(AgentBase): ...@@ -214,8 +187,8 @@ class Agent(AgentBase):
program = self.learn_program program = self.learn_program
if type(program) is fluid.compiler.CompiledProgram: if type(program) is fluid.compiler.CompiledProgram:
program = program._init_program program = program._init_program
dirname = '/'.join(save_path.split('/')[:-1]) dirname = os.sep.join(save_path.split(os.sep)[:-1])
filename = save_path.split('/')[-1] filename = save_path.split(os.sep)[-1]
fluid.io.load_params( fluid.io.load_params(
executor=self.fluid_executor, executor=self.fluid_executor,
dirname=dirname, dirname=dirname,
......
...@@ -17,7 +17,6 @@ warnings.simplefilter('default') ...@@ -17,7 +17,6 @@ warnings.simplefilter('default')
from parl.core.algorithm_base import AlgorithmBase from parl.core.algorithm_base import AlgorithmBase
from parl.core.fluid.model import Model from parl.core.fluid.model import Model
from parl.utils.deprecation import deprecated
__all__ = ['Algorithm'] __all__ = ['Algorithm']
...@@ -57,47 +56,13 @@ class Algorithm(AlgorithmBase): ...@@ -57,47 +56,13 @@ class Algorithm(AlgorithmBase):
""" """
def __init__(self, model=None, hyperparas=None): def __init__(self, model=None):
""" """
Args: Args:
model(``parl.Model``): a neural network that represents a policy or a Q-value function. model(``parl.Model``): a neural network that represents a policy or a Q-value function.
hyperparas(dict): a dict storing the hyper-parameters relative to training.
""" """
if model is not None:
warnings.warn(
"the `model` argument of `__init__` function in `parl.Algorithm` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
assert isinstance(model, Model) assert isinstance(model, Model)
self.model = model self.model = model
if hyperparas is not None:
warnings.warn(
"the `hyperparas` argument of `__init__` function in `parl.Algorithm` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
self.hp = hyperparas
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='get_weights')
def get_params(self):
""" Get parameters of self.model.
Returns:
params(dict): a Python List containing the parameters of self.model.
"""
return self.model.get_params()
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='set_weights')
def set_params(self, params):
""" Set parameters from ``get_params`` to the model.
Args:
params(dict ): a Python List containing the parameters of self.model.
"""
self.model.set_params(params)
def learn(self, *args, **kwargs): def learn(self, *args, **kwargs):
""" Define the loss function and create an optimizer to minimize the loss. """ Define the loss function and create an optimizer to minimize the loss.
......
...@@ -45,7 +45,7 @@ class TestParamSharing(unittest.TestCase): ...@@ -45,7 +45,7 @@ class TestParamSharing(unittest.TestCase):
dict_size = 100 dict_size = 100
input_cx = np.random.uniform(0, 1, [batch_size, 100]).astype("float32") input_cx = np.random.uniform(0, 1, [batch_size, 100]).astype("float32")
input_x = np.random.randint( input_x = np.random.randint(
dict_size, size=(batch_size, 1)).astype("int") dict_size, size=(batch_size, 1)).astype("int64")
################################# #################################
main_program1 = fluid.Program() main_program1 = fluid.Program()
...@@ -59,7 +59,7 @@ class TestParamSharing(unittest.TestCase): ...@@ -59,7 +59,7 @@ class TestParamSharing(unittest.TestCase):
main_program2 = fluid.Program() main_program2 = fluid.Program()
with fluid.program_guard(main_program2): with fluid.program_guard(main_program2):
x_ = layers.data(name='x', shape=[1], dtype="int") x_ = layers.data(name='x', shape=[1], dtype="int64")
cx_ = layers.cast( cx_ = layers.cast(
x=layers.one_hot(input=x_, depth=dict_size), dtype="float32") x=layers.one_hot(input=x_, depth=dict_size), dtype="float32")
y1_ = net.fc1(input=cx_) y1_ = net.fc1(input=cx_)
......
...@@ -17,7 +17,6 @@ import paddle.fluid as fluid ...@@ -17,7 +17,6 @@ import paddle.fluid as fluid
from parl.core.fluid.layers.layer_wrappers import LayerFunc from parl.core.fluid.layers.layer_wrappers import LayerFunc
from parl.core.fluid.plutils import * from parl.core.fluid.plutils import *
from parl.core.model_base import ModelBase from parl.core.model_base import ModelBase
from parl.utils.deprecation import deprecated
from parl.utils import machine_info from parl.utils import machine_info
__all__ = ['Model'] __all__ = ['Model']
...@@ -67,30 +66,6 @@ class Model(ModelBase): ...@@ -67,30 +66,6 @@ class Model(ModelBase):
""" """
@deprecated(
deprecated_in='1.2',
removed_in='1.3',
replace_function='sync_weights_to')
def sync_params_to(self,
target_net,
gpu_id=None,
decay=0.0,
share_vars_parallel_executor=None):
"""Synchronize parameters in the model to another model (target_net).
target_net_weights = decay * target_net_weights + (1 - decay) * source_net_weights
Args:
target_model (`parl.Model`): an instance of ``Model`` that has the same neural network architecture as the current model.
decay (float): the rate of decline in copying parameters. 0 if no parameters decay when synchronizing the parameters.
share_vars_parallel_executor (fluid.ParallelExecutor): Optional. If not None, will use fluid.ParallelExecutor
to run program instead of fluid.Executor
"""
self.sync_weights_to(
target_model=target_net,
decay=decay,
share_vars_parallel_executor=share_vars_parallel_executor)
def sync_weights_to(self, def sync_weights_to(self,
target_model, target_model,
decay=0.0, decay=0.0,
...@@ -181,21 +156,6 @@ class Model(ModelBase): ...@@ -181,21 +156,6 @@ class Model(ModelBase):
else: else:
self._cached_fluid_executor.run(fetch_list=[]) self._cached_fluid_executor.run(fetch_list=[])
@property
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='parameters')
def parameter_names(self):
"""Get names of all parameters in this ``Model``.
Only parameters created by ``parl.layers`` are included.
The order of parameter names is consistent among
different instances of the same `Model`.
Returns:
param_names(list): list of string containing parameter names of all parameters.
"""
return self.parameters()
def parameters(self): def parameters(self):
"""Get names of all parameters in this ``Model``. """Get names of all parameters in this ``Model``.
...@@ -223,26 +183,6 @@ class Model(ModelBase): ...@@ -223,26 +183,6 @@ class Model(ModelBase):
self._parameter_names = self._get_parameter_names(self) self._parameter_names = self._get_parameter_names(self)
return self._parameter_names return self._parameter_names
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='get_weights')
def get_params(self):
""" Return a Python list containing parameters of current model.
Returns:
parameters: a Python list containing parameters of the current model.
"""
return self.get_weights()
@deprecated(
deprecated_in='1.2', removed_in='1.3', replace_function='set_weights')
def set_params(self, params, gpu_id=None):
"""Set parameters in the model with params.
Args:
params (List): List of numpy array .
"""
self.set_weights(weights=params)
def get_weights(self): def get_weights(self):
"""Returns a Python list containing parameters of current model. """Returns a Python list containing parameters of current model.
......
...@@ -46,8 +46,8 @@ class TestAlgorithm(parl.Algorithm): ...@@ -46,8 +46,8 @@ class TestAlgorithm(parl.Algorithm):
class TestAgent(parl.Agent): class TestAgent(parl.Agent):
def __init__(self, algorithm, gpu_id=None): def __init__(self, algorithm):
super(TestAgent, self).__init__(algorithm, gpu_id) super(TestAgent, self).__init__(algorithm)
def build_program(self): def build_program(self):
self.predict_program = fluid.Program() self.predict_program = fluid.Program()
...@@ -92,8 +92,8 @@ class AgentBaseTest(unittest.TestCase): ...@@ -92,8 +92,8 @@ class AgentBaseTest(unittest.TestCase):
agent = TestAgent(self.algorithm) agent = TestAgent(self.algorithm)
obs = np.random.random([3, 10]).astype('float32') obs = np.random.random([3, 10]).astype('float32')
output_np = agent.predict(obs) output_np = agent.predict(obs)
save_path1 = './model.ckpt' save_path1 = 'model.ckpt'
save_path2 = './my_model/model-2.ckpt' save_path2 = os.path.join('my_model', 'model-2.ckpt')
agent.save(save_path1) agent.save(save_path1)
agent.save(save_path2) agent.save(save_path2)
self.assertTrue(os.path.exists(save_path1)) self.assertTrue(os.path.exists(save_path1))
...@@ -103,7 +103,7 @@ class AgentBaseTest(unittest.TestCase): ...@@ -103,7 +103,7 @@ class AgentBaseTest(unittest.TestCase):
agent = TestAgent(self.algorithm) agent = TestAgent(self.algorithm)
obs = np.random.random([3, 10]).astype('float32') obs = np.random.random([3, 10]).astype('float32')
output_np = agent.predict(obs) output_np = agent.predict(obs)
save_path1 = './model.ckpt' save_path1 = 'model.ckpt'
previous_output = agent.predict(obs) previous_output = agent.predict(obs)
agent.save(save_path1) agent.save(save_path1)
agent.restore(save_path1) agent.restore(save_path1)
...@@ -121,7 +121,7 @@ class AgentBaseTest(unittest.TestCase): ...@@ -121,7 +121,7 @@ class AgentBaseTest(unittest.TestCase):
agent.learn_program = parl.compile(agent.learn_program) agent.learn_program = parl.compile(agent.learn_program)
obs = np.random.random([3, 10]).astype('float32') obs = np.random.random([3, 10]).astype('float32')
previous_output = agent.predict(obs) previous_output = agent.predict(obs)
save_path1 = './model.ckpt' save_path1 = 'model.ckpt'
agent.save(save_path1) agent.save(save_path1)
agent.restore(save_path1) agent.restore(save_path1)
......
...@@ -690,6 +690,43 @@ class ModelBaseTest(unittest.TestCase): ...@@ -690,6 +690,43 @@ class ModelBaseTest(unittest.TestCase):
self.executor.run( self.executor.run(
pred_program, feed={'obs': x}, fetch_list=[model_output]) pred_program, feed={'obs': x}, fetch_list=[model_output])
def test_get_weights_set_weights_with_create_parameter(self):
    """Two TestModel2 nets must differ at first and agree after set_weights."""
    model1 = TestModel2()
    model2 = TestModel2()

    pred_program = fluid.Program()
    with fluid.program_guard(pred_program):
        obs = layers.data(name='obs', shape=[100], dtype='float32')
        model1_output = model1.predict(obs)
        model2_output = model2.predict(obs)

    self.executor.run(fluid.default_startup_program())

    N = 10

    def run_both_models(sample):
        # Feed a single sample with a leading axis added to both models.
        x = np.expand_dims(sample, axis=0)
        return self.executor.run(
            pred_program,
            feed={'obs': x},
            fetch_list=[model1_output, model2_output])

    # Before copying weights the two models should produce different sums.
    random_obs = np.random.random(size=(N, 100)).astype('float32')
    for sample in random_obs:
        first_out, second_out = run_both_models(sample)
        self.assertNotEqual(
            np.sum(first_out.flatten()), np.sum(second_out.flatten()))

    # pass parameters of self.model to model2
    weights = model1.get_weights()
    model2.set_weights(weights)

    # After copying, both models should produce identical sums.
    random_obs = np.random.random(size=(N, 100)).astype('float32')
    for sample in random_obs:
        first_out, second_out = run_both_models(sample)
        self.assertEqual(
            np.sum(first_out.flatten()), np.sum(second_out.flatten()))
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
...@@ -113,8 +113,9 @@ class Agent(AgentBase): ...@@ -113,8 +113,9 @@ class Agent(AgentBase):
""" """
if model is None: if model is None:
model = self.algorithm.model model = self.algorithm.model
dirname = '/'.join(save_path.split('/')[:-1]) sep = os.sep
if not os.path.exists(dirname): dirname = sep.join(save_path.split(sep)[:-1])
if dirname != '' and not os.path.exists(dirname):
os.makedirs(dirname) os.makedirs(dirname)
torch.save(model.state_dict(), save_path) torch.save(model.state_dict(), save_path)
......
...@@ -77,8 +77,8 @@ class AgentBaseTest(unittest.TestCase): ...@@ -77,8 +77,8 @@ class AgentBaseTest(unittest.TestCase):
def test_save(self): def test_save(self):
agent = TestAgent(self.alg) agent = TestAgent(self.alg)
obs = torch.randn(3, 10) obs = torch.randn(3, 10)
save_path1 = './model.ckpt' save_path1 = 'model.ckpt'
save_path2 = './my_model/model-2.ckpt' save_path2 = os.path.join('my_model', 'model-2.ckpt')
agent.save(save_path1) agent.save(save_path1)
agent.save(save_path2) agent.save(save_path2)
self.assertTrue(os.path.exists(save_path1)) self.assertTrue(os.path.exists(save_path1))
...@@ -88,7 +88,7 @@ class AgentBaseTest(unittest.TestCase): ...@@ -88,7 +88,7 @@ class AgentBaseTest(unittest.TestCase):
agent = TestAgent(self.alg) agent = TestAgent(self.alg)
obs = torch.randn(3, 10) obs = torch.randn(3, 10)
output = agent.predict(obs) output = agent.predict(obs)
save_path1 = './model.ckpt' save_path1 = 'model.ckpt'
previous_output = agent.predict(obs).detach().cpu().numpy() previous_output = agent.predict(obs).detach().cpu().numpy()
agent.save(save_path1) agent.save(save_path1)
agent.restore(save_path1) agent.restore(save_path1)
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Deprecation shim: warn on import, then re-export the parl.core.fluid modules.
import warnings
# Install the 'default' action for all warnings so the DeprecationWarning
# below is shown. NOTE(review): this alters the process-wide warning filters.
warnings.simplefilter('default')
# stacklevel=2 attributes the warning to the caller's frame, not this module.
warnings.warn(
"import way `import parl.framework` is deprecated since version 1.2 and will be removed in version 1.3.",
DeprecationWarning,
stacklevel=2)
# Re-export the public names of the fluid model, algorithm and agent modules.
from parl.core.fluid.model import *
from parl.core.fluid.algorithm import *
from parl.core.fluid.agent import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Deprecation shim: warn on import, then re-export the fluid implementation.
import warnings
# Install the 'default' action for all warnings so the DeprecationWarning
# below is shown. NOTE(review): this alters the process-wide warning filters.
warnings.simplefilter('default')
# stacklevel=2 attributes the warning to the caller's frame, not this module.
warnings.warn(
"module `parl.framework.policy_distribution` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.policy_distribution` instead.",
DeprecationWarning,
stacklevel=2)
# Re-export the public names of the fluid policy_distribution module.
from parl.core.fluid.policy_distribution import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Deprecation shim: warn on import, then re-export the fluid layers package.
import warnings
# Install the 'default' action for all warnings so the DeprecationWarning
# below is shown. NOTE(review): this alters the process-wide warning filters.
warnings.simplefilter('default')
# stacklevel=2 attributes the warning to the caller's frame, not this module.
warnings.warn(
"import way `import parl.layers` is deprecated since version 1.2 and will be removed in version 1.3, please use `from parl import layers` or `import parl; parl.layers` instead.",
DeprecationWarning,
stacklevel=2)
# Re-export the public names of the fluid layers package.
from parl.core.fluid.layers import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Deprecation shim: print a notice on import, then re-export plutils.common.
# NOTE(review): the notice goes to stdout via print() instead of being raised
# as a DeprecationWarning through warnings.warn(), so it cannot be filtered.
print(
"import way `import parl.plutils` is deprecated since version 1.2 and will be removed in version 1.3, please use `from parl import plutils` or `import parl; parl.plutils` instead."
)
# Re-export the public names of the fluid plutils.common module.
from parl.core.fluid.plutils.common import *
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Backward-compatibility shim: prints a deprecation notice for the old
# `parl.plutils` import path to stdout at import time, then re-exports the
# public names of `parl.core.fluid.plutils.common`.
# NOTE(review): the message is identical to the one printed by the other
# plutils shim, so importing both may print the notice twice - confirm.
print(
"import way `import parl.plutils` is deprecated since version 1.2 and will be removed in version 1.3, please use `from parl import plutils` or `import parl; parl.plutils` instead."
)
# Re-export everything so code written against the old path keeps working.
from parl.core.fluid.plutils.common import *
...@@ -59,6 +59,7 @@ class Client(object): ...@@ -59,6 +59,7 @@ class Client(object):
self.heartbeat_socket_initialized = threading.Event() self.heartbeat_socket_initialized = threading.Event()
self.master_is_alive = True self.master_is_alive = True
self.client_is_alive = True self.client_is_alive = True
self.log_monitor_url = None
self.executable_path = self.get_executable_path() self.executable_path = self.get_executable_path()
...@@ -105,9 +106,19 @@ class Client(object): ...@@ -105,9 +106,19 @@ class Client(object):
for file in distributed_files: for file in distributed_files:
assert os.path.exists(file) assert os.path.exists(file)
assert not os.path.isabs(
file
), "[XPARL] Please do not distribute a file with absolute path."
with open(file, 'rb') as f: with open(file, 'rb') as f:
content = f.read() content = f.read()
pyfiles['other_files'][file] = content pyfiles['other_files'][file] = content
# append entry file to code list
main_file = sys.argv[0]
with open(main_file, 'rb') as code_file:
code = code_file.read()
# parl/remote/remote_decorator.py -> remote_decorator.py
file_name = main_file.split(os.sep)[-1]
pyfiles['python_files'][file_name] = code
except AssertionError as e: except AssertionError as e:
raise Exception( raise Exception(
'Failed to create the client, the file {} does not exist.'. 'Failed to create the client, the file {} does not exist.'.
...@@ -132,14 +143,19 @@ class Client(object): ...@@ -132,14 +143,19 @@ class Client(object):
thread.start() thread.start()
self.heartbeat_socket_initialized.wait() self.heartbeat_socket_initialized.wait()
self.client_id = self.reply_master_heartbeat_address.replace(':', '_') + \
'_' + str(int(time.time()))
# check if the master is connected properly # check if the master is connected properly
try: try:
self.submit_job_socket.send_multipart([ self.submit_job_socket.send_multipart([
remote_constants.CLIENT_CONNECT_TAG, remote_constants.CLIENT_CONNECT_TAG,
to_byte(self.heartbeat_master_address), to_byte(self.reply_master_heartbeat_address),
to_byte(socket.gethostname()) to_byte(socket.gethostname()),
to_byte(self.client_id),
]) ])
_ = self.submit_job_socket.recv_multipart() message = self.submit_job_socket.recv_multipart()
self.log_monitor_url = to_str(message[1])
except zmq.error.Again as e: except zmq.error.Again as e:
logger.warning("[Client] Can not connect to the master, please " logger.warning("[Client] Can not connect to the master, please "
"check if master is started and ensure the input " "check if master is started and ensure the input "
...@@ -150,17 +166,18 @@ class Client(object): ...@@ -150,17 +166,18 @@ class Client(object):
"address {} is correct.".format(master_address)) "address {} is correct.".format(master_address))
def _reply_heartbeat(self): def _reply_heartbeat(self):
"""Reply heartbeat signals to the specific node.""" """Reply heartbeat signals to the master node."""
socket = self.ctx.socket(zmq.REP) socket = self.ctx.socket(zmq.REP)
socket.linger = 0 socket.linger = 0
socket.setsockopt(zmq.RCVTIMEO, socket.setsockopt(zmq.RCVTIMEO,
remote_constants.HEARTBEAT_RCVTIMEO_S * 1000) remote_constants.HEARTBEAT_RCVTIMEO_S * 1000)
heartbeat_master_port =\ reply_master_heartbeat_port =\
socket.bind_to_random_port(addr="tcp://*") socket.bind_to_random_port(addr="tcp://*")
self.heartbeat_master_address = "{}:{}".format(get_ip_address(), self.reply_master_heartbeat_address = "{}:{}".format(
heartbeat_master_port) get_ip_address(), reply_master_heartbeat_port)
self.heartbeat_socket_initialized.set() self.heartbeat_socket_initialized.set()
connected = False
while self.client_is_alive and self.master_is_alive: while self.client_is_alive and self.master_is_alive:
try: try:
message = socket.recv_multipart() message = socket.recv_multipart()
...@@ -170,11 +187,18 @@ class Client(object): ...@@ -170,11 +187,18 @@ class Client(object):
remote_constants.HEARTBEAT_TAG, remote_constants.HEARTBEAT_TAG,
to_byte(self.executable_path), to_byte(self.executable_path),
to_byte(str(self.actor_num)), to_byte(str(self.actor_num)),
to_byte(str(elapsed_time)) to_byte(str(elapsed_time)),
]) to_byte(str(self.log_monitor_url)),
]) # TODO: remove additional information
except zmq.error.Again as e: except zmq.error.Again as e:
if connected:
logger.warning("[Client] Cannot connect to the master." logger.warning("[Client] Cannot connect to the master."
"Please check if it is still alive.") "Please check if it is still alive.")
else:
logger.warning(
"[Client] Cannot connect to the master."
"Please check the firewall between client and master.(e.g., ping the master IP)"
)
self.master_is_alive = False self.master_is_alive = False
socket.close(0) socket.close(0)
logger.warning("Client exit replying heartbeat for master.") logger.warning("Client exit replying heartbeat for master.")
...@@ -182,7 +206,7 @@ class Client(object): ...@@ -182,7 +206,7 @@ class Client(object):
def _check_and_monitor_job(self, job_heartbeat_address, def _check_and_monitor_job(self, job_heartbeat_address,
ping_heartbeat_address, max_memory): ping_heartbeat_address, max_memory):
""" Sometimes the client may receive a job that is dead, thus """ Sometimes the client may receive a job that is dead, thus
we have to check if this job is still alive before sending it to the actor. we have to check if this job is still alive before adding it to the `actor_num`.
""" """
# job_heartbeat_socket: sends heartbeat signal to job # job_heartbeat_socket: sends heartbeat signal to job
job_heartbeat_socket = self.ctx.socket(zmq.REQ) job_heartbeat_socket = self.ctx.socket(zmq.REQ)
...@@ -271,7 +295,8 @@ class Client(object): ...@@ -271,7 +295,8 @@ class Client(object):
self.lock.acquire() self.lock.acquire()
self.submit_job_socket.send_multipart([ self.submit_job_socket.send_multipart([
remote_constants.CLIENT_SUBMIT_TAG, remote_constants.CLIENT_SUBMIT_TAG,
to_byte(self.heartbeat_master_address) to_byte(self.reply_master_heartbeat_address),
to_byte(self.client_id),
]) ])
message = self.submit_job_socket.recv_multipart() message = self.submit_job_socket.recv_multipart()
self.lock.release() self.lock.release()
...@@ -326,9 +351,10 @@ def connect(master_address, distributed_files=[]): ...@@ -326,9 +351,10 @@ def connect(master_address, distributed_files=[]):
Exception: An exception is raised if the master node is not started. Exception: An exception is raised if the master node is not started.
""" """
assert len(master_address.split(":")) == 2, "please input address in " +\ assert len(master_address.split(":")) == 2, "Please input address in " +\
"{ip}:{port} format" "{ip}:{port} format"
global GLOBAL_CLIENT global GLOBAL_CLIENT
addr = master_address.split(":")[0]
cur_process_id = os.getpid() cur_process_id = os.getpid()
if GLOBAL_CLIENT is None: if GLOBAL_CLIENT is None:
GLOBAL_CLIENT = Client(master_address, cur_process_id, GLOBAL_CLIENT = Client(master_address, cur_process_id,
...@@ -337,6 +363,8 @@ def connect(master_address, distributed_files=[]): ...@@ -337,6 +363,8 @@ def connect(master_address, distributed_files=[]):
if GLOBAL_CLIENT.process_id != cur_process_id: if GLOBAL_CLIENT.process_id != cur_process_id:
GLOBAL_CLIENT = Client(master_address, cur_process_id, GLOBAL_CLIENT = Client(master_address, cur_process_id,
distributed_files) distributed_files)
logger.info("Remote actors log url: {}".format(
GLOBAL_CLIENT.log_monitor_url))
def get_global_client(): def get_global_client():
...@@ -366,5 +394,5 @@ def disconnect(): ...@@ -366,5 +394,5 @@ def disconnect():
GLOBAL_CLIENT = None GLOBAL_CLIENT = None
else: else:
logger.info( logger.info(
"No client to be released. Please make sure that you have call `parl.connect`" "No client to be released. Please make sure that you have called `parl.connect`"
) )
...@@ -28,7 +28,8 @@ class ClusterMonitor(object): ...@@ -28,7 +28,8 @@ class ClusterMonitor(object):
def __init__(self): def __init__(self):
self.status = { self.status = {
'workers': defaultdict(dict), 'workers': defaultdict(dict),
'clients': defaultdict(dict) 'clients': defaultdict(dict),
'client_jobs': defaultdict(dict),
} }
self.lock = threading.Lock() self.lock = threading.Lock()
...@@ -46,6 +47,11 @@ class ClusterMonitor(object): ...@@ -46,6 +47,11 @@ class ClusterMonitor(object):
worker_status['hostname'] = hostname worker_status['hostname'] = hostname
self.lock.release() self.lock.release()
def add_client_job(self, client_id, job_info):
    """Record jobs that have been allocated to a client.

    Args:
        client_id (str): key of the client inside ``status['client_jobs']``.
        job_info (dict): entries merged into that client's job table with
            ``dict.update``; the master passes
            ``{job_id: log_server_address}``.
    """
    # Use the lock as a context manager so it is released even when
    # ``update`` raises (e.g. on a malformed ``job_info``); with a bare
    # acquire()/release() pair such an error would leave the lock held and
    # block every later status update.
    with self.lock:
        self.status['client_jobs'][client_id].update(job_info)
def update_client_status(self, client_status, client_address, def update_client_status(self, client_status, client_address,
client_hostname): client_hostname):
"""Update client status with message send from client heartbeat. """Update client status with message send from client heartbeat.
...@@ -61,7 +67,8 @@ class ClusterMonitor(object): ...@@ -61,7 +67,8 @@ class ClusterMonitor(object):
'client_address': client_hostname, 'client_address': client_hostname,
'file_path': to_str(client_status[1]), 'file_path': to_str(client_status[1]),
'actor_num': int(to_str(client_status[2])), 'actor_num': int(to_str(client_status[2])),
'time': to_str(client_status[3]) 'time': to_str(client_status[3]),
'log_monitor_url': to_str(client_status[4]),
} }
self.lock.release() self.lock.release()
...@@ -96,13 +103,14 @@ class ClusterMonitor(object): ...@@ -96,13 +103,14 @@ class ClusterMonitor(object):
self.status['workers'].pop(worker_address) self.status['workers'].pop(worker_address)
self.lock.release() self.lock.release()
def drop_cluster_status(self, client_address): def drop_client_status(self, client_address):
"""Drop cluster status when it exits. """Drop client status when it exits.
Args: Args:
cluster_address (str): IP address of the exited client. client_address (str): IP address of the exited client.
""" """
self.lock.acquire() self.lock.acquire()
if client_address in self.status['clients']:
self.status['clients'].pop(client_address) self.status['clients'].pop(client_address)
self.lock.release() self.lock.release()
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -11,14 +11,27 @@ ...@@ -11,14 +11,27 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
"""
This file is used to fix the problem that cloudpickle cannot load some packages normally in Mac OS.
We hack the problem by trying load these packages in the main module in advance.
import warnings Template:
warnings.simplefilter('default') try:
import [PACKAGE1]
except ImportError:
pass
warnings.warn( try:
"module `parl.framework.algorithm_base.Algorithm` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.Algorithm` instead.", import [PACKAGE2]
DeprecationWarning, except ImportError:
stacklevel=2) pass
from parl.core.fluid.algorithm import * """
# NOTE(review): _IS_MAC is presumably a flag that is true on macOS - confirm
# in parl.utils.
from parl.utils import _IS_MAC

if _IS_MAC:
    # Import the package in the main module ahead of time so that cloudpickle
    # can load it later; this is only attempted when _IS_MAC is set.
    try:
        import rlschool
    except ImportError:
        # The package is optional: nothing to pre-load when it is missing.
        pass
...@@ -12,6 +12,9 @@ ...@@ -12,6 +12,9 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# Work around known cloudpickle compatibility problems.
import compatible_trick
import os import os
os.environ['CUDA_VISIBLE_DEVICES'] = '' os.environ['CUDA_VISIBLE_DEVICES'] = ''
os.environ['XPARL'] = 'True' os.environ['XPARL'] = 'True'
...@@ -33,6 +36,7 @@ from parl.utils.communication import loads_argument, loads_return,\ ...@@ -33,6 +36,7 @@ from parl.utils.communication import loads_argument, loads_return,\
from parl.remote import remote_constants from parl.remote import remote_constants
from parl.utils.exceptions import SerializeError, DeserializeError from parl.utils.exceptions import SerializeError, DeserializeError
from parl.remote.message import InitializedJob from parl.remote.message import InitializedJob
from parl.remote.utils import load_remote_class, redirect_stdout_to_file
class Job(object): class Job(object):
...@@ -44,7 +48,7 @@ class Job(object): ...@@ -44,7 +48,7 @@ class Job(object):
""" """
def __init__(self, worker_address): def __init__(self, worker_address, log_server_address):
""" """
Args: Args:
worker_address(str): worker_address for sending job information(e.g, pid) worker_address(str): worker_address for sending job information(e.g, pid)
...@@ -56,16 +60,21 @@ class Job(object): ...@@ -56,16 +60,21 @@ class Job(object):
self.max_memory = None self.max_memory = None
self.job_address_receiver, job_address_sender = Pipe() self.job_address_receiver, job_address_sender = Pipe()
self.job_id_receiver, job_id_sender = Pipe()
self.worker_address = worker_address self.worker_address = worker_address
self.log_server_address = log_server_address
self.job_ip = get_ip_address() self.job_ip = get_ip_address()
self.pid = os.getpid() self.pid = os.getpid()
self.lock = threading.Lock()
self.run_job_process = Process( self.run_job_process = Process(
target=self.run, args=(job_address_sender, )) target=self.run, args=(job_address_sender, job_id_sender))
self.run_job_process.start() self.run_job_process.start()
"""
NOTE:
In Windows, it will raise errors when creating threading.Lock before starting multiprocess.Process.
"""
self.lock = threading.Lock()
self._create_sockets() self._create_sockets()
process = psutil.Process(self.pid) process = psutil.Process(self.pid)
...@@ -81,7 +90,7 @@ class Job(object): ...@@ -81,7 +90,7 @@ class Job(object):
_ = self.kill_job_socket.recv_multipart() _ = self.kill_job_socket.recv_multipart()
except zmq.error.Again as e: except zmq.error.Again as e:
pass pass
os._exit(1) os._exit(0)
def _create_sockets(self): def _create_sockets(self):
"""Create five sockets for each job in main process. """Create five sockets for each job in main process.
...@@ -95,6 +104,7 @@ class Job(object): ...@@ -95,6 +104,7 @@ class Job(object):
""" """
# wait for another process to create reply socket # wait for another process to create reply socket
self.job_address = self.job_address_receiver.recv() self.job_address = self.job_address_receiver.recv()
self.job_id = self.job_id_receiver.recv()
self.ctx = zmq.Context() self.ctx = zmq.Context()
# create the job_socket # create the job_socket
...@@ -128,7 +138,8 @@ class Job(object): ...@@ -128,7 +138,8 @@ class Job(object):
# sends job information to the worker # sends job information to the worker
initialized_job = InitializedJob( initialized_job = InitializedJob(
self.job_address, worker_heartbeat_address, self.job_address, worker_heartbeat_address,
client_heartbeat_address, ping_heartbeat_address, None, self.pid) client_heartbeat_address, ping_heartbeat_address, None, self.pid,
self.job_id, self.log_server_address)
self.job_socket.send_multipart( self.job_socket.send_multipart(
[remote_constants.NORMAL_TAG, [remote_constants.NORMAL_TAG,
cloudpickle.dumps(initialized_job)]) cloudpickle.dumps(initialized_job)])
...@@ -262,12 +273,15 @@ class Job(object): ...@@ -262,12 +273,15 @@ class Job(object):
# create directory (i.e. ./rom_files/) # create directory (i.e. ./rom_files/)
if '/' in file: if '/' in file:
try: try:
os.makedirs(os.path.join(*file.rsplit('/')[:-1])) sep = os.sep
recursive_dirs = os.path.join(*(file.split(sep)[:-1]))
recursive_dirs = os.path.join(envdir, recursive_dirs)
os.makedirs(recursive_dirs)
except OSError as e: except OSError as e:
pass pass
file = os.path.join(envdir, file)
with open(file, 'wb') as f: with open(file, 'wb') as f:
f.write(content) f.write(content)
logger.info('[job] reply')
reply_socket.send_multipart([remote_constants.NORMAL_TAG]) reply_socket.send_multipart([remote_constants.NORMAL_TAG])
return envdir return envdir
else: else:
...@@ -295,8 +309,14 @@ class Job(object): ...@@ -295,8 +309,14 @@ class Job(object):
if tag == remote_constants.INIT_OBJECT_TAG: if tag == remote_constants.INIT_OBJECT_TAG:
try: try:
cls = cloudpickle.loads(message[1]) file_name, class_name, end_of_file = cloudpickle.loads(
message[1])
#/home/nlp-ol/Firework/baidu/nlp/evokit/python_api/es_agent -> es_agent
file_name = file_name.split(os.sep)[-1]
cls = load_remote_class(file_name, class_name, end_of_file)
args, kwargs = cloudpickle.loads(message[2]) args, kwargs = cloudpickle.loads(message[2])
logfile_path = os.path.join(self.log_dir, 'stdout.log')
with redirect_stdout_to_file(logfile_path):
obj = cls(*args, **kwargs) obj = cls(*args, **kwargs)
except Exception as e: except Exception as e:
traceback_str = str(traceback.format_exc()) traceback_str = str(traceback.format_exc())
...@@ -318,7 +338,7 @@ class Job(object): ...@@ -318,7 +338,7 @@ class Job(object):
return obj return obj
def run(self, job_address_sender): def run(self, job_address_sender, job_id_sender):
"""An infinite loop waiting for a new task. """An infinite loop waiting for a new task.
Args: Args:
...@@ -333,19 +353,28 @@ class Job(object): ...@@ -333,19 +353,28 @@ class Job(object):
job_ip = get_ip_address() job_ip = get_ip_address()
job_address = "{}:{}".format(job_ip, job_port) job_address = "{}:{}".format(job_ip, job_port)
job_id = job_address.replace(':', '_') + '_' + str(int(time.time()))
self.log_dir = os.path.expanduser('~/.parl_data/job/{}'.format(job_id))
logger.set_dir(self.log_dir)
logger.info(
"[Job] Job {} initialized. Reply heartbeat socket Address: {}.".
format(job_id, job_address))
job_address_sender.send(job_address) job_address_sender.send(job_address)
job_id_sender.send(job_id)
try: try:
# receive source code from the actor and append them to the environment variables. # receive source code from the actor and append them to the environment variables.
envdir = self.wait_for_files(reply_socket, job_address) envdir = self.wait_for_files(reply_socket, job_address)
sys.path.append(envdir) sys.path.insert(0, envdir)
os.chdir(envdir)
obj = self.wait_for_connection(reply_socket) obj = self.wait_for_connection(reply_socket)
assert obj is not None assert obj is not None
self.single_task(obj, reply_socket, job_address) self.single_task(obj, reply_socket, job_address)
except Exception as e: except Exception as e:
logger.error( logger.error(
"Error occurs when running a single task. We will reset this job. Reason:{}" "Error occurs when running a single task. We will reset this job. \nReason:{}"
.format(e)) .format(e))
traceback_str = str(traceback.format_exc()) traceback_str = str(traceback.format_exc())
logger.error("traceback:\n{}".format(traceback_str)) logger.error("traceback:\n{}".format(traceback_str))
...@@ -376,7 +405,12 @@ class Job(object): ...@@ -376,7 +405,12 @@ class Job(object):
function_name = to_str(message[1]) function_name = to_str(message[1])
data = message[2] data = message[2]
args, kwargs = loads_argument(data) args, kwargs = loads_argument(data)
# Redirect stdout to stdout.log temporarily
logfile_path = os.path.join(self.log_dir, 'stdout.log')
with redirect_stdout_to_file(logfile_path):
ret = getattr(obj, function_name)(*args, **kwargs) ret = getattr(obj, function_name)(*args, **kwargs)
ret = dumps_return(ret) ret = dumps_return(ret)
reply_socket.send_multipart( reply_socket.send_multipart(
...@@ -435,5 +469,10 @@ if __name__ == "__main__": ...@@ -435,5 +469,10 @@ if __name__ == "__main__":
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument( parser.add_argument(
"--worker_address", required=True, type=str, help="worker_address") "--worker_address", required=True, type=str, help="worker_address")
parser.add_argument(
"--log_server_address",
required=True,
type=str,
help="log_server_address, address of the log web server on worker")
args = parser.parse_args() args = parser.parse_args()
job = Job(args.worker_address) job = Job(args.worker_address, args.log_server_address)
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import linecache
import os
from flask import Flask, current_app, jsonify, make_response, request, send_file
from flask_cors import CORS
# Flask application object; the route decorators below register on it.
app = Flask(__name__)
# Wrap the app with flask_cors.
# NOTE(review): presumably needed because the cluster monitor page is served
# from a different host/port than this log server - confirm.
CORS(app)
@app.route("/get-log", methods=['GET'])
def get_log():
    '''Return the newest lines of a remote job's stdout log as JSON.

    args:
        job_id: id of the remote job (query-string parameter)
    response:
        log: newest `LINE_NUM` lines of the log file (HTTP 200)
        message: human-readable status; HTTP 400 is returned when `job_id`
            is missing, is not a plain directory name, or has no log file
    '''
    # `.get` replaces the former bare `except:` around `request.args[...]`,
    # which would also have hidden unrelated errors.
    job_id = request.args.get('job_id')
    if job_id is None:
        return make_response(
            jsonify(message="No job_id provided, please check your request."),
            400)

    # `job_id` comes straight from the HTTP request. It must name exactly one
    # directory below LOG_DIR: values such as `../x` or an absolute path
    # would make `os.path.join` point outside of the log directory.
    is_plain_name = (job_id not in ('', '.', '..')
                     and os.path.basename(job_id) == job_id)

    log_dir = os.path.expanduser(current_app.config.get('LOG_DIR'))
    log_file_path = os.path.join(log_dir, job_id, 'stdout.log')
    if not is_plain_name or not os.path.isfile(log_file_path):
        # Same reply as for an unknown job, so nothing is revealed about
        # paths outside LOG_DIR.
        return make_response(
            jsonify(message="Log not exsits, please check your job_id"), 400)

    line_num = current_app.config.get('LINE_NUM')
    # Drop the cached copy if the file changed on disk since the last request.
    linecache.checkcache(log_file_path)
    # NOTE: a LINE_NUM of 0 returns the whole file because `[-0:]` is a full
    # slice.
    log_content = ''.join(linecache.getlines(log_file_path)[-line_num:])
    return make_response(
        jsonify(message="Log exsits, content in log", log=log_content), 200)
@app.route('/download-log', methods=['GET'])
def download_log():
    '''Send a remote job's stdout log file as a download.

    args:
        job_id: the id of the remote job (query-string parameter)
    response:
        log: log file, sent as an attachment
        message: HTTP 400 with a JSON message when `job_id` is missing, is
            not a plain directory name, or has no log file
    '''
    # `.get` replaces the former bare `except:` around `request.args[...]`,
    # which would also have hidden unrelated errors.
    job_id = request.args.get('job_id')
    if job_id is None:
        return make_response(
            jsonify(message="No job_id provided, please check your request."),
            400)

    # `job_id` is untrusted request data. Only a single path component is
    # accepted, so that `..` segments or an absolute path cannot make the
    # server send a file from outside LOG_DIR.
    is_plain_name = (job_id not in ('', '.', '..')
                     and os.path.basename(job_id) == job_id)

    log_dir = os.path.expanduser(current_app.config.get('LOG_DIR'))
    log_file_path = os.path.join(log_dir, job_id, 'stdout.log')
    if not is_plain_name or not os.path.isfile(log_file_path):
        # Same reply as for an unknown job, so nothing is revealed about
        # paths outside LOG_DIR.
        return make_response(
            jsonify(message="Log not exsits, please check your job_id"), 400)

    return send_file(log_file_path, as_attachment=True)
if __name__ == "__main__":
    import logging

    # Turn off the 'werkzeug' logger before the server starts.
    log = logging.getLogger('werkzeug')
    log.disabled = True

    # All three options are mandatory; the process that launches this script
    # is expected to supply them.
    parser = argparse.ArgumentParser()
    parser.add_argument('--port', required=True, type=int)
    parser.add_argument('--log_dir', required=True, type=str)
    parser.add_argument('--line_num', required=True, type=int)
    args = parser.parse_args()

    # The request handlers read these two values from `current_app.config`.
    app.config.from_mapping(LOG_DIR=args.log_dir, LINE_NUM=args.line_num)

    # Listen on every interface of the host.
    app.run(host="0.0.0.0", port=args.port)
...@@ -57,11 +57,12 @@ class Master(object): ...@@ -57,11 +57,12 @@ class Master(object):
port: The ip port that the master node binds to. port: The ip port that the master node binds to.
""" """
def __init__(self, port): def __init__(self, port, monitor_port=None):
self.ctx = zmq.Context() self.ctx = zmq.Context()
self.master_ip = get_ip_address() self.master_ip = get_ip_address()
self.monitor_url = "http://{}:{}".format(self.master_ip, monitor_port)
logger.set_dir( logger.set_dir(
os.path.expanduser('~/.parl_data/master/{}:{}'.format( os.path.expanduser('~/.parl_data/master/{}_{}'.format(
self.master_ip, port))) self.master_ip, port)))
self.client_socket = self.ctx.socket(zmq.REP) self.client_socket = self.ctx.socket(zmq.REP)
self.client_socket.bind("tcp://*:{}".format(port)) self.client_socket.bind("tcp://*:{}".format(port))
...@@ -135,7 +136,7 @@ class Master(object): ...@@ -135,7 +136,7 @@ class Master(object):
except zmq.error.Again as e: except zmq.error.Again as e:
client_is_alive = False client_is_alive = False
self.cluster_monitor.drop_cluster_status( self.cluster_monitor.drop_client_status(
client_heartbeat_address) client_heartbeat_address)
logger.warning("[Master] cannot connect to the client " + logger.warning("[Master] cannot connect to the client " +
"{}. ".format(client_heartbeat_address) + "{}. ".format(client_heartbeat_address) +
...@@ -205,8 +206,11 @@ class Master(object): ...@@ -205,8 +206,11 @@ class Master(object):
# a client connects to the master # a client connects to the master
elif tag == remote_constants.CLIENT_CONNECT_TAG: elif tag == remote_constants.CLIENT_CONNECT_TAG:
# `client_heartbeat_address` is the
# `reply_master_heartbeat_address` of the client
client_heartbeat_address = to_str(message[1]) client_heartbeat_address = to_str(message[1])
client_hostname = to_str(message[2]) client_hostname = to_str(message[2])
client_id = to_str(message[3])
self.client_hostname[client_heartbeat_address] = client_hostname self.client_hostname[client_heartbeat_address] = client_hostname
logger.info( logger.info(
"Client {} is connected.".format(client_heartbeat_address)) "Client {} is connected.".format(client_heartbeat_address))
...@@ -215,11 +219,14 @@ class Master(object): ...@@ -215,11 +219,14 @@ class Master(object):
target=self._create_client_monitor, target=self._create_client_monitor,
args=(client_heartbeat_address, )) args=(client_heartbeat_address, ))
thread.start() thread.start()
self.client_socket.send_multipart([remote_constants.NORMAL_TAG]) log_monitor_address = "{}/logs?client_id={}".format(
self.monitor_url, client_id)
self.client_socket.send_multipart(
[remote_constants.NORMAL_TAG,
to_byte(log_monitor_address)])
# a client submits a job to the master # a client submits a job to the master
elif tag == remote_constants.CLIENT_SUBMIT_TAG: elif tag == remote_constants.CLIENT_SUBMIT_TAG:
# check available CPU resources # check available CPU resources
if self.cpu_num: if self.cpu_num:
logger.info("Submitting job...") logger.info("Submitting job...")
...@@ -230,6 +237,9 @@ class Master(object): ...@@ -230,6 +237,9 @@ class Master(object):
to_byte(job.client_heartbeat_address), to_byte(job.client_heartbeat_address),
to_byte(job.ping_heartbeat_address), to_byte(job.ping_heartbeat_address),
]) ])
client_id = to_str(message[2])
job_info = {job.job_id: job.log_server_address}
self.cluster_monitor.add_client_job(client_id, job_info)
self._print_workers() self._print_workers()
else: else:
self.client_socket.send_multipart([remote_constants.CPU_TAG]) self.client_socket.send_multipart([remote_constants.CPU_TAG])
......
...@@ -14,9 +14,15 @@ ...@@ -14,9 +14,15 @@
class InitializedJob(object): class InitializedJob(object):
def __init__(self, job_address, worker_heartbeat_address, def __init__(self,
client_heartbeat_address, ping_heartbeat_address, job_address,
worker_address, pid): worker_heartbeat_address,
client_heartbeat_address,
ping_heartbeat_address,
worker_address,
pid,
job_id=None,
log_server_address=None):
""" """
Args: Args:
job_address(str): Job address to which the new task connect. job_address(str): Job address to which the new task connect.
...@@ -35,6 +41,8 @@ class InitializedJob(object): ...@@ -35,6 +41,8 @@ class InitializedJob(object):
self.worker_address = worker_address self.worker_address = worker_address
self.pid = pid self.pid = pid
self.is_alive = True self.is_alive = True
self.job_id = job_id
self.log_server_address = log_server_address
class InitializedWorker(object): class InitializedWorker(object):
......
...@@ -19,7 +19,7 @@ import time ...@@ -19,7 +19,7 @@ import time
import zmq import zmq
import threading import threading
from flask import Flask, render_template, jsonify from flask import Flask, render_template, jsonify, request
app = Flask(__name__) app = Flask(__name__)
...@@ -42,7 +42,7 @@ class ClusterMonitor(object): ...@@ -42,7 +42,7 @@ class ClusterMonitor(object):
def __init__(self, master_address): def __init__(self, master_address):
ctx = zmq.Context() ctx = zmq.Context()
self.socket = ctx.socket(zmq.REQ) self.socket = ctx.socket(zmq.REQ)
self.socket.setsockopt(zmq.RCVTIMEO, 10000) self.socket.setsockopt(zmq.RCVTIMEO, 30000)
self.socket.connect('tcp://{}'.format(master_address)) self.socket.connect('tcp://{}'.format(master_address))
self.data = None self.data = None
...@@ -81,6 +81,7 @@ class ClusterMonitor(object): ...@@ -81,6 +81,7 @@ class ClusterMonitor(object):
data['total_vacant_cpus'] = total_vacant_cpus data['total_vacant_cpus'] = total_vacant_cpus
data['total_cpus'] = total_used_cpus + total_vacant_cpus data['total_cpus'] = total_used_cpus + total_vacant_cpus
data['clients'] = list(status['clients'].values()) data['clients'] = list(status['clients'].values())
data['client_jobs'] = status['client_jobs']
self.data = data self.data = data
time.sleep(10) time.sleep(10)
...@@ -99,7 +100,44 @@ def cluster(): ...@@ -99,7 +100,44 @@ def cluster():
return jsonify(data) return jsonify(data)
@app.route('/logs', methods=['GET'])
def logs():
    """Render the `jobs.html` page for the client given by `client_id`.

    The `client_id` query parameter is forwarded to the template unchanged;
    it is `None` when the parameter is absent.
    """
    return render_template(
        'jobs.html', client_id=request.args.get('client_id'))
@app.route('/get-jobs', methods=['GET'])
def get_jobs():
    """Return, as a JSON list, the jobs recorded for one client.

    The client is selected with the `client_id` query parameter. Each entry
    holds a running index (`id`), the `job_id`, and the URLs for viewing
    (`log_url`) and downloading (`download_url`) that job's log. An empty
    list is returned when nothing is recorded for the client.
    """
    client_id = request.args.get('client_id')
    client_jobs = CLUSTER_MONITOR.get_data()['client_jobs'].get(client_id)
    if not client_jobs:
        return jsonify([])

    # `client_jobs` maps job_id -> address of the log server for that job.
    entries = [{
        "id": index,
        "job_id": job_id,
        "log_url": "http://{}/get-log?job_id={}".format(
            log_server, job_id),
        "download_url": "http://{}/download-log?job_id={}".format(
            log_server, job_id),
    } for index, (job_id, log_server) in enumerate(client_jobs.items())]
    return jsonify(entries)
if __name__ == "__main__": if __name__ == "__main__":
import logging
log = logging.getLogger('werkzeug')
log.disabled = True
parser = argparse.ArgumentParser() parser = argparse.ArgumentParser()
parser.add_argument('--monitor_port', default=1234, type=int) parser.add_argument('--monitor_port', default=1234, type=int)
parser.add_argument('--address', default='localhost:8010', type=str) parser.add_argument('--address', default='localhost:8010', type=str)
......
...@@ -18,6 +18,7 @@ import threading ...@@ -18,6 +18,7 @@ import threading
import time import time
import zmq import zmq
import numpy as np import numpy as np
import inspect
from parl.utils import get_ip_address, logger, to_str, to_byte from parl.utils import get_ip_address, logger, to_str, to_byte
from parl.utils.communication import loads_argument, loads_return,\ from parl.utils.communication import loads_argument, loads_return,\
...@@ -74,6 +75,12 @@ def remote_class(*args, **kwargs): ...@@ -74,6 +75,12 @@ def remote_class(*args, **kwargs):
""" """
def decorator(cls): def decorator(cls):
# we are not going to create a remote actor in job.py
if 'XPARL' in os.environ and os.environ['XPARL'] == 'True':
logger.warning(
"Note: this object will be runnning as a local object")
return cls
class RemoteWrapper(object): class RemoteWrapper(object):
""" """
Wrapper for remote class in client side. Wrapper for remote class in client side.
...@@ -113,10 +120,13 @@ def remote_class(*args, **kwargs): ...@@ -113,10 +120,13 @@ def remote_class(*args, **kwargs):
self.job_shutdown = False self.job_shutdown = False
self.send_file(self.job_socket) self.send_file(self.job_socket)
file_name = inspect.getfile(cls)[:-3]
cls_source = inspect.getsourcelines(cls)
end_of_file = cls_source[1] + len(cls_source[0])
class_name = cls.__name__
self.job_socket.send_multipart([ self.job_socket.send_multipart([
remote_constants.INIT_OBJECT_TAG, remote_constants.INIT_OBJECT_TAG,
cloudpickle.dumps(cls), cloudpickle.dumps([file_name, class_name, end_of_file]),
cloudpickle.dumps([args, kwargs]), cloudpickle.dumps([args, kwargs]),
]) ])
message = self.job_socket.recv_multipart() message = self.job_socket.recv_multipart()
...@@ -128,6 +138,10 @@ def remote_class(*args, **kwargs): ...@@ -128,6 +138,10 @@ def remote_class(*args, **kwargs):
def __del__(self): def __del__(self):
"""Delete the remote class object and release remote resources.""" """Delete the remote class object and release remote resources."""
try:
self.job_socket.setsockopt(zmq.RCVTIMEO, 1 * 1000)
except AttributeError:
pass
if not self.job_shutdown: if not self.job_shutdown:
try: try:
self.job_socket.send_multipart( self.job_socket.send_multipart(
...@@ -138,6 +152,8 @@ def remote_class(*args, **kwargs): ...@@ -138,6 +152,8 @@ def remote_class(*args, **kwargs):
pass pass
except zmq.error.ZMQError: except zmq.error.ZMQError:
pass pass
except TypeError:
pass
def send_file(self, socket): def send_file(self, socket):
try: try:
...@@ -212,6 +228,7 @@ def remote_class(*args, **kwargs): ...@@ -212,6 +228,7 @@ def remote_class(*args, **kwargs):
return wrapper return wrapper
RemoteWrapper._original = cls
return RemoteWrapper return RemoteWrapper
max_memory = kwargs.get('max_memory') max_memory = kwargs.get('max_memory')
......
...@@ -18,15 +18,18 @@ import multiprocessing ...@@ -18,15 +18,18 @@ import multiprocessing
import os import os
import random import random
import re import re
import socket import requests
import subprocess import subprocess
import sys import sys
import time import time
import threading import threading
import tempfile
import warnings import warnings
import zmq import zmq
from multiprocessing import Process from multiprocessing import Process
from parl.utils import get_ip_address, to_str from parl.utils import (_IS_WINDOWS, get_free_tcp_port, get_ip_address,
get_port_from_range, is_port_available, kill_process,
to_str)
from parl.remote.remote_constants import STATUS_TAG from parl.remote.remote_constants import STATUS_TAG
# A flag to mark if parl is started from a command line # A flag to mark if parl is started from a command line
...@@ -34,33 +37,18 @@ os.environ['XPARL'] = 'True' ...@@ -34,33 +37,18 @@ os.environ['XPARL'] = 'True'
# Solve `Click will abort further execution because Python 3 was configured # Solve `Click will abort further execution because Python 3 was configured
# to use ASCII as encoding for the environment` error. # to use ASCII as encoding for the environment` error.
locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
if not _IS_WINDOWS:
try:
locale.setlocale(locale.LC_ALL, "en_US.UTF-8")
except:
pass
#TODO: this line will cause error in python2/macOS #TODO: this line will cause error in python2/macOS
if sys.version_info.major == 3: if sys.version_info.major == 3:
warnings.simplefilter("ignore", ResourceWarning) warnings.simplefilter("ignore", ResourceWarning)
def get_free_tcp_port():
tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
tcp.bind(('', 0))
addr, port = tcp.getsockname()
tcp.close()
return str(port)
def is_port_available(port):
""" Check if a port is used.
True if the port is available for connection.
"""
port = int(port)
sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
available = sock.connect_ex(('localhost', port))
sock.close()
return available
def is_master_started(address): def is_master_started(address):
ctx = zmq.Context() ctx = zmq.Context()
socket = ctx.socket(zmq.REQ) socket = ctx.socket(zmq.REQ)
...@@ -77,6 +65,33 @@ def is_master_started(address): ...@@ -77,6 +65,33 @@ def is_master_started(address):
return False return False
def parse_port_range(log_server_port_range):
    """Parse a ``start-end`` port range string into two integers.

    Args:
        log_server_port_range (str): text such as ``"8000-9000"``.
            Whitespace around either number is tolerated.

    Returns:
        tuple: ``(start, end)`` as integers, with ``start <= end``.

    Raises:
        ValueError: if the text is not exactly two decimal numbers separated
            by a single ``-``, or if ``start`` is greater than ``end``.
    """
    try:
        # The pattern is anchored at both ends and requires digits on both
        # sides, so inputs like "-", "8000-" or "1-2-3" are reported as a
        # format problem instead of failing later inside int()/unpacking.
        matched = re.match(r'\s*(\d+)\s*-\s*(\d+)\s*\Z',
                           log_server_port_range)
    except TypeError:
        # Non-string input (e.g. None) is a format error as well.
        matched = None
    if matched is None:
        raise ValueError(
            "The input log_server_port_range should be `start-end` format.")

    start, end = int(matched.group(1)), int(matched.group(2))
    if start > end:
        raise ValueError(
            "Start port number must be smaller than the end port number.")
    return start, end
def is_log_server_started(ip_address, port):
    """Check whether the log server answers on ``ip_address:port``.

    The ``/get-log`` endpoint is requested up to three times. A response with
    status code 400 is taken as proof that the log server is up.

    Args:
        ip_address (str): host of the log server.
        port (int or str): port of the log server.

    Returns:
        bool: True if a 400 response was received within three attempts.
    """
    url = "http://{}:{}/get-log".format(ip_address, port)
    for _ in range(3):
        try:
            # A timeout keeps a half-open or filtered port from blocking the
            # command line forever.
            response = requests.get(url, timeout=3)
        except requests.exceptions.RequestException:
            # Server not reachable yet: wait before the next attempt. Only
            # request failures are handled, so Ctrl-C still interrupts.
            time.sleep(3)
            continue
        if response.status_code == 400:
            return True
    return False
@click.group() @click.group()
def cli(): def cli():
pass pass
...@@ -95,7 +110,15 @@ def cli(): ...@@ -95,7 +110,15 @@ def cli():
"cpus of this machine.") "cpus of this machine.")
@click.option( @click.option(
"--monitor_port", help="The port to start a cluster monitor.", type=str) "--monitor_port", help="The port to start a cluster monitor.", type=str)
def start_master(port, cpu_num, monitor_port, debug): @click.option(
"--log_server_port_range",
help='''
Port range (start-end) of the log server on the worker. Default: 8000-9000.
The worker will pick a random avaliable port in [start, end] for the log server.
''',
default="8000-9000",
type=str)
def start_master(port, cpu_num, monitor_port, debug, log_server_port_range):
if debug: if debug:
os.environ['DEBUG'] = 'True' os.environ['DEBUG'] = 'True'
...@@ -112,19 +135,33 @@ def start_master(port, cpu_num, monitor_port, debug): ...@@ -112,19 +135,33 @@ def start_master(port, cpu_num, monitor_port, debug):
cpu_num) if cpu_num is not None else multiprocessing.cpu_count() cpu_num) if cpu_num is not None else multiprocessing.cpu_count()
start_file = __file__.replace('scripts.pyc', 'start.py') start_file = __file__.replace('scripts.pyc', 'start.py')
start_file = start_file.replace('scripts.py', 'start.py') start_file = start_file.replace('scripts.py', 'start.py')
monitor_file = __file__.replace('scripts.pyc', 'monitor.py')
monitor_file = monitor_file.replace('scripts.py', 'monitor.py')
monitor_port = monitor_port if monitor_port else get_free_tcp_port() monitor_port = monitor_port if monitor_port else get_free_tcp_port()
start, end = parse_port_range(log_server_port_range)
log_server_port = get_port_from_range(start, end)
while log_server_port == monitor_port or log_server_port == port:
log_server_port = get_port_from_range(start, end)
master_command = [ master_command = [
sys.executable, start_file, "--name", "master", "--port", port sys.executable,
start_file,
"--name",
"master",
"--port",
port,
"--monitor_port",
monitor_port,
] ]
worker_command = [ worker_command = [
sys.executable, start_file, "--name", "worker", "--address", sys.executable, start_file, "--name", "worker", "--address",
"localhost:" + str(port), "--cpu_num", "localhost:" + str(port), "--cpu_num",
str(cpu_num) str(cpu_num), '--log_server_port',
str(log_server_port)
] ]
monitor_command = [ monitor_command = [
sys.executable, '{}/monitor.py'.format(__file__[:__file__.rfind('/')]), sys.executable, monitor_file, "--monitor_port",
"--monitor_port",
str(monitor_port), "--address", "localhost:" + str(port) str(monitor_port), "--address", "localhost:" + str(port)
] ]
...@@ -133,9 +170,19 @@ def start_master(port, cpu_num, monitor_port, debug): ...@@ -133,9 +170,19 @@ def start_master(port, cpu_num, monitor_port, debug):
# Redirect the output to DEVNULL to solve the warning log. # Redirect the output to DEVNULL to solve the warning log.
_ = subprocess.Popen( _ = subprocess.Popen(
master_command, stdout=FNULL, stderr=subprocess.STDOUT) master_command, stdout=FNULL, stderr=subprocess.STDOUT)
if cpu_num > 0: if cpu_num > 0:
# Sleep 1s for master ready
time.sleep(1)
_ = subprocess.Popen( _ = subprocess.Popen(
worker_command, stdout=FNULL, stderr=subprocess.STDOUT) worker_command, stdout=FNULL, stderr=subprocess.STDOUT)
if _IS_WINDOWS:
# TODO(@zenghsh3) redirecting stdout of monitor subprocess to FNULL will cause occasional failure
tmp_file = tempfile.TemporaryFile()
_ = subprocess.Popen(monitor_command, stdout=tmp_file)
tmp_file.close()
else:
_ = subprocess.Popen( _ = subprocess.Popen(
monitor_command, stdout=FNULL, stderr=subprocess.STDOUT) monitor_command, stdout=FNULL, stderr=subprocess.STDOUT)
FNULL.close() FNULL.close()
...@@ -158,16 +205,20 @@ def start_master(port, cpu_num, monitor_port, debug): ...@@ -158,16 +205,20 @@ def start_master(port, cpu_num, monitor_port, debug):
click.echo(monitor_info) click.echo(monitor_info)
# check if monitor is started # check if monitor is started
cmd = r'ps -ef | grep remote/monitor.py\ --monitor_port\ {}\ --address\ localhost:{}'.format(
monitor_port, port)
monitor_is_started = False monitor_is_started = False
if _IS_WINDOWS:
cmd = r'''wmic process where "commandline like '%remote\\monitor.py --monitor_port {} --address localhost:{}%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''.format(
monitor_port, port)
else:
cmd = r'ps -ef | grep -v grep | grep remote/monitor.py\ --monitor_port\ {}\ --address\ localhost:{}'.format(
monitor_port, port)
for i in range(3): for i in range(3):
check_monitor_is_started = os.popen(cmd).read().strip().split('\n') check_monitor_is_started = os.popen(cmd).read()
if len(check_monitor_is_started) == 2: if len(check_monitor_is_started) > 0:
monitor_is_started = True monitor_is_started = True
break break
time.sleep(3) time.sleep(3)
master_ip = get_ip_address() master_ip = get_ip_address()
if monitor_is_started: if monitor_is_started:
start_info = """ start_info = """
...@@ -194,6 +245,9 @@ def start_master(port, cpu_num, monitor_port, debug): ...@@ -194,6 +245,9 @@ def start_master(port, cpu_num, monitor_port, debug):
""".format(start_info, master_ip, port) """.format(start_info, master_ip, port)
click.echo(monitor_info) click.echo(monitor_info)
if not is_log_server_started(master_ip, log_server_port):
click.echo("# Fail to start the log server.")
@click.command("connect", short_help="Start a worker node.") @click.command("connect", short_help="Start a worker node.")
@click.option( @click.option(
...@@ -203,36 +257,53 @@ def start_master(port, cpu_num, monitor_port, debug): ...@@ -203,36 +257,53 @@ def start_master(port, cpu_num, monitor_port, debug):
type=int, type=int,
help="Set number of cpu manually. If not set, it will use all " help="Set number of cpu manually. If not set, it will use all "
"cpus of this machine.") "cpus of this machine.")
def start_worker(address, cpu_num): @click.option(
"--log_server_port_range",
help='''
Port range (start-end) of the log server on the worker. Default: 8000-9000.
The worker will pick a random avaliable port in [start, end] for the log server.
''',
default="8000-9000",
type=str)
def start_worker(address, cpu_num, log_server_port_range):
start, end = parse_port_range(log_server_port_range)
log_server_port = get_port_from_range(start, end)
if not is_master_started(address): if not is_master_started(address):
raise Exception("Worker can not connect to the master node, " + raise Exception("Worker can not connect to the master node, " +
"please check if the input address {} ".format( "please check if the input address {} ".format(
address) + "is correct.") address) + "is correct.")
cpu_num = str(cpu_num) if cpu_num else '' cpu_num = str(cpu_num) if cpu_num else ''
start_file = __file__.replace('scripts.pyc', 'start.py')
start_file = start_file.replace('scripts.py', 'start.py')
command = [ command = [
sys.executable, "{}/start.py".format(__file__[:-11]), "--name", sys.executable, start_file, "--name", "worker", "--address", address,
"worker", "--address", address, "--cpu_num", "--cpu_num",
str(cpu_num) str(cpu_num), "--log_server_port",
str(log_server_port)
] ]
p = subprocess.Popen(command) p = subprocess.Popen(command)
if not is_log_server_started(get_ip_address(), log_server_port):
click.echo("# Fail to start the log server.")
@click.command("stop", help="Exit the cluster.") @click.command("stop", help="Exit the cluster.")
def stop(): def stop():
command = ( kill_process('remote/start.py')
"ps aux | grep remote/start.py | awk '{print $2}' | xargs kill -9") kill_process('remote/job.py')
subprocess.call([command], shell=True) kill_process('remote/monitor.py')
command = ( kill_process('remote/log_server.py')
"ps aux | grep remote/job.py | awk '{print $2}' | xargs kill -9")
subprocess.call([command], shell=True)
command = (
"ps aux | grep remote/monitor.py | awk '{print $2}' | xargs kill -9")
subprocess.call([command], shell=True)
@click.command("status") @click.command("status")
def status(): def status():
if _IS_WINDOWS:
cmd = r'''wmic process where "commandline like '%remote\\start.py --name worker --address%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''
else:
cmd = r'ps -ef | grep remote/start.py\ --name\ worker\ --address' cmd = r'ps -ef | grep remote/start.py\ --name\ worker\ --address'
content = os.popen(cmd).read().strip() content = os.popen(cmd).read().strip()
pattern = re.compile('--address (.*?) --cpu') pattern = re.compile('--address (.*?) --cpu')
clusters = set(pattern.findall(content)) clusters = set(pattern.findall(content))
...@@ -242,6 +313,10 @@ def status(): ...@@ -242,6 +313,10 @@ def status():
ctx = zmq.Context() ctx = zmq.Context()
status = [] status = []
for cluster in clusters: for cluster in clusters:
if _IS_WINDOWS:
cmd = r'''wmic process where "commandline like '%address {}%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''.format(
cluster)
else:
cmd = r'ps -ef | grep address\ {}'.format(cluster) cmd = r'ps -ef | grep address\ {}'.format(cluster)
content = os.popen(cmd).read() content = os.popen(cmd).read()
pattern = re.compile('--monitor_port (.*?)\n', re.S) pattern = re.compile('--monitor_port (.*?)\n', re.S)
......
...@@ -28,13 +28,15 @@ def main(args): ...@@ -28,13 +28,15 @@ def main(args):
if args.name == 'master': if args.name == 'master':
port = args.port port = args.port
master = Master(port) monitor_port = args.monitor_port
master = Master(port, monitor_port)
master.run() master.run()
elif args.name == 'worker': elif args.name == 'worker':
address = args.address address = args.address
log_server_port = args.log_server_port
cpu_num = int(args.cpu_num) if args.cpu_num else None cpu_num = int(args.cpu_num) if args.cpu_num else None
worker = Worker(address, cpu_num) worker = Worker(address, cpu_num, log_server_port)
worker.run() worker.run()
else: else:
...@@ -48,5 +50,7 @@ if __name__ == "__main__": ...@@ -48,5 +50,7 @@ if __name__ == "__main__":
parser.add_argument('--port', default='1234', type=str) parser.add_argument('--port', default='1234', type=str)
parser.add_argument('--address', default='localhost:1234', type=str) parser.add_argument('--address', default='localhost:1234', type=str)
parser.add_argument('--cpu_num', default='', type=str) parser.add_argument('--cpu_num', default='', type=str)
parser.add_argument('--monitor_port', default='', type=str)
parser.add_argument('--log_server_port', default='', type=str)
args = parser.parse_args() args = parser.parse_args()
main(args) main(args)
/* ansi_up.js
* author : Dru Nelson
* license : MIT
* http://github.com/drudru/ansi_up
*/
(function (root, factory) {
if (typeof define === 'function' && define.amd) {
// AMD. Register as an anonymous module.
define(['exports'], factory);
} else if (typeof exports === 'object' && typeof exports.nodeName !== 'string') {
// CommonJS
factory(exports);
} else {
// Browser globals
var exp = {};
factory(exp);
root.AnsiUp = exp.default;
}
}(this, function (exports) {
"use strict";
// Helper that attaches the `raw` string array to the `cooked` array and
// returns `cooked`. An already-present `this.__makeTemplateObject` is reused.
// rgx()/rgxG() in this file read `.raw[0]` from the object built here.
var __makeTemplateObject = (this && this.__makeTemplateObject) || function (cooked, raw) {
// Use defineProperty where available, otherwise fall back to plain assignment.
if (Object.defineProperty) { Object.defineProperty(cooked, "raw", { value: raw }); } else { cooked.raw = raw; }
return cooked;
};
// Numeric tags for the packets produced by get_next_packet() and consumed by
// ansi_to_html(). Each entry is stored both ways (name -> number and
// number -> name).
var PacketKind;
(function (PacketKind) {
PacketKind[PacketKind["EOS"] = 0] = "EOS";
PacketKind[PacketKind["Text"] = 1] = "Text";
PacketKind[PacketKind["Incomplete"] = 2] = "Incomplete";
PacketKind[PacketKind["ESC"] = 3] = "ESC";
PacketKind[PacketKind["Unknown"] = 4] = "Unknown";
PacketKind[PacketKind["SGR"] = 5] = "SGR";
PacketKind[PacketKind["OSCURL"] = 6] = "OSCURL";
})(PacketKind || (PacketKind = {}));
var AnsiUp = (function () {
// Constructor: sets the initial conversion state.
function AnsiUp() {
this.VERSION = "4.0.3";
// Fills this.ansi_colors and this.palette_256.
this.setup_palettes();
// false: colours are emitted as inline styles; true: as CSS class names
// (see transform_to_html).
this._use_classes = false;
// true: text is passed through escape_txt_for_html before being emitted.
this._escape_for_html = true;
// Current text attributes; updated by process_ansi.
this.bold = false;
this.fg = this.bg = null;
// Input that has been appended but not yet consumed by get_next_packet.
this._buffer = '';
// URL schemes that process_hyperlink will turn into links.
this._url_whitelist = { 'http': 1, 'https': 1 };
}
// Public accessor for _use_classes (class names instead of inline colour styles).
Object.defineProperty(AnsiUp.prototype, "use_classes", {
get: function () {
return this._use_classes;
},
set: function (arg) {
this._use_classes = arg;
},
enumerable: true,
configurable: true
});
// Public accessor for _escape_for_html (whether text is HTML-escaped).
Object.defineProperty(AnsiUp.prototype, "escape_for_html", {
get: function () {
return this._escape_for_html;
},
set: function (arg) {
this._escape_for_html = arg;
},
enumerable: true,
configurable: true
});
// Public accessor for _url_whitelist (object keyed by allowed URL scheme).
Object.defineProperty(AnsiUp.prototype, "url_whitelist", {
get: function () {
return this._url_whitelist;
},
set: function (arg) {
this._url_whitelist = arg;
},
enumerable: true,
configurable: true
});
// Builds the colour tables used by process_ansi:
//   this.ansi_colors  - two groups of 8 colours (index 0: normal, index 1: bright)
//   this.palette_256  - 256 entries: the 16 colours above, then a 6x6x6 colour
//                       cube, then 24 grey levels.
AnsiUp.prototype.setup_palettes = function () {
var _this = this;
this.ansi_colors =
[
[
{ rgb: [0, 0, 0], class_name: "ansi-black" },
{ rgb: [187, 0, 0], class_name: "ansi-red" },
{ rgb: [0, 187, 0], class_name: "ansi-green" },
{ rgb: [187, 187, 0], class_name: "ansi-yellow" },
{ rgb: [0, 0, 187], class_name: "ansi-blue" },
{ rgb: [187, 0, 187], class_name: "ansi-magenta" },
{ rgb: [0, 187, 187], class_name: "ansi-cyan" },
{ rgb: [255, 255, 255], class_name: "ansi-white" }
],
[
{ rgb: [85, 85, 85], class_name: "ansi-bright-black" },
{ rgb: [255, 85, 85], class_name: "ansi-bright-red" },
{ rgb: [0, 255, 0], class_name: "ansi-bright-green" },
{ rgb: [255, 255, 85], class_name: "ansi-bright-yellow" },
{ rgb: [85, 85, 255], class_name: "ansi-bright-blue" },
{ rgb: [255, 85, 255], class_name: "ansi-bright-magenta" },
{ rgb: [85, 255, 255], class_name: "ansi-bright-cyan" },
{ rgb: [255, 255, 255], class_name: "ansi-bright-white" }
]
];
this.palette_256 = [];
// Entries 0-15: the normal colours followed by the bright colours.
this.ansi_colors.forEach(function (palette) {
palette.forEach(function (rec) {
_this.palette_256.push(rec);
});
});
// Next 216 entries: every combination of the six intensity levels per channel.
var levels = [0, 95, 135, 175, 215, 255];
for (var r = 0; r < 6; ++r) {
for (var g = 0; g < 6; ++g) {
for (var b = 0; b < 6; ++b) {
var col = { rgb: [levels[r], levels[g], levels[b]], class_name: 'truecolor' };
this.palette_256.push(col);
}
}
}
// Last 24 entries: greys starting at 8 and increasing by 10 per step.
var grey_level = 8;
for (var i = 0; i < 24; ++i, grey_level += 10) {
var gry = { rgb: [grey_level, grey_level, grey_level], class_name: 'truecolor' };
this.palette_256.push(gry);
}
};
// Escapes text for safe insertion into HTML.
//
// Besides & < >, both quote characters are escaped: process_hyperlink places
// the result inside a double-quoted href attribute, so an unescaped quote in
// a URL would terminate the attribute and allow markup injection.
//
// txt: string to escape. Returns the escaped string.
AnsiUp.prototype.escape_txt_for_html = function (txt) {
    return txt.replace(/[&<>"']/gm, function (str) {
        if (str === "&")
            return "&amp;";
        if (str === "<")
            return "&lt;";
        if (str === ">")
            return "&gt;";
        if (str === "\"")
            return "&quot;";
        if (str === "'")
            return "&#x27;";
    });
};
// Adds txt to the end of the pending input buffer.
AnsiUp.prototype.append_buffer = function (txt) {
    this._buffer = this._buffer + txt;
};
AnsiUp.prototype.get_next_packet = function () {
var pkt = {
kind: PacketKind.EOS,
text: '',
url: ''
};
var len = this._buffer.length;
if (len == 0)
return pkt;
var pos = this._buffer.indexOf("\x1B");
if (pos == -1) {
pkt.kind = PacketKind.Text;
pkt.text = this._buffer;
this._buffer = '';
return pkt;
}
if (pos > 0) {
pkt.kind = PacketKind.Text;
pkt.text = this._buffer.slice(0, pos);
this._buffer = this._buffer.slice(pos);
return pkt;
}
if (pos == 0) {
if (len == 1) {
pkt.kind = PacketKind.Incomplete;
return pkt;
}
var next_char = this._buffer.charAt(1);
if ((next_char != '[') && (next_char != ']')) {
pkt.kind = PacketKind.ESC;
pkt.text = this._buffer.slice(0, 1);
this._buffer = this._buffer.slice(1);
return pkt;
}
if (next_char == '[') {
if (!this._csi_regex) {
this._csi_regex = rgx(__makeTemplateObject(["\n ^ # beginning of line\n #\n # First attempt\n (?: # legal sequence\n \u001B[ # CSI\n ([<-?]?) # private-mode char\n ([d;]*) # any digits or semicolons\n ([ -/]? # an intermediate modifier\n [@-~]) # the command\n )\n | # alternate (second attempt)\n (?: # illegal sequence\n \u001B[ # CSI\n [ -~]* # anything legal\n ([\0-\u001F:]) # anything illegal\n )\n "], ["\n ^ # beginning of line\n #\n # First attempt\n (?: # legal sequence\n \\x1b\\[ # CSI\n ([\\x3c-\\x3f]?) # private-mode char\n ([\\d;]*) # any digits or semicolons\n ([\\x20-\\x2f]? # an intermediate modifier\n [\\x40-\\x7e]) # the command\n )\n | # alternate (second attempt)\n (?: # illegal sequence\n \\x1b\\[ # CSI\n [\\x20-\\x7e]* # anything legal\n ([\\x00-\\x1f:]) # anything illegal\n )\n "]));
}
var match = this._buffer.match(this._csi_regex);
if (match === null) {
pkt.kind = PacketKind.Incomplete;
return pkt;
}
if (match[4]) {
pkt.kind = PacketKind.ESC;
pkt.text = this._buffer.slice(0, 1);
this._buffer = this._buffer.slice(1);
return pkt;
}
if ((match[1] != '') || (match[3] != 'm'))
pkt.kind = PacketKind.Unknown;
else
pkt.kind = PacketKind.SGR;
pkt.text = match[2];
var rpos = match[0].length;
this._buffer = this._buffer.slice(rpos);
return pkt;
}
if (next_char == ']') {
if (len < 4) {
pkt.kind = PacketKind.Incomplete;
return pkt;
}
if ((this._buffer.charAt(2) != '8')
|| (this._buffer.charAt(3) != ';')) {
pkt.kind = PacketKind.ESC;
pkt.text = this._buffer.slice(0, 1);
this._buffer = this._buffer.slice(1);
return pkt;
}
if (!this._osc_st) {
this._osc_st = rgxG(__makeTemplateObject(["\n (?: # legal sequence\n (\u001B\\) # ESC | # alternate\n (\u0007) # BEL (what xterm did)\n )\n | # alternate (second attempt)\n ( # illegal sequence\n [\0-\u0006] # anything illegal\n | # alternate\n [\b-\u001A] # anything illegal\n | # alternate\n [\u001C-\u001F] # anything illegal\n )\n "], ["\n (?: # legal sequence\n (\\x1b\\\\) # ESC \\\n | # alternate\n (\\x07) # BEL (what xterm did)\n )\n | # alternate (second attempt)\n ( # illegal sequence\n [\\x00-\\x06] # anything illegal\n | # alternate\n [\\x08-\\x1a] # anything illegal\n | # alternate\n [\\x1c-\\x1f] # anything illegal\n )\n "]));
}
this._osc_st.lastIndex = 0;
{
var match_1 = this._osc_st.exec(this._buffer);
if (match_1 === null) {
pkt.kind = PacketKind.Incomplete;
return pkt;
}
if (match_1[3]) {
pkt.kind = PacketKind.ESC;
pkt.text = this._buffer.slice(0, 1);
this._buffer = this._buffer.slice(1);
return pkt;
}
}
{
var match_2 = this._osc_st.exec(this._buffer);
if (match_2 === null) {
pkt.kind = PacketKind.Incomplete;
return pkt;
}
if (match_2[3]) {
pkt.kind = PacketKind.ESC;
pkt.text = this._buffer.slice(0, 1);
this._buffer = this._buffer.slice(1);
return pkt;
}
}
if (!this._osc_regex) {
this._osc_regex = rgx(__makeTemplateObject(["\n ^ # beginning of line\n #\n \u001B]8; # OSC Hyperlink\n [ -:<-~]* # params (excluding ;)\n ; # end of params\n ([!-~]{0,512}) # URL capture\n (?: # ST\n (?:\u001B\\) # ESC | # alternate\n (?:\u0007) # BEL (what xterm did)\n )\n ([!-~]+) # TEXT capture\n \u001B]8;; # OSC Hyperlink End\n (?: # ST\n (?:\u001B\\) # ESC | # alternate\n (?:\u0007) # BEL (what xterm did)\n )\n "], ["\n ^ # beginning of line\n #\n \\x1b\\]8; # OSC Hyperlink\n [\\x20-\\x3a\\x3c-\\x7e]* # params (excluding ;)\n ; # end of params\n ([\\x21-\\x7e]{0,512}) # URL capture\n (?: # ST\n (?:\\x1b\\\\) # ESC \\\n | # alternate\n (?:\\x07) # BEL (what xterm did)\n )\n ([\\x21-\\x7e]+) # TEXT capture\n \\x1b\\]8;; # OSC Hyperlink End\n (?: # ST\n (?:\\x1b\\\\) # ESC \\\n | # alternate\n (?:\\x07) # BEL (what xterm did)\n )\n "]));
}
var match = this._buffer.match(this._osc_regex);
if (match === null) {
pkt.kind = PacketKind.ESC;
pkt.text = this._buffer.slice(0, 1);
this._buffer = this._buffer.slice(1);
return pkt;
}
pkt.kind = PacketKind.OSCURL;
pkt.url = match[1];
pkt.text = match[2];
var rpos = match[0].length;
this._buffer = this._buffer.slice(rpos);
return pkt;
}
}
};
// Converts a chunk of ANSI-coded text to HTML.
//
// txt is appended to the internal buffer, then packets are pulled from the
// buffer until it is exhausted (EOS) or ends in an unfinished escape
// sequence (Incomplete), which stays buffered for the next call.
// Returns the HTML produced by this call as one string.
AnsiUp.prototype.ansi_to_html = function (txt) {
    this.append_buffer(txt);
    var html = [];
    for (;;) {
        var pkt = this.get_next_packet();
        var kind = pkt.kind;
        if (kind == PacketKind.EOS || kind == PacketKind.Incomplete)
            break;
        switch (kind) {
            case PacketKind.Text:
                // Plain text is rendered with the current attributes.
                html.push(this.transform_to_html(this.with_state(pkt)));
                break;
            case PacketKind.SGR:
                // Colour/bold commands only change state; no output.
                this.process_ansi(pkt);
                break;
            case PacketKind.OSCURL:
                html.push(this.process_hyperlink(pkt));
                break;
            default:
                // ESC and Unknown packets are dropped.
                break;
        }
    }
    return html.join("");
};
// Pairs the text of pkt with a snapshot of the current bold/fg/bg state.
AnsiUp.prototype.with_state = function (pkt) {
    var fragment = {
        bold: this.bold,
        fg: this.fg,
        bg: this.bg,
        text: pkt.text
    };
    return fragment;
};
// Applies the SGR commands carried by pkt to the current state
// (this.bold, this.fg, this.bg). pkt.text is the ';'-separated parameter
// list of the escape sequence. Produces no output.
AnsiUp.prototype.process_ansi = function (pkt) {
var sgr_cmds = pkt.text.split(';');
// Parameters are consumed front to back; extended colour commands (38/48)
// shift their own arguments off the same list.
while (sgr_cmds.length > 0) {
var sgr_cmd_str = sgr_cmds.shift();
var num = parseInt(sgr_cmd_str, 10);
// An empty/non-numeric parameter or 0 resets all attributes.
if (isNaN(num) || num === 0) {
this.fg = this.bg = null;
this.bold = false;
}
else if (num === 1) {
this.bold = true;
}
else if (num === 22) {
this.bold = false;
}
// 39 / 49: clear the foreground / background colour.
else if (num === 39) {
this.fg = null;
}
else if (num === 49) {
this.bg = null;
}
// 30-37 / 40-47: foreground / background from the normal colour group.
else if ((num >= 30) && (num < 38)) {
this.fg = this.ansi_colors[0][(num - 30)];
}
else if ((num >= 40) && (num < 48)) {
this.bg = this.ansi_colors[0][(num - 40)];
}
// 90-97 / 100-107: foreground / background from the bright colour group.
else if ((num >= 90) && (num < 98)) {
this.fg = this.ansi_colors[1][(num - 90)];
}
else if ((num >= 100) && (num < 108)) {
this.bg = this.ansi_colors[1][(num - 100)];
}
// 38 (foreground) / 48 (background): extended colour; the next parameter
// selects the mode.
else if (num === 38 || num === 48) {
if (sgr_cmds.length > 0) {
var is_foreground = (num === 38);
var mode_cmd = sgr_cmds.shift();
// Mode 5: one more parameter, an index into palette_256.
if (mode_cmd === '5' && sgr_cmds.length > 0) {
var palette_index = parseInt(sgr_cmds.shift(), 10);
if (palette_index >= 0 && palette_index <= 255) {
if (is_foreground)
this.fg = this.palette_256[palette_index];
else
this.bg = this.palette_256[palette_index];
}
}
// Mode 2: three more parameters, r;g;b, each accepted only within 0-255.
if (mode_cmd === '2' && sgr_cmds.length > 2) {
var r = parseInt(sgr_cmds.shift(), 10);
var g = parseInt(sgr_cmds.shift(), 10);
var b = parseInt(sgr_cmds.shift(), 10);
if ((r >= 0 && r <= 255) && (g >= 0 && g <= 255) && (b >= 0 && b <= 255)) {
var c = { rgb: [r, g, b], class_name: 'truecolor' };
if (is_foreground)
this.fg = c;
else
this.bg = c;
}
}
}
}
}
};
// Renders one text fragment ({ bold, fg, bg, text }) as HTML.
//
// Empty text is returned as is. Text without any attribute is returned
// (escaped when escape_for_html is on) without a wrapper. Otherwise the text
// is wrapped in a <span> carrying inline styles and/or CSS classes:
// with use_classes on, named colours become "<name>-fg"/"<name>-bg" classes
// and 'truecolor' entries stay inline styles; with it off, everything is an
// inline style.
AnsiUp.prototype.transform_to_html = function (fragment) {
    var body = fragment.text;
    if (body.length === 0)
        return body;
    if (this._escape_for_html)
        body = this.escape_txt_for_html(body);

    var unstyled = !fragment.bold && fragment.fg === null && fragment.bg === null;
    if (unstyled)
        return body;

    var fore = fragment.fg;
    var back = fragment.bg;
    var css = fragment.bold ? ['font-weight:bold'] : [];
    var names = [];
    var as_classes = this._use_classes;

    if (fore) {
        if (as_classes && fore.class_name !== 'truecolor')
            names.push(fore.class_name + "-fg");
        else
            css.push("color:rgb(" + fore.rgb.join(',') + ")");
    }
    if (back) {
        if (as_classes && back.class_name !== 'truecolor')
            names.push(back.class_name + "-bg");
        else if (as_classes)
            css.push("background-color:rgb(" + back.rgb.join(',') + ")");
        else
            css.push("background-color:rgb(" + back.rgb + ")");
    }

    var style_attr = css.length ? " style=\"" + css.join(';') + "\"" : '';
    var class_attr = names.length ? " class=\"" + names.join(' ') + "\"" : '';
    return "<span" + style_attr + class_attr + ">" + body + "</span>";
};
;
// Renders an OSC hyperlink packet ({ url, text }) as an <a> element.
//
// Returns '' when the URL scheme (the part before the first ':') is not an
// own, truthy key of the URL whitelist.
AnsiUp.prototype.process_hyperlink = function (pkt) {
    var scheme = pkt.url.split(':')[0];
    var whitelist = this._url_whitelist;
    // Only own properties count: a plain lookup would also accept names
    // inherited from Object.prototype such as "constructor" or "toString".
    if (!Object.prototype.hasOwnProperty.call(whitelist, scheme) || !whitelist[scheme])
        return '';
    // The URL is placed inside a double-quoted attribute, so double quotes
    // must never reach the output unescaped. The replace is a no-op when
    // escape_txt_for_html has already turned them into entities.
    var href = this.escape_txt_for_html(pkt.url).replace(/"/g, "&quot;");
    var result = "<a href=\"" + href + "\">" + this.escape_txt_for_html(pkt.text) + "</a>";
    return result;
};
return AnsiUp;
}());
// Builds a RegExp from a commented, multi-line pattern.
//
// tmplObj.raw[0] holds the pattern source. Leading whitespace on each line,
// whitespace before a line break, '#' comments running to the end of the
// line, and the line breaks themselves are removed before compiling.
function rgx(tmplObj) {
    var layout = /^\s+|\s+\n|\s*#[\s\S]*?\n|\n/gm;
    var compact = tmplObj.raw[0].replace(layout, '');
    return new RegExp(compact);
}
// Same as rgx(), but the resulting RegExp has the global ('g') flag, so
// successive exec() calls continue from the previous match.
//
// tmplObj.raw[0] holds the pattern source. Leading whitespace on each line,
// whitespace before a line break, '#' comments running to the end of the
// line, and the line breaks themselves are removed before compiling.
function rgxG(tmplObj) {
    var layout = /^\s+|\s+\n|\s*#[\s\S]*?\n|\n/gm;
    var compact = tmplObj.raw[0].replace(layout, '');
    return new RegExp(compact, 'g');
}
//# sourceMappingURL=ansi_up.js.map
Object.defineProperty(exports, "__esModule", { value: true });
exports.default = AnsiUp;
}));
/*
jQuery AJAX Cross Origin v1.3 (http://www.ajax-cross-origin.com)
jQuery plugin to bypass Same-origin_policy using Google Apps Script.
references:
http://en.wikipedia.org/wiki/Same-origin_policy
http://www.google.com/script/start/
(c) 2014, Writen by Erez Ninio. site: www.dealhotelbook.com
Licensed under the Creative Commons Attribution 3.0 Unported License.
For details, see http://creativecommons.org/licenses/by/3.0/.
*/
var proxyJsonp =
"https://script.google.com/macros/s/AKfycbwmqG55tt2d2FcT_WQ3WjCSKmtyFpkOcdprSITn45-4UgVJnzp9/exec";
jQuery.ajaxOrig = jQuery.ajax;
// Replacement for jQuery.ajax that can route cross-origin requests through
// the proxy endpoint held in proxyJsonp before delegating to the original
// implementation (jQuery.ajaxOrig).
//
// a, b: same call forms as jQuery.ajax - either (settings) or (url, settings).
// NOTE(review): when rewriting is active the target URL is sent to the proxy
// endpoint, so request URLs leave the local network - confirm this is
// acceptable for this deployment.
jQuery.ajax = function (a, b) {
// Builds the proxied URL: the target is URI-encoded (with '&' forced to %26)
// and passed as the `url` query parameter of the proxy.
function d(a) {
a = encodeURI(a).replace(/&/g, "%26");
return proxyJsonp + "?url=" + a + "&callback=?";
}
// c: the settings object, whichever argument position it was given in.
var c = "object" === typeof a ? a : b || {};
c.url = c.url || ("string" === typeof a ? a : "");
// e: true when the crossOrigin setting is on, the URL starts with "http", and
// its host is neither localhost/127.0.0.1 nor the current page's host.
var c = jQuery.ajaxSetup({}, c),
e = (function (a, c) {
var b = document.createElement("a");
b.href = a;
return (
c.crossOrigin &&
"http" == a.substr(0, 4).toLowerCase() &&
"localhost" != b.hostname &&
"127.0.0.1" != b.hostname &&
b.hostname != window.location.hostname
);
})(c.url, c);
// A non-empty `proxy` setting replaces the shared proxyJsonp endpoint (for
// this and all later calls) and marks the request as crossDomain.
c.proxy &&
0 < c.proxy.length &&
((proxyJsonp = c.proxy),
"object" === typeof a
? (a.crossDomain = !0)
: "object" === typeof b && (b.crossDomain = !0));
// Rewrite the request to go through the proxy: replace the URL, forward an
// optional charset, and force the response type to JSON.
e &&
("object" === typeof a
? a.url &&
((a.url = d(a.url)),
a.charset && (a.url += "&charset=" + a.charset),
(a.dataType = "json"))
: "string" === typeof a &&
"object" === typeof b &&
((a = d(a)),
b.charset && (a += "&charset=" + b.charset),
(b.dataType = "json")));
// NOTE(review): in the (url, settings) form the rewritten URL reaches the
// original function only through `arguments` tracking the reassigned
// parameter `a`, which presumably relies on this code not running in strict
// mode - verify.
return jQuery.ajaxOrig.apply(this, arguments);
};
jQuery.ajax.prototype = new jQuery.ajaxOrig();
jQuery.ajax.prototype.constructor = jQuery.ajax;
...@@ -185,7 +185,8 @@ function autoTable(res) { ...@@ -185,7 +185,8 @@ function autoTable(res) {
var s3 = `<td>${res.clients[i].client_address}</td>`; var s3 = `<td>${res.clients[i].client_address}</td>`;
var s4 = `<td>${res.clients[i].actor_num}</td>`; var s4 = `<td>${res.clients[i].actor_num}</td>`;
var s5 = `<td>${res.clients[i].time}</td>`; var s5 = `<td>${res.clients[i].time}</td>`;
tr.innerHTML = s1 + s2 + s3 + s4 + s5; var s6 = `<td><a href=${res.clients[i].log_monitor_url}>link</a></td>`;
tr.innerHTML = s1 + s2 + s3 + s4 + s5 + s6;
table.appendChild(tr); table.appendChild(tr);
} }
}; };
...@@ -43,10 +43,11 @@ ...@@ -43,10 +43,11 @@
<th scope="col">Hostname</th> <th scope="col">Hostname</th>
<th scope="col">Actor Num</th> <th scope="col">Actor Num</th>
<th scope="col">Time (min)</th> <th scope="col">Time (min)</th>
<th scope="col">Log</th>
</tr> </tr>
</thead> </thead>
<tbody id='table'> <tbody id='table'>
<th colspan="5">Loading Data...</th> <th colspan="6">Loading Data...</th>
</tbody> </tbody>
</table> </table>
</div> </div>
......
<!DOCTYPE html>
<html lang="en">
<head>
<meta charset="utf-8" />
<title>Parl Cluster</title>
<link rel="shortcut icon" href="../static/favicon.ico" />
<script type="text/javascript" src="../static/js/jquery.min.js"></script>
<script type="text/javascript" src="../static/js/ansi_up.js"></script>
<script
type="text/javascript"
src="../static/js/bootstrap-table.min.js"
></script>
<script src="https://cdn.bootcdn.net/ajax/libs/twitter-bootstrap/4.5.0/js/bootstrap.bundle.min.js"></script>
<link rel="stylesheet" href="../static/css/bootstrap-parl.min.css" />
</head>
<body>
<nav class="navbar navbar-expand-lg navbar-light bg-dark fixed-top">
<div class="container">
<a class="navbar-brand">
<img src="../static/logo.png" style="height: 30px;" />
</a>
<div class="collapse navbar-collapse" id="navbarSupportedContent">
<ul class="navbar-nav">
<li class="nav-item" id="worker_nav">
<a class="btn text-white" href="workers">Worker</a>
</li>
<li class="nav-item" id="client_nav">
<a class="btn text-white" href="clients">Client</a>
</li>
</ul>
</div>
</div>
</nav>
<div class="container" id="main-container">
<h5 class="font-weight-light text-center text-lg-left mt-4 mb-4">
Jobs Monitor
</h5>
<div class="card">
<div class="card-header" style="display: inline;">
<h3 style="display: inline;">
Remote Job Log
</h3>
<p
style="
float: right;
margin-bottom: 0rem;
position: relative;
bottom: -0.5rem;
"
>
Client ID: {{ client_id }}
</p>
</div>
</div>
<table id="table"></table>
</div>
<!-- Modal -->
<div
class="modal fade"
id="log-modal"
tabindex="-1"
role="dialog"
aria-hidden="true"
>
<div
class="modal-dialog modal-lg modal-dialog-scrollable"
role="document"
>
<div class="modal-content">
<div class="modal-header">
<h5 class="modal-title" id="log-modal-title">
Job ID:
</h5>
<button
type="button"
class="close"
data-dismiss="modal"
aria-label="Close"
></button>
</div>
<div class="modal-body">
<p id="log-content">
<div id="loading-spin" class="spinner-border text-primary" role="status">
<span class="sr-only">Loading...</span>
</div>
</p>
</div>
<div class="modal-footer">
<a style="position: relative; left: -160px; font-size: small;">
* Only the latest 500 lines of the log are shown, <br />
download the log file for the full log.
</a>
<button
type="button"
class="btn btn-secondary"
data-dismiss="modal"
>
Close
</button>
<a
role="button"
id="download-btn"
type="button"
class="btn btn-primary"
href=""
>
Download the complete log
</a>
</div>
</div>
</div>
</div>
<script>
let client_id = "{{ client_id }}";
let $table = $("#table");
// Initialise the bootstrap-table instance that lists the jobs of this client.
// Rows are fetched from the `get-jobs` endpoint; the "Log" and "Download"
// cells are rendered by urlButtonFormatter / downloadButtonFormatter.
function initTable() {
  const columns = [
    { field: "id", title: "ID" },
    { field: "job_id", title: "Job ID" },
    { field: "log_url", title: "Log", formatter: urlButtonFormatter },
    {
      field: "download_url",
      title: "Download",
      formatter: downloadButtonFormatter,
    },
  ];
  const options = {
    url: "get-jobs?client_id={{ client_id }}",
    pagination: true,
    pageSize: 10,
    pageList: [10, 25, 50, 100],
    columns: columns,
  };
  $table.bootstrapTable(options);
}
// Cell formatter for the "Log" column: renders a "view" button that opens the
// #log-modal dialog. The log URL and the job id (the text following
// "?job_id=" in the URL) are handed to the modal through data-* attributes.
function urlButtonFormatter(value, row, index) {
  const [, jobId] = value.split("?job_id=");
  const logUrl = value;
  return `<a
role="button"
data-toggle="modal"
data-target="#log-modal"
data-job-url="${logUrl}"
data-job-id="${jobId}"
class="btn btn-sm btn-outline-primary"
href="">view</a>`;
}
// Cell formatter for the "Download" column: renders a "link" button pointing
// at the job's download URL.
// The href value is quoted (and any double quote inside it escaped) because
// the URL contains characters such as "=" and "?" that are not valid in an
// unquoted HTML attribute value; this also matches urlButtonFormatter.
function downloadButtonFormatter(value, row, index) {
  const href = String(value).replace(/"/g, "&quot;");
  return `<a
role="button"
class="btn btn-sm btn-outline-primary"
href="${href}">link</a>`;
}
// Handle of the interval that polls the log while the modal is open
// (null when no poller is running).
let refresher_id = null;
let ansi_up = new AnsiUp();
// When the modal opens: show the job id, point the download button at the
// job's download URL and start polling the log once per second.
$("#log-modal").on("show.bs.modal", function (e) {
  let job_id = $(e.relatedTarget).data("job-id");
  let job_url = $(e.relatedTarget).data("job-url");
  $("#log-modal-title").text("Job ID: " + job_id);
  // NOTE(review): this swaps only the first "get" found anywhere in the URL
  // (a host name containing "get" would be rewritten instead of the path).
  $("#download-btn").attr("href", job_url.replace("get", "download"));
  // Never stack pollers: stop a previous one before starting a new one.
  if (refresher_id !== null) {
    clearInterval(refresher_id);
  }
  refresher_id = setInterval(() => {
    $.get(job_url, function (data, status) {
      // Declared locally; the original assignment leaked a global `html`.
      const html = ansi_up
        .ansi_to_html(data.log)
        .replace(/\r\n/g, "<br>")
        .replace(/\n/g, "<br>");
      $("#loading-spin").hide();
      $("#log-content").html(html);
    });
  }, 1000);
});
// When the modal closes: stop polling.
$("#log-modal").on("hide.bs.modal", function (e) {
  clearInterval(refresher_id);
  refresher_id = null;
});
$(document).ready(initTable);
setInterval(() => {
$table.bootstrapTable("refresh");
}, 10000);
</script>
</body>
</html>
...@@ -45,7 +45,10 @@ class TestMaxMemory(unittest.TestCase): ...@@ -45,7 +45,10 @@ class TestMaxMemory(unittest.TestCase):
def tearDown(self): def tearDown(self):
disconnect() disconnect()
def actor(self): #In windows, multiprocessing.Process cannot run the method of class, but static method is ok.
@staticmethod
def actor(cluster_addr):
parl.connect(cluster_addr)
actor1 = Actor() actor1 = Actor()
time.sleep(10) time.sleep(10)
actor1.add_500mb() actor1.add_500mb()
...@@ -56,16 +59,17 @@ class TestMaxMemory(unittest.TestCase): ...@@ -56,16 +59,17 @@ class TestMaxMemory(unittest.TestCase):
th = threading.Thread(target=master.run) th = threading.Thread(target=master.run)
th.start() th.start()
time.sleep(5) time.sleep(5)
worker = Worker('localhost:{}'.format(port), 1) cluster_addr = 'localhost:{}'.format(port)
cluster_monitor = ClusterMonitor('localhost:{}'.format(port)) worker = Worker(cluster_addr, 1)
cluster_monitor = ClusterMonitor(cluster_addr)
time.sleep(5) time.sleep(5)
parl.connect('localhost:{}'.format(port)) parl.connect(cluster_addr)
actor = Actor() actor = Actor()
time.sleep(20) time.sleep(20)
self.assertEqual(1, cluster_monitor.data['clients'][0]['actor_num']) self.assertEqual(1, cluster_monitor.data['clients'][0]['actor_num'])
del actor del actor
time.sleep(10) time.sleep(10)
p = Process(target=self.actor) p = Process(target=self.actor, args=(cluster_addr, ))
p.start() p.start()
for _ in range(6): for _ in range(6):
......
...@@ -22,7 +22,6 @@ import time ...@@ -22,7 +22,6 @@ import time
import threading import threading
from parl.remote.client import disconnect from parl.remote.client import disconnect
from parl.remote import exceptions from parl.remote import exceptions
import timeout_decorator
import subprocess import subprocess
......
...@@ -22,7 +22,6 @@ import time ...@@ -22,7 +22,6 @@ import time
import threading import threading
from parl.remote.client import disconnect from parl.remote.client import disconnect
from parl.remote import exceptions from parl.remote import exceptions
import timeout_decorator
import subprocess import subprocess
......
...@@ -22,7 +22,6 @@ import time ...@@ -22,7 +22,6 @@ import time
import threading import threading
from parl.remote.client import disconnect from parl.remote.client import disconnect
from parl.remote import exceptions from parl.remote import exceptions
import timeout_decorator
import subprocess import subprocess
......
...@@ -21,8 +21,8 @@ import time ...@@ -21,8 +21,8 @@ import time
import threading import threading
from parl.remote.client import disconnect from parl.remote.client import disconnect
from parl.remote import exceptions from parl.remote import exceptions
import timeout_decorator
import subprocess import subprocess
from parl.utils import logger
@parl.remote_class @parl.remote_class
...@@ -63,20 +63,24 @@ class TestCluster(unittest.TestCase): ...@@ -63,20 +63,24 @@ class TestCluster(unittest.TestCase):
disconnect() disconnect()
def test_actor_exception(self): def test_actor_exception(self):
master = Master(port=1235) logger.info("running:test_actor_exception")
master = Master(port=8235)
th = threading.Thread(target=master.run) th = threading.Thread(target=master.run)
th.start() th.start()
time.sleep(3) time.sleep(3)
worker1 = Worker('localhost:1235', 1) worker1 = Worker('localhost:8235', 1)
for _ in range(3): for _ in range(3):
if master.cpu_num == 1: if master.cpu_num == 1:
break break
time.sleep(10) time.sleep(10)
self.assertEqual(1, master.cpu_num) self.assertEqual(1, master.cpu_num)
parl.connect('localhost:1235') logger.info("running:test_actor_exception: 0")
parl.connect('localhost:8235')
logger.info("running:test_actor_exception: 1")
with self.assertRaises(exceptions.RemoteError): with self.assertRaises(exceptions.RemoteError):
actor = Actor(abcd='a bug') actor = Actor(abcd='a bug')
logger.info("running:test_actor_exception: 2")
actor2 = Actor() actor2 = Actor()
for _ in range(3): for _ in range(3):
...@@ -89,15 +93,15 @@ class TestCluster(unittest.TestCase): ...@@ -89,15 +93,15 @@ class TestCluster(unittest.TestCase):
master.exit() master.exit()
worker1.exit() worker1.exit()
@timeout_decorator.timeout(seconds=800) def test_actor_exception_2(self):
def test_actor_exception(self): logger.info("running: test_actor_exception_2")
master = Master(port=1236) master = Master(port=8236)
th = threading.Thread(target=master.run) th = threading.Thread(target=master.run)
th.start() th.start()
time.sleep(3) time.sleep(3)
worker1 = Worker('localhost:1236', 1) worker1 = Worker('localhost:8236', 1)
self.assertEqual(1, master.cpu_num) self.assertEqual(1, master.cpu_num)
parl.connect('localhost:1236') parl.connect('localhost:8236')
actor = Actor() actor = Actor()
try: try:
actor.will_raise_exception_func() actor.will_raise_exception_func()
...@@ -116,14 +120,15 @@ class TestCluster(unittest.TestCase): ...@@ -116,14 +120,15 @@ class TestCluster(unittest.TestCase):
master.exit() master.exit()
def test_reset_actor(self): def test_reset_actor(self):
logger.info("running: test_reset_actor")
# start the master # start the master
master = Master(port=1237) master = Master(port=8237)
th = threading.Thread(target=master.run) th = threading.Thread(target=master.run)
th.start() th.start()
time.sleep(3) time.sleep(3)
worker1 = Worker('localhost:1237', 4) worker1 = Worker('localhost:8237', 4)
parl.connect('localhost:1237') parl.connect('localhost:8237')
for _ in range(10): for _ in range(10):
actor = Actor() actor = Actor()
ret = actor.add_one(1) ret = actor.add_one(1)
...@@ -140,19 +145,20 @@ class TestCluster(unittest.TestCase): ...@@ -140,19 +145,20 @@ class TestCluster(unittest.TestCase):
master.exit() master.exit()
def test_add_worker(self): def test_add_worker(self):
master = Master(port=1234) logger.info("running: test_add_worker")
master = Master(port=8234)
th = threading.Thread(target=master.run) th = threading.Thread(target=master.run)
th.start() th.start()
time.sleep(1) time.sleep(1)
worker1 = Worker('localhost:1234', 4) worker1 = Worker('localhost:8234', 4)
for _ in range(3): for _ in range(3):
if master.cpu_num == 4: if master.cpu_num == 4:
break break
time.sleep(10) time.sleep(10)
self.assertEqual(master.cpu_num, 4) self.assertEqual(master.cpu_num, 4)
worker2 = Worker('localhost:1234', 4) worker2 = Worker('localhost:8234', 4)
for _ in range(3): for _ in range(3):
if master.cpu_num == 8: if master.cpu_num == 8:
break break
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. # Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
...@@ -11,14 +11,28 @@ ...@@ -11,14 +11,28 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import os
os.environ['XPARL'] = 'True'
import parl
import unittest
import warnings
warnings.simplefilter('default') @parl.remote_class(max_memory=350)
class Actor(object):
def __init__(self, x=10):
self.x = x
self.data = []
warnings.warn( def add_500mb(self):
"module `parl.framework.model_base.Model` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.Model` instead.", self.data.append(os.urandom(500 * 1024**2))
DeprecationWarning, self.x += 1
stacklevel=2) return self.x
from parl.core.fluid.model import *
class TestLocalActor(unittest.TestCase):
def test_create_actors_without_pre_connection(self):
actor = Actor()
if __name__ == '__main__':
unittest.main()
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import json
import multiprocessing
import os
import pickle
import subprocess
import sys
import tempfile
import threading
import time
import unittest
import requests
import parl
from parl.remote.client import disconnect, get_global_client
from parl.remote.master import Master
from parl.remote.worker import Worker
from parl.utils import _IS_WINDOWS
@parl.remote_class
class Actor(object):
    """Actor that prints to stdout and returns the text it printed.

    The returned text lets the tests compare what the actor wrote against
    the log content served for its job.
    """

    def __init__(self, number=None, arg1=None, arg2=None):
        self.number = number
        self.arg1 = arg1
        self.arg2 = arg2
        print("Init actor...")
        # Text printed during construction; prepended to every sim_output().
        self.init_output = "Init actor...\n"

    def sim_output(self, start, end):
        """Print ``self.number`` and every integer in ``[start, end)``.

        Returns:
            The construction message followed by everything printed here,
            one value per line, each line terminated by a newline.
        """
        print(self.number)
        printed = [str(self.number)]
        for value in range(start, end):
            print(value)
            printed.append(str(value))
        body = "".join(item + "\n" for item in printed)
        return self.init_output + body
class TestLogServer(unittest.TestCase):
    """Tests for the worker log server and the monitor's job-log endpoints.

    Each test starts its own master and worker on fixed ports. Cleanup runs
    in ``finally`` blocks so that a failed assertion does not leave the
    master thread, the worker or the monitor subprocess running.
    """

    def tearDown(self):
        disconnect()

    # In windows, multiprocessing.Process cannot run the method of class, but static method is ok.
    @staticmethod
    def _connect_and_create_actor(cluster_addr):
        """Connect to the cluster, run two actors and collect their output.

        Args:
            cluster_addr: ``host:port`` address of the master.

        Returns:
            A list with the string returned by ``Actor.sim_output(1, 4)``
            for each of the two actors.
        """
        parl.connect(cluster_addr)
        outputs = []
        for i in range(2):
            actor = Actor(number=i)
            ret = actor.sim_output(1, 4)
            assert ret != ""
            outputs.append(ret)
        return outputs

    def test_log_server(self):
        """Query the log server of every job directly (view and download)."""
        master_port = 8401
        # start the master
        master = Master(port=master_port)
        th = threading.Thread(target=master.run)
        th.start()
        time.sleep(1)

        worker = None
        try:
            cluster_addr = 'localhost:{}'.format(master_port)
            log_server_port = 8402
            worker = Worker(cluster_addr, 4, log_server_port=log_server_port)
            outputs = self._connect_and_create_actor(cluster_addr)

            # Get status
            status = master._get_status()
            client_jobs = pickle.loads(status).get('client_jobs')
            self.assertIsNotNone(client_jobs)

            # Get job id
            client = get_global_client()
            jobs = client_jobs.get(client.client_id)
            self.assertIsNotNone(jobs)

            for job_id, log_server_addr in jobs.items():
                log_url = "http://{}/get-log".format(log_server_addr)
                # Test response without job_id
                r = requests.get(log_url)
                self.assertEqual(r.status_code, 400)
                # Test normal response
                r = requests.get(log_url, params={'job_id': job_id})
                self.assertEqual(r.status_code, 200)
                log_content = json.loads(r.text).get('log')
                self.assertIsNotNone(log_content)
                log_content = log_content.replace('\r\n', '\n')
                self.assertIn(log_content, outputs)

                # Test download
                download_url = "http://{}/download-log".format(
                    log_server_addr)
                r = requests.get(download_url, params={'job_id': job_id})
                self.assertEqual(r.status_code, 200)
                log_content = r.text.replace('\r\n', '\n')
                self.assertIn(log_content, outputs)
        finally:
            # Always tear the cluster down, even after a failed assertion.
            disconnect()
            if worker is not None:
                worker.exit()
            master.exit()

    def test_monitor_query_log_server(self):
        """Reach the job logs through the URLs reported by the monitor."""
        master_port = 8403
        monitor_port = 8404
        # start the master
        master = Master(port=master_port, monitor_port=monitor_port)
        th = threading.Thread(target=master.run)
        th.start()
        time.sleep(1)

        # start the cluster monitor
        # __file__ may end in .pyc or .py; handle both to locate monitor.py.
        monitor_file = __file__.replace(
            os.path.join('tests', 'log_server_test.pyc'), 'monitor.py')
        monitor_file = monitor_file.replace(
            os.path.join('tests', 'log_server_test.py'), 'monitor.py')
        command = [
            sys.executable, monitor_file, "--monitor_port",
            str(monitor_port), "--address", "localhost:" + str(master_port)
        ]
        if _IS_WINDOWS:
            FNULL = tempfile.TemporaryFile()
        else:
            FNULL = open(os.devnull, 'w')

        monitor_proc = None
        worker = None
        try:
            monitor_proc = subprocess.Popen(
                command,
                stdout=FNULL,
                stderr=subprocess.STDOUT,
            )

            # Start worker
            cluster_addr = 'localhost:{}'.format(master_port)
            log_server_port = 8405
            worker = Worker(cluster_addr, 4, log_server_port=log_server_port)

            # Test monitor API
            outputs = self._connect_and_create_actor(cluster_addr)
            time.sleep(5)  # Wait for the status update
            client = get_global_client()
            jobs_url = "{}/get-jobs?client_id={}".format(
                master.monitor_url, client.client_id)
            r = requests.get(jobs_url)
            self.assertEqual(r.status_code, 200)
            data = json.loads(r.text)
            for job in data:
                log_url = job.get('log_url')
                self.assertIsNotNone(log_url)
                r = requests.get(log_url)
                self.assertEqual(r.status_code, 200)
                log_content = json.loads(r.text).get('log')
                self.assertIsNotNone(log_content)
                log_content = log_content.replace('\r\n', '\n')
                self.assertIn(log_content, outputs)

                # Test download
                download_url = job.get('download_url')
                r = requests.get(download_url)
                self.assertEqual(r.status_code, 200)
                log_content = r.text.replace('\r\n', '\n')
                self.assertIn(log_content, outputs)
        finally:
            # Clean context
            if monitor_proc is not None:
                monitor_proc.kill()
                monitor_proc.wait()
            # Release the handle used as the monitor's stdout/stderr sink.
            FNULL.close()
            disconnect()
            if worker is not None:
                worker.exit()
            master.exit()
if __name__ == '__main__':
unittest.main()
...@@ -16,7 +16,6 @@ import unittest ...@@ -16,7 +16,6 @@ import unittest
import parl import parl
import time import time
import threading import threading
import timeout_decorator
import multiprocessing import multiprocessing
from parl.remote.master import Master from parl.remote.master import Master
...@@ -39,12 +38,14 @@ class TestCluster(unittest.TestCase): ...@@ -39,12 +38,14 @@ class TestCluster(unittest.TestCase):
def tearDown(self): def tearDown(self):
disconnect() disconnect()
def _connect_and_create_actor(self, cluster_addr): #In windows, multiprocessing.Process cannot run the method of class, but static method is ok.
@staticmethod
def _connect_and_create_actor(cluster_addr):
parl.connect(cluster_addr) parl.connect(cluster_addr)
for _ in range(2): for _ in range(2):
actor = Actor() actor = Actor()
ret = actor.add_one(1) ret = actor.add_one(1)
self.assertEqual(ret, 2) assert ret == 2
disconnect() disconnect()
def _create_actor(self): def _create_actor(self):
...@@ -53,7 +54,6 @@ class TestCluster(unittest.TestCase): ...@@ -53,7 +54,6 @@ class TestCluster(unittest.TestCase):
ret = actor.add_one(1) ret = actor.add_one(1)
self.assertEqual(ret, 2) self.assertEqual(ret, 2)
@timeout_decorator.timeout(seconds=300)
def test_connect_and_create_actor_in_multiprocessing_with_connected_in_main_process( def test_connect_and_create_actor_in_multiprocessing_with_connected_in_main_process(
self): self):
# start the master # start the master
......
...@@ -16,7 +16,6 @@ import unittest ...@@ -16,7 +16,6 @@ import unittest
import parl import parl
import time import time
import threading import threading
import timeout_decorator
import multiprocessing import multiprocessing
from parl.remote.master import Master from parl.remote.master import Master
...@@ -39,12 +38,14 @@ class TestCluster(unittest.TestCase): ...@@ -39,12 +38,14 @@ class TestCluster(unittest.TestCase):
def tearDown(self): def tearDown(self):
disconnect() disconnect()
def _connect_and_create_actor(self, cluster_addr): #In windows, multiprocessing.Process cannot run the method of class, but static method is ok.
@staticmethod
def _connect_and_create_actor(cluster_addr):
parl.connect(cluster_addr) parl.connect(cluster_addr)
for _ in range(2): for _ in range(2):
actor = Actor() actor = Actor()
ret = actor.add_one(1) ret = actor.add_one(1)
self.assertEqual(ret, 2) assert ret == 2
disconnect() disconnect()
def _create_actor(self): def _create_actor(self):
...@@ -53,7 +54,6 @@ class TestCluster(unittest.TestCase): ...@@ -53,7 +54,6 @@ class TestCluster(unittest.TestCase):
ret = actor.add_one(1) ret = actor.add_one(1)
self.assertEqual(ret, 2) self.assertEqual(ret, 2)
@timeout_decorator.timeout(seconds=300)
def test_connect_and_create_actor_in_multiprocessing_without_connected_in_main_process( def test_connect_and_create_actor_in_multiprocessing_without_connected_in_main_process(
self): self):
# start the master # start the master
......
...@@ -4,8 +4,7 @@ ...@@ -4,8 +4,7 @@
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
# You may obtain a copy of the License at # You may obtain a copy of the License at
# #
# http://www.apache.org/licenses/LICENSE-2.0 # http://www.apache.org/licenses/LICENSE-2.0 #
#
# Unless required by applicable law or agreed to in writing, software # Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS, # distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
...@@ -16,12 +15,12 @@ import unittest ...@@ -16,12 +15,12 @@ import unittest
import parl import parl
import time import time
import threading import threading
import timeout_decorator
import multiprocessing import multiprocessing
from parl.remote.master import Master from parl.remote.master import Master
from parl.remote.worker import Worker from parl.remote.worker import Worker
from parl.remote.client import disconnect from parl.remote.client import disconnect
from parl.utils import _IS_WINDOWS
@parl.remote_class @parl.remote_class
...@@ -39,21 +38,14 @@ class TestCluster(unittest.TestCase): ...@@ -39,21 +38,14 @@ class TestCluster(unittest.TestCase):
def tearDown(self): def tearDown(self):
disconnect() disconnect()
def _connect_and_create_actor(self, cluster_addr): #In windows, multiprocessing.Process cannot run the method of class, but static method is ok.
parl.connect(cluster_addr) @staticmethod
for _ in range(2): def _create_actor():
actor = Actor()
ret = actor.add_one(1)
self.assertEqual(ret, 2)
disconnect()
def _create_actor(self):
for _ in range(2): for _ in range(2):
actor = Actor() actor = Actor()
ret = actor.add_one(1) ret = actor.add_one(1)
self.assertEqual(ret, 2) assert ret == 2
@timeout_decorator.timeout(seconds=300)
def test_create_actor_in_multiprocessing(self): def test_create_actor_in_multiprocessing(self):
# start the master # start the master
master = Master(port=8240) master = Master(port=8240)
...@@ -64,6 +56,7 @@ class TestCluster(unittest.TestCase): ...@@ -64,6 +56,7 @@ class TestCluster(unittest.TestCase):
worker1 = Worker('localhost:8240', 4) worker1 = Worker('localhost:8240', 4)
parl.connect('localhost:8240') parl.connect('localhost:8240')
if not _IS_WINDOWS: # In windows, fork process cannot access client created in main process.
proc1 = multiprocessing.Process(target=self._create_actor) proc1 = multiprocessing.Process(target=self._create_actor)
proc2 = multiprocessing.Process(target=self._create_actor) proc2 = multiprocessing.Process(target=self._create_actor)
proc1.start() proc1.start()
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
from parl.utils import logger
import parl
from parl.remote.client import disconnect
from parl.remote.master import Master
from parl.remote.worker import Worker
import time
import threading
c = 10
port = 3002
if __name__ == '__main__':
master = Master(port=port)
th = threading.Thread(target=master.run)
th.setDaemon(True)
th.start()
time.sleep(5)
cluster_addr = 'localhost:{}'.format(port)
parl.connect(cluster_addr)
worker = Worker(cluster_addr, 1)
@parl.remote_class
class Actor(object):
def add(self, a, b):
return a + b + c
actor = Actor()
# Checks that a method of the @parl.remote_class Actor defined above can read
# the module-level variable `c`.
class TestRecursive_actor(unittest.TestCase):
# Calls parl.remote.client.disconnect() after every test.
def tearDown(self):
disconnect()
# Actor.add returns a + b + c; with c = 10 at module level, add(1, 2) is 13.
def test_global_running(self):
self.assertEqual(actor.add(1, 2), 13)
# NOTE(review): `master` and `worker` are module-level names created in the
# start-up section near the top of this file, not attributes of the test.
master.exit()
worker.exit()
if __name__ == '__main__':
unittest.main()
...@@ -23,7 +23,6 @@ import time ...@@ -23,7 +23,6 @@ import time
import threading import threading
import subprocess import subprocess
import sys import sys
import timeout_decorator
@parl.remote_class @parl.remote_class
...@@ -63,7 +62,6 @@ class TestJob(unittest.TestCase): ...@@ -63,7 +62,6 @@ class TestJob(unittest.TestCase):
def tearDown(self): def tearDown(self):
disconnect() disconnect()
@timeout_decorator.timeout(seconds=600)
def test_acor_exit_exceptionally(self): def test_acor_exit_exceptionally(self):
port = 1337 port = 1337
master = Master(port) master = Master(port)
......
...@@ -16,7 +16,8 @@ import parl ...@@ -16,7 +16,8 @@ import parl
from parl.remote.master import Master from parl.remote.master import Master
from parl.remote.worker import Worker from parl.remote.worker import Worker
from parl.remote.client import disconnect from parl.remote.client import disconnect
from parl.utils import logger from parl.utils import logger, _IS_WINDOWS
import os
import threading import threading
import time import time
import subprocess import subprocess
...@@ -70,8 +71,13 @@ class TestJobAlone(unittest.TestCase): ...@@ -70,8 +71,13 @@ class TestJobAlone(unittest.TestCase):
time.sleep(1) time.sleep(1)
self.assertEqual(master.cpu_num, 4) self.assertEqual(master.cpu_num, 4)
print("We are going to kill all the jobs.") print("We are going to kill all the jobs.")
if _IS_WINDOWS:
command = r'''for /F "skip=2 tokens=2 delims=," %a in ('wmic process where "commandline like '%remote\\job.py%'" get processid^,status /format:csv') do taskkill /F /T /pid %a'''
print(os.popen(command).read())
else:
command = ( command = (
"ps aux | grep remote/job.py | awk '{print $2}' | xargs kill -9") "ps aux | grep remote/job.py | awk '{print $2}' | xargs kill -9"
)
subprocess.call([command], shell=True) subprocess.call([command], shell=True)
parl.connect('localhost:1334') parl.connect('localhost:1334')
actor = Actor() actor = Actor()
......
...@@ -21,6 +21,7 @@ import threading ...@@ -21,6 +21,7 @@ import threading
from parl.remote.master import Master from parl.remote.master import Master
from parl.remote.worker import Worker from parl.remote.worker import Worker
from parl.remote.client import disconnect from parl.remote.client import disconnect
from parl.utils import _IS_WINDOWS
@parl.remote_class @parl.remote_class
...@@ -44,12 +45,15 @@ class TestSendFile(unittest.TestCase): ...@@ -44,12 +45,15 @@ class TestSendFile(unittest.TestCase):
worker = Worker('localhost:{}'.format(port), 1) worker = Worker('localhost:{}'.format(port), 1)
time.sleep(2) time.sleep(2)
os.system('mkdir ./rom_files') tmp_dir = 'rom_files'
os.system('touch ./rom_files/pong.bin') tmp_file = os.path.join(tmp_dir, 'pong.bin')
assert os.path.exists('./rom_files/pong.bin') os.system('mkdir {}'.format(tmp_dir))
parl.connect( if _IS_WINDOWS:
'localhost:{}'.format(port), os.system('type NUL >> {}'.format(tmp_file))
distributed_files=['./rom_files/pong.bin']) else:
os.system('touch {}'.format(tmp_file))
assert os.path.exists(tmp_file)
parl.connect('localhost:{}'.format(port), distributed_files=[tmp_file])
time.sleep(5) time.sleep(5)
actor = Actor() actor = Actor()
for _ in range(10): for _ in range(10):
...@@ -70,8 +74,9 @@ class TestSendFile(unittest.TestCase): ...@@ -70,8 +74,9 @@ class TestSendFile(unittest.TestCase):
worker = Worker('localhost:{}'.format(port), 1) worker = Worker('localhost:{}'.format(port), 1)
time.sleep(2) time.sleep(2)
tmp_file = os.path.join('rom_files', 'no_pong.bin')
self.assertRaises(Exception, parl.connect, 'localhost:{}'.format(port), self.assertRaises(Exception, parl.connect, 'localhost:{}'.format(port),
['./rom_files/no_pong.bin']) [tmp_file])
worker.exit() worker.exit()
master.exit() master.exit()
......
...@@ -17,12 +17,10 @@ import parl ...@@ -17,12 +17,10 @@ import parl
from parl.remote.master import Master from parl.remote.master import Master
from parl.remote.worker import Worker from parl.remote.worker import Worker
from parl.remote.client import disconnect from parl.remote.client import disconnect
import os
import time import time
import threading import threading
import sys import sys
import numpy as np import numpy as np
import json import json
...@@ -65,7 +63,8 @@ class TestConfigfile(unittest.TestCase): ...@@ -65,7 +63,8 @@ class TestConfigfile(unittest.TestCase):
parl.connect('localhost:1335', ['random.npy', 'config.json']) parl.connect('localhost:1335', ['random.npy', 'config.json'])
actor = Actor('random.npy', 'config.json') actor = Actor('random.npy', 'config.json')
time.sleep(5) time.sleep(5)
os.remove('./random.npy')
os.remove('./config.json')
remote_sum = actor.random_sum() remote_sum = actor.random_sum()
self.assertEqual(remote_sum, random_sum) self.assertEqual(remote_sum, random_sum)
time.sleep(10) time.sleep(10)
......
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
from contextlib import contextmanager
__all__ = ['load_remote_class', 'redirect_stdout_to_file']
def simplify_code(code, end_of_file):
    """Trim a module's source lines down to the part that defines a class.

    Importing the module that holds a remote class may execute unrelated
    top-level code, so only the leading part of the file is kept.

    Two rules are applied while walking over ``code``:

    * lines that start with ``parl.connect`` are always dropped;
    * any other line is kept while its 0-based index is smaller than
      ``end_of_file - 1``; the walk stops at the first line beyond that.

    For example, given

        @parl.remote_actor
        class A(object):
            def add(self, a, b):
                return a + b
        def data_process():
            XXXX

    the last two lines are removed because they are not class related.

    Args:
        code: list of source lines.
        end_of_file: line ID marking the last line that defines the class.

    Returns:
        list of the lines that were kept, in their original order.
    """
    last_index = end_of_file - 1
    kept = []
    for index, text in enumerate(code):
        if text.startswith('parl.connect'):
            continue
        if index >= last_index:
            break
        kept.append(text)
    return kept
def load_remote_class(file_name, class_name, end_of_file):
    """Import a class from a trimmed copy of the module that defines it.

    The source ``<file_name>.py`` is read, reduced with ``simplify_code``,
    written to ``xparl_<file_name>.py`` in the current directory, and that
    copy is imported as module ``xparl_<file_name>``.

    Args:
        file_name: name of the file (without ``.py``) that defines the class.
        class_name: name of the class to load.
        end_of_file: line ID to indicate the last line that defines the class.

    Returns:
        cls: the class object looked up on the imported module.
    """
    module_name = 'xparl_' + file_name
    source_path = file_name + '.py'
    trimmed_path = module_name + '.py'

    with open(source_path) as source_file:
        kept_lines = simplify_code(source_file.readlines(), end_of_file)

    with open(trimmed_path, 'w') as trimmed_file:
        trimmed_file.writelines(kept_lines)

    module = __import__(module_name)
    return getattr(module, class_name)
@contextmanager
def redirect_stdout_to_file(file_path):
    """Context manager that sends stdout (e.g., `print`) to a file.

    The file is opened in append mode. On exit ``sys.stdout`` is set back to
    what it was on entry and the file is closed, also when the body raises.

    Example:

        >>> print('test')
        test
        >>> with redirect_stdout_to_file('test.log'):
        ...     print('test')  # Output nothing, `test` is printed to `test.log`.
        >>> print('test')
        test

    Args:
        file_path: Path of the file to output the stdout.
    """
    previous_stdout = sys.stdout
    with open(file_path, 'a') as log_file:
        sys.stdout = log_file
        try:
            yield
        finally:
            sys.stdout = previous_stdout
...@@ -20,13 +20,14 @@ import signal ...@@ -20,13 +20,14 @@ import signal
import socket import socket
import subprocess import subprocess
import sys import sys
import tempfile
import time import time
import threading import threading
import warnings import warnings
import zmq import zmq
from datetime import datetime from datetime import datetime
from parl.utils import get_ip_address, to_byte, to_str, logger from parl.utils import get_ip_address, to_byte, to_str, logger, _IS_WINDOWS, kill_process
from parl.remote import remote_constants from parl.remote import remote_constants
from parl.remote.message import InitializedWorker from parl.remote.message import InitializedWorker
from parl.remote.status import WorkerStatus from parl.remote.status import WorkerStatus
...@@ -63,7 +64,7 @@ class Worker(object): ...@@ -63,7 +64,7 @@ class Worker(object):
cpu_num (int): Number of cpu to be used on the worker. cpu_num (int): Number of cpu to be used on the worker.
""" """
def __init__(self, master_address, cpu_num=None): def __init__(self, master_address, cpu_num=None, log_server_port=None):
self.lock = threading.Lock() self.lock = threading.Lock()
self.heartbeat_socket_initialized = threading.Event() self.heartbeat_socket_initialized = threading.Event()
self.ctx = zmq.Context.instance() self.ctx = zmq.Context.instance()
...@@ -75,9 +76,13 @@ class Worker(object): ...@@ -75,9 +76,13 @@ class Worker(object):
self._set_cpu_num(cpu_num) self._set_cpu_num(cpu_num)
self.job_buffer = queue.Queue(maxsize=self.cpu_num) self.job_buffer = queue.Queue(maxsize=self.cpu_num)
self._create_sockets() self._create_sockets()
# create log server
self.log_server_proc, self.log_server_address = self._create_log_server(
port=log_server_port)
# create a thread that waits commands from the job to kill the job. # create a thread that waits commands from the job to kill the job.
self.kill_job_thread = threading.Thread(target=self._reply_kill_job) self.kill_job_thread = threading.Thread(target=self._reply_kill_job)
self.kill_job_thread.setDaemon(True)
self.kill_job_thread.start() self.kill_job_thread.start()
self._create_jobs() self._create_jobs()
...@@ -169,6 +174,7 @@ class Worker(object): ...@@ -169,6 +174,7 @@ class Worker(object):
def _fill_job_buffer(self): def _fill_job_buffer(self):
"""An endless loop that adds initialized job into the job buffer""" """An endless loop that adds initialized job into the job buffer"""
initialized_jobs = []
while self.worker_is_alive: while self.worker_is_alive:
if self.job_buffer.full() is False: if self.job_buffer.full() is False:
job_num = self.cpu_num - self.job_buffer.qsize() job_num = self.cpu_num - self.job_buffer.qsize()
...@@ -178,13 +184,7 @@ class Worker(object): ...@@ -178,13 +184,7 @@ class Worker(object):
self.job_buffer.put(job) self.job_buffer.put(job)
time.sleep(0.02) time.sleep(0.02)
self.exit()
# release jobs if the worker is not alive
for job in initialized_jobs:
try:
os.kill(job.pid, signal.SIGTERM)
except OSError:
pass
def _init_jobs(self, job_num): def _init_jobs(self, job_num):
"""Create jobs. """Create jobs.
...@@ -196,7 +196,8 @@ class Worker(object): ...@@ -196,7 +196,8 @@ class Worker(object):
job_file = job_file.replace('worker.py', 'job.py') job_file = job_file.replace('worker.py', 'job.py')
command = [ command = [
sys.executable, job_file, "--worker_address", sys.executable, job_file, "--worker_address",
self.reply_job_address self.reply_job_address, "--log_server_address",
self.log_server_address
] ]
if sys.version_info.major == 3: if sys.version_info.major == 3:
...@@ -223,6 +224,7 @@ class Worker(object): ...@@ -223,6 +224,7 @@ class Worker(object):
# a thread for sending heartbeat signals to job # a thread for sending heartbeat signals to job
thread = threading.Thread( thread = threading.Thread(
target=self._create_job_monitor, args=(initialized_job, )) target=self._create_job_monitor, args=(initialized_job, ))
thread.setDaemon(True)
thread.start() thread.start()
self.lock.release() self.lock.release()
assert len(new_jobs) > 0, "init jobs failed" assert len(new_jobs) > 0, "init jobs failed"
...@@ -311,6 +313,9 @@ class Worker(object): ...@@ -311,6 +313,9 @@ class Worker(object):
total_memory = round(virtual_memory[0] / (1024**3), 2) total_memory = round(virtual_memory[0] / (1024**3), 2)
used_memory = round(virtual_memory[3] / (1024**3), 2) used_memory = round(virtual_memory[3] / (1024**3), 2)
vacant_memory = round(total_memory - used_memory, 2) vacant_memory = round(total_memory - used_memory, 2)
if _IS_WINDOWS:
load_average = round(psutil.getloadavg()[0], 2)
else:
load_average = round(os.getloadavg()[0], 2) load_average = round(os.getloadavg()[0], 2)
return (vacant_memory, used_memory, now, load_average) return (vacant_memory, used_memory, now, load_average)
...@@ -329,7 +334,7 @@ class Worker(object): ...@@ -329,7 +334,7 @@ class Worker(object):
logger.set_dir( logger.set_dir(
os.path.expanduser('~/.parl_data/worker/{}'.format( os.path.expanduser('~/.parl_data/worker/{}'.format(
self.master_heartbeat_address))) self.master_heartbeat_address.replace(':', '_'))))
self.heartbeat_socket_initialized.set() self.heartbeat_socket_initialized.set()
logger.info("[Worker] Connect to the master node successfully. " logger.info("[Worker] Connect to the master node successfully. "
...@@ -351,15 +356,47 @@ class Worker(object): ...@@ -351,15 +356,47 @@ class Worker(object):
break break
socket.close(0) socket.close(0)
logger.warning( logger.warning(
"[Worker] lost connection with the master, will exit replying heartbeat for master." "[Worker] lost connection with the master, will exit reply heartbeat for master."
) )
self.worker_status.clear() self.worker_status.clear()
self.log_server_proc.kill()
self.log_server_proc.wait()
# exit the worker # exit the worker
self.worker_is_alive = False self.worker_is_alive = False
self.exit()
def _create_log_server(self, port):
    """Start the log server as a child process and return how to reach it.

    ``log_server.py`` is looked up next to this file and launched with the
    current interpreter; its stdout and stderr are discarded.

    Args:
        port: port the log server should listen on, or None to have a free
            port chosen here.

    Returns:
        tuple: ``(log_server_proc, log_server_address)`` where
        ``log_server_proc`` is the ``subprocess.Popen`` handle and
        ``log_server_address`` is the string ``"<worker_ip>:<port>"``.
    """
    log_server_file = __file__.replace('worker.pyc', 'log_server.py')
    log_server_file = log_server_file.replace('worker.py', 'log_server.py')

    if port is None:
        # Passing "0" would let the server pick a random port, but then the
        # address returned below would advertise port 0, which nobody can
        # connect to. Reserve a concrete free port instead. Another process
        # could grab it before the server binds; that race is accepted.
        probe = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
        try:
            probe.bind(('', 0))
            port = probe.getsockname()[1]
        finally:
            probe.close()

    # NOTE(review): "~" is passed verbatim (no shell involved); presumably
    # log_server.py expands it itself -- confirm.
    command = [
        sys.executable, log_server_file, "--port",
        str(port), "--log_dir", "~/.parl_data/job/", "--line_num", "500"
    ]

    if sys.version_info.major == 3:
        warnings.simplefilter("ignore", ResourceWarning)

    if _IS_WINDOWS:
        FNULL = tempfile.TemporaryFile()
    else:
        FNULL = open(os.devnull, 'w')
    try:
        log_server_proc = subprocess.Popen(
            command,
            stdout=FNULL,
            stderr=subprocess.STDOUT,
        )
    finally:
        # the child holds its own copy of the handle; always release ours
        FNULL.close()

    log_server_address = "{}:{}".format(self.worker_ip, port)
    return log_server_proc, log_server_address
def exit(self): def exit(self):
"""close the worker""" """close the worker"""
self.worker_is_alive = False self.worker_is_alive = False
kill_process('remote/job.py.*{}'.format(self.reply_job_address))
def run(self): def run(self):
"""Keep running until it lost connection with the master. """Keep running until it lost connection with the master.
......
...@@ -14,6 +14,8 @@ ...@@ -14,6 +14,8 @@
import cloudpickle import cloudpickle
import pyarrow import pyarrow
import subprocess
import os
from parl.utils import SerializeError, DeserializeError from parl.utils import SerializeError, DeserializeError
__all__ = ['dumps_argument', 'loads_argument', 'dumps_return', 'loads_return'] __all__ = ['dumps_argument', 'loads_argument', 'dumps_return', 'loads_return']
......
...@@ -14,28 +14,28 @@ ...@@ -14,28 +14,28 @@
import os import os
import platform import platform
import random
import socket
import subprocess import subprocess
from parl.utils import logger from parl.utils import logger, _HAS_FLUID, _IS_WINDOWS
from parl.utils import utils
__all__ = ['get_gpu_count', 'get_ip_address', 'is_gpu_available'] __all__ = [
'get_gpu_count', 'get_ip_address', 'is_gpu_available', 'get_free_tcp_port',
'is_port_available', 'get_port_from_range'
]
def get_ip_address(): def get_ip_address():
""" """
get the IP address of the host. get the IP address of the host.
""" """
platform_sys = platform.system()
# Only support Linux and MacOS
if platform_sys != 'Linux' and platform_sys != 'Darwin':
logger.warning(
'get_ip_address only support Linux and MacOS, please set ip address manually.'
)
return None
# Windows
if _IS_WINDOWS:
local_ip = socket.gethostbyname(socket.gethostname())
else:
# Linux and MacOS
local_ip = None local_ip = None
import socket
try: try:
# First way, tested in Ubuntu and MacOS # First way, tested in Ubuntu and MacOS
s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
...@@ -97,10 +97,40 @@ def is_gpu_available(): ...@@ -97,10 +97,40 @@ def is_gpu_available():
True if a gpu device can be found. True if a gpu device can be found.
""" """
ret = get_gpu_count() > 0 ret = get_gpu_count() > 0
if utils._HAS_FLUID: if _HAS_FLUID:
from paddle import fluid from paddle import fluid
if ret is True and not fluid.is_compiled_with_cuda(): if ret is True and not fluid.is_compiled_with_cuda():
logger.warning("Found non-empty CUDA_VISIBLE_DEVICES. \ logger.warning("Found non-empty CUDA_VISIBLE_DEVICES. \
But PARL found that Paddle was not complied with CUDA, which may cause issues." But PARL found that Paddle was not complied with CUDA, which may cause issues. \
) Thus PARL will not use GPU.")
return False
return ret return ret
def get_free_tcp_port():
    """Ask the OS for a currently unused TCP port.

    A socket is bound to port 0 so the OS assigns a free port; the socket is
    then closed and the port number reported. The port is not reserved after
    this function returns.

    Returns:
        str: the port number as a decimal string.
    """
    tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        tcp.bind(('', 0))
        port = tcp.getsockname()[1]
    finally:
        # close even if bind/getsockname raises, so the descriptor never leaks
        tcp.close()
    return str(port)
def is_port_available(port):
    """Check whether a local TCP port is free.

    A TCP connection to ``localhost:<port>`` is attempted. If it succeeds,
    something is already listening there and the port is reported as taken.

    Args:
        port (int or str): port number to probe.

    Returns:
        bool: True if the connection attempt failed (port considered
        available), False if the connection succeeded.
    """
    port = int(port)
    sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM)
    try:
        # connect_ex returns 0 on success and an errno value otherwise
        return sock.connect_ex(('localhost', port)) != 0
    finally:
        sock.close()
def get_port_from_range(start, end):
    """Pick a random available port between ``start`` and ``end`` inclusive.

    Random candidates are drawn until ``is_port_available`` accepts one.

    NOTE(review): there is no retry limit, so this does not return while
    every port in the range is reported as unavailable.

    Args:
        start (int): lowest port number to consider.
        end (int): highest port number to consider.

    Returns:
        int: the first randomly drawn port reported as available.
    """
    candidate = random.randint(start, end)
    while not is_port_available(candidate):
        candidate = random.randint(start, end)
    return candidate
...@@ -11,3 +11,8 @@ ...@@ -11,3 +11,8 @@
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
# Prefer the VisualDL backend; fall back to tensorboard when it cannot be
# loaded. `Exception` (not a bare `except`) is caught so that any import-time
# failure of VisualDL still triggers the fallback, while KeyboardInterrupt and
# SystemExit are no longer swallowed.
try:
    from parl.utils.visualdl import *
except Exception:
    from parl.utils.tensorboard import *
...@@ -14,6 +14,7 @@ ...@@ -14,6 +14,7 @@
from tensorboardX import SummaryWriter from tensorboardX import SummaryWriter
from parl.utils import logger from parl.utils import logger
from parl.utils.machine_info import get_ip_address
__all__ = [] __all__ = []
...@@ -29,8 +30,8 @@ def create_file_after_first_call(func_name): ...@@ -29,8 +30,8 @@ def create_file_after_first_call(func_name):
if logdir is None: if logdir is None:
logdir = logger.auto_set_dir(action='d') logdir = logger.auto_set_dir(action='d')
logger.warning( logger.warning(
"[tensorboard] logdir is None, will save tensorboard files to {}" "[tensorboard] logdir is None, will save tensorboard files to {}\nView the data using: tensorboard --logdir=./{} --host={}"
.format(logdir)) .format(logdir, logdir, get_ip_address()))
_writer = SummaryWriter(logdir=logger.get_dir()) _writer = SummaryWriter(logdir=logger.get_dir())
func = getattr(_writer, func_name) func = getattr(_writer, func_name)
func(*args, **kwargs) func(*args, **kwargs)
......
...@@ -12,7 +12,7 @@ ...@@ -12,7 +12,7 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
import unittest import unittest
from parl.utils import tensorboard from parl.utils import summary
import numpy as np import numpy as np
from parl.utils import logger from parl.utils import logger
import os import os
...@@ -20,18 +20,21 @@ import os ...@@ -20,18 +20,21 @@ import os
class TestUtils(unittest.TestCase): class TestUtils(unittest.TestCase):
def tearDown(self): def tearDown(self):
tensorboard.flush() if hasattr(summary, 'flush'):
summary.flush()
def test_add_scalar(self): def test_add_scalar(self):
x = range(100) x = range(100)
for i in x: for i in x:
tensorboard.add_scalar('y=2x', i * 2, i) summary.add_scalar('y=2x', i * 2, i)
self.assertTrue(os.path.exists('./train_log/tensorboard_test')) self.assertTrue(os.path.exists('./train_log/summary_test'))
def test_add_histogram(self): def test_add_histogram(self):
if not hasattr(summary, 'add_histogram'):
return
for i in range(10): for i in range(10):
x = np.random.random(1000) x = np.random.random(1000)
tensorboard.add_histogram('distribution centers', x + i, i) summary.add_histogram('distribution centers', x + i, i)
if __name__ == '__main__': if __name__ == '__main__':
......
...@@ -13,10 +13,14 @@ ...@@ -13,10 +13,14 @@
# limitations under the License. # limitations under the License.
import sys import sys
import os
import subprocess
import numpy as np
__all__ = [ __all__ = [
'has_func', 'action_mapping', 'to_str', 'to_byte', 'is_PY2', 'is_PY3', 'has_func', 'action_mapping', 'to_str', 'to_byte', 'is_PY2', 'is_PY3',
'MAX_INT32', '_HAS_FLUID', '_HAS_TORCH' 'MAX_INT32', '_HAS_FLUID', '_HAS_TORCH', '_IS_WINDOWS', '_IS_MAC',
'kill_process'
] ]
...@@ -45,9 +49,12 @@ def action_mapping(model_output_act, low_bound, high_bound): ...@@ -45,9 +49,12 @@ def action_mapping(model_output_act, low_bound, high_bound):
Returns: Returns:
action: np.array, which value is in [low_bound, high_bound] action: np.array, which value is in [low_bound, high_bound]
""" """
assert np.all(((model_output_act<=1.0), (model_output_act>=-1.0))), \
'the action should be in range [-1.0, 1.0]'
assert high_bound > low_bound assert high_bound > low_bound
action = low_bound + (model_output_act - (-1.0)) * ( action = low_bound + (model_output_act - (-1.0)) * (
(high_bound - low_bound) / 2.0) (high_bound - low_bound) / 2.0)
action = np.clip(action, low_bound, high_bound)
return action return action
...@@ -82,7 +89,7 @@ MAX_INT32 = 0x7fffffff ...@@ -82,7 +89,7 @@ MAX_INT32 = 0x7fffffff
try: try:
from paddle import fluid from paddle import fluid
fluid_version = get_fluid_version() fluid_version = get_fluid_version()
assert fluid_version >= 151, "PARL requires paddle>=1.5.1" assert fluid_version >= 161 or fluid_version == 0, "PARL requires paddle>=1.6.1"
_HAS_FLUID = True _HAS_FLUID = True
except ImportError: except ImportError:
_HAS_FLUID = False _HAS_FLUID = False
...@@ -92,3 +99,26 @@ try: ...@@ -92,3 +99,26 @@ try:
_HAS_TORCH = True _HAS_TORCH = True
except ImportError: except ImportError:
_HAS_TORCH = False _HAS_TORCH = False
_IS_WINDOWS = (sys.platform == 'win32')
_IS_MAC = (sys.platform == 'darwin')
def kill_process(regex_pattern):
    """Kill every process whose command line matches a pattern.

    On Windows the processes are looked up with ``wmic`` and terminated with
    ``taskkill /F /T``. Elsewhere the output of ``ps aux`` is filtered with
    ``grep`` and the matching PIDs are sent ``kill -9``.

    Args:
        regex_pattern (str): pattern used to select the processes to kill.

    NOTE:
        On Windows each `/` in the pattern is replaced with two backslashes
        before it is used.
    """
    if _IS_WINDOWS:
        regex_pattern = regex_pattern.replace('/', '\\\\')
        command = r'''for /F "skip=2 tokens=2 delims=," %a in ('wmic process where "commandline like '%{}%'" get processid^,status /format:csv') do taskkill /F /T /pid %a'''.format(
            regex_pattern)
        os.popen(command).read()
    else:
        try:
            from shlex import quote
        except ImportError:  # Python 2
            from pipes import quote
        # Quote the pattern so the shell hands it to grep untouched: without
        # this, characters such as `*`, spaces or `;` in the pattern would be
        # glob-expanded or interpreted as shell syntax.
        command = "ps aux | grep {} | awk '{{print $2}}' | xargs kill -9".format(
            quote(regex_pattern))
        subprocess.call(command, shell=True)
...@@ -12,37 +12,35 @@ ...@@ -12,37 +12,35 @@
# See the License for the specific language governing permissions and # See the License for the specific language governing permissions and
# limitations under the License. # limitations under the License.
from rlschool import LiftSim from visualdl import LogWriter
from wrapper import Wrapper, ActionWrapper, ObservationWrapper from parl.utils import logger
from rl_benchmark.dispatcher import RL_dispatcher from parl.utils.machine_info import get_ip_address
import sys
import argparse __all__ = []
_writer = None
# run main program with args _WRITTER_METHOD = ['add_scalar']
def run_main(args):
parser = argparse.ArgumentParser(description='demo configuration') def create_file_after_first_call(func_name):
parser.add_argument( def call(*args, **kwargs):
'--iterations', global _writer
type=int, if _writer is None:
default=100000000, logdir = logger.get_dir()
help='total number of iterations') if logdir is None:
args = parser.parse_args(args) logdir = logger.auto_set_dir(action='d')
print('iterations:', args.iterations) logger.warning(
"[VisualDL] logdir is None, will save VisualDL files to {}\nView the data using: visualdl --logdir=./{} --host={}"
mansion_env = LiftSim() .format(logdir, logdir, get_ip_address()))
# mansion_env.seed(1988) _writer = LogWriter(logdir=logger.get_dir())
func = getattr(_writer, func_name)
mansion_env = Wrapper(mansion_env) func(*args, **kwargs)
mansion_env = ActionWrapper(mansion_env) _writer.flush()
mansion_env = ObservationWrapper(mansion_env)
return call
dispatcher = RL_dispatcher(mansion_env, args.iterations)
dispatcher.run_episode()
# export writter functions
return 0 for func_name in _WRITTER_METHOD:
locals()[func_name] = create_file_after_first_call(func_name)
__all__.append(func_name)
if __name__ == "__main__":
run_main(sys.argv[1:])
...@@ -31,7 +31,12 @@ def _find_packages(prefix=''): ...@@ -31,7 +31,12 @@ def _find_packages(prefix=''):
prefix = prefix prefix = prefix
for root, _, files in os.walk(path): for root, _, files in os.walk(path):
if '__init__.py' in files: if '__init__.py' in files:
packages.append(re.sub('^[^A-z0-9_]', '', root.replace('/', '.'))) if sys.platform == 'win32':
packages.append(
re.sub('^[^A-z0-9_]', '', root.replace('\\', '.')))
else:
packages.append(
re.sub('^[^A-z0-9_]', '', root.replace('/', '.')))
return packages return packages
...@@ -72,9 +77,11 @@ setup( ...@@ -72,9 +77,11 @@ setup(
"cloudpickle==1.2.1", "cloudpickle==1.2.1",
"tensorboardX==1.8", "tensorboardX==1.8",
"tb-nightly==1.15.0a20190801", "tb-nightly==1.15.0a20190801",
"flask==1.0.4", "flask>=1.0.4",
"click", "click",
"psutil", "psutil>=5.6.2",
"flask_cors",
"visualdl>=2.0.0b;python_version>='3' and platform_system=='Linux'",
], ],
classifiers=[ classifiers=[
'Intended Audience :: Developers', 'Intended Audience :: Developers',
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册