提交 dfce491c 编写于 作者: T TomorrowIsAnOtherDay

Merge branch 'develop' into CN_docs

from __future__ import absolute_import from __future__ import absolute_import
from __future__ import print_function from __future__ import print_function
from __future__ import unicode_literals
import argparse import argparse
import io, re import io, re
......
...@@ -18,3 +18,7 @@ ...@@ -18,3 +18,7 @@
FROM parl/parl-test:cuda9.0-cudnn7-v2 FROM parl/parl-test:cuda9.0-cudnn7-v2
COPY ./requirements.txt /root/ COPY ./requirements.txt /root/
RUN apt-get install -y libgflags-dev libgoogle-glog-dev libomp-dev unzip
RUN apt-get install -y libgtest-dev && cd /usr/src/gtest && mkdir build \
&& cd build && cmake .. && make && cp libgtest*.a /usr/local/lib
...@@ -69,7 +69,7 @@ function run_test_with_gpu() { ...@@ -69,7 +69,7 @@ function run_test_with_gpu() {
Running unit tests with GPU... Running unit tests with GPU...
======================================== ========================================
EOF EOF
ctest --output-on-failure -j10 ctest --output-on-failure -j20 --verbose
cd ${REPO_ROOT} cd ${REPO_ROOT}
rm -rf ${REPO_ROOT}/build rm -rf ${REPO_ROOT}/build
} }
...@@ -90,7 +90,7 @@ function run_test_with_cpu() { ...@@ -90,7 +90,7 @@ function run_test_with_cpu() {
===================================================== =====================================================
EOF EOF
if [ $# -eq 1 ];then if [ $# -eq 1 ];then
ctest --output-on-failure -j10 ctest --output-on-failure -j20 --verbose
else else
ctest --output-on-failure ctest --output-on-failure
fi fi
...@@ -145,7 +145,8 @@ function main() { ...@@ -145,7 +145,8 @@ function main() {
;; ;;
test) test)
# test code compability in environments with various python versions # test code compability in environments with various python versions
declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37") #declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37")
declare -a envs=("py27" "py36")
for env in "${envs[@]}";do for env in "${envs[@]}";do
cd /work cd /work
source ~/.bashrc source ~/.bashrc
...@@ -158,7 +159,7 @@ function main() { ...@@ -158,7 +159,7 @@ function main() {
echo ======================================== echo ========================================
pip install . pip install .
if [ \( $env == "py27" -o $env == "py36" -o $env == "py37" \) ] if [ \( $env == "py27" -o $env == "py36" -o $env == "py37" \) ]
then then
pip install -r .teamcity/requirements.txt pip install -r .teamcity/requirements.txt
run_test_with_cpu $env run_test_with_cpu $env
run_test_with_cpu $env "DIS_TESTING_SERIALLY" run_test_with_cpu $env "DIS_TESTING_SERIALLY"
...@@ -169,6 +170,10 @@ function main() { ...@@ -169,6 +170,10 @@ function main() {
pip install -r .teamcity/requirements_torch.txt pip install -r .teamcity/requirements_torch.txt
run_test_with_cpu $env "DIS_TESTING_TORCH" run_test_with_cpu $env "DIS_TESTING_TORCH"
fi fi
# clean env
export LC_ALL=C.UTF-8
export LANG=C.UTF-8
xparl stop
done done
run_test_with_gpu run_test_with_gpu
......
...@@ -3,4 +3,3 @@ paddlepaddle-gpu==1.6.1.post97 ...@@ -3,4 +3,3 @@ paddlepaddle-gpu==1.6.1.post97
gym gym
details details
parameterized parameterized
timeout_decorator
...@@ -2,4 +2,3 @@ ...@@ -2,4 +2,3 @@
gym gym
details details
parameterized parameterized
timeout_decorator
...@@ -37,7 +37,8 @@ if __name__ == '__main__': ...@@ -37,7 +37,8 @@ if __name__ == '__main__':
exclude_examples = [ exclude_examples = [
'NeurIPS2019-Learn-to-Move-Challenge', 'NeurIPS2019-Learn-to-Move-Challenge',
'NeurIPS2018-AI-for-Prosthetics-Challenge', 'EagerMode' 'NeurIPS2018-AI-for-Prosthetics-Challenge', 'LiftSim_baseline',
'EagerMode'
] ]
for example in os.listdir('../examples/'): for example in os.listdir('../examples/'):
if example not in exclude_examples: if example not in exclude_examples:
......
#!/usr/bin/env bash
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: You need to install mingw-cmake.
function init() {
    # Define the escape sequences used for coloured output and remember the
    # directory the script was started from as the repository root.
    RED='\033[0;31m'
    BLUE='\033[0;34m'
    BOLD='\033[1m'
    NONE='\033[0m'

    REPO_ROOT=$(pwd)
}
function abort() {
    # Report a code-style failure on stderr and terminate with status 1.
    echo "Your change doesn't follow PaddlePaddle's code style." >&2
    echo "Please use pre-commit to check what is wrong." >&2
    exit 1
}
function run_test_with_cpu() {
    # Configure the project with the MinGW generator in a scratch build
    # directory, run the unit tests through ctest, then delete the directory.
    #
    # Arguments:
    #   $1 - environment label; only echoed in the banner below.
    #   $2 - (optional) CMake switch written without its leading dash, e.g.
    #        "DIS_TESTING_TORCH" is handed to cmake as -DIS_TESTING_TORCH=ON.
    #
    # Relies on REPO_ROOT having been set by init().

    # Intended to keep CUDA from exposing any GPU to the tests.
    export CUDA_VISIBLE_DEVICES="-1"

    # All path expansions are quoted so a repository located under a
    # directory with spaces still works.
    mkdir -p "${REPO_ROOT}/build"
    cd "${REPO_ROOT}/build"
    if [ $# -eq 1 ];then
        cmake -G "MinGW Makefiles" ..
    else
        cmake -G "MinGW Makefiles" .. "-$2=ON"
    fi
    cat <<EOF
=====================================================
Running unit tests with CPU in the environment: $1
=====================================================
EOF
    if [ $# -eq 1 ];then
        # Default suite: run up to 10 tests in parallel.
        ctest --output-on-failure -j10
    else
        # Suites selected through an extra switch run without -j.
        ctest --output-on-failure
    fi
    cd "${REPO_ROOT}"
    rm -rf "${REPO_ROOT}/build"
}
function main() {
    # Run the unit tests on Windows (used on a local machine): first the
    # paddlepaddle-based suites, then the torch-based suite.
    # The previously declared local CMD was never read and has been removed.
    set -e
    init
    # Label forwarded to run_test_with_cpu, which only prints it.
    env="unused_variable"

    # Install the package under test from the Tsinghua PyPI mirror.
    pip install -i https://pypi.tuna.tsinghua.edu.cn/simple .

    # Paddle-based tests: torch is uninstalled first, paddle is installed.
    pip uninstall -y torch torchvision
    pip install -i https://pypi.tuna.tsinghua.edu.cn/simple paddlepaddle==1.6.1 gym details parameterized
    run_test_with_cpu "$env"
    run_test_with_cpu "$env" "DIS_TESTING_SERIALLY"

    # Torch-based tests: swap paddle for the CPU builds of torch.
    pip uninstall -y paddlepaddle
    pip install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
    run_test_with_cpu "$env" "DIS_TESTING_TORCH"
}
# Quote "$@" so arguments containing spaces reach main() unsplit.
main "$@"
...@@ -33,6 +33,7 @@ function(py_test TARGET_NAME) ...@@ -33,6 +33,7 @@ function(py_test TARGET_NAME)
add_test(NAME ${TARGET_NAME} add_test(NAME ${TARGET_NAME}
COMMAND python -u ${py_test_SRCS} ${py_test_ARGS} COMMAND python -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}) WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 300)
endfunction() endfunction()
function(import_test TARGET_NAME) function(import_test TARGET_NAME)
......
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
</p> </p>
[English](./README.md) | 简体中文 [English](./README.md) | 简体中文
[**文档**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md) [**文档**](https://parl.readthedocs.io/en/stable/index.html)
> PARL 是一个高性能、灵活的强化学习框架。 > PARL 是一个高性能、灵活的强化学习框架。
# 特点 # 特点
...@@ -48,7 +48,7 @@ class Agent(object): ...@@ -48,7 +48,7 @@ class Agent(object):
parl.connect('localhost:8037') parl.connect('localhost:8037')
agent = Agent() agent = Agent()
agent.say_hello() agent.say_hello()
ans = agent.sum(1,5) # run remotely and not comsume any local computation resources ans = agent.sum(1,5) # run remotely and not comsume any local computation resources
``` ```
两步调度外部的计算资源: 两步调度外部的计算资源:
1. 使用`parl.remote_class`修饰一个类,之后这个类就被转化为可以运行在其他CPU或者机器上的类。 1. 使用`parl.remote_class`修饰一个类,之后这个类就被转化为可以运行在其他CPU或者机器上的类。
...@@ -61,8 +61,8 @@ ans = agent.sum(1,5) # run remotely and not comsume any local computation resour ...@@ -61,8 +61,8 @@ ans = agent.sum(1,5) # run remotely and not comsume any local computation resour
# 安装: # 安装:
### 依赖 ### 依赖
- Python 2.7 or 3.5+. - Python 2.7 or 3.5+. (**Windows系统**目前仅支持python3.6+以上的环境)
- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**非必须的**,如果你只用并行部分的接口不需要安装paddle) - [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**非必须的**,如果你只用并行部分的接口不需要安装paddle)
``` ```
...@@ -83,6 +83,6 @@ pip install parl ...@@ -83,6 +83,6 @@ pip install parl
- [冠军解决方案:NIPS2018强化学习假肢挑战赛](examples/NeurIPS2018-AI-for-Prosthetics-Challenge/) - [冠军解决方案:NIPS2018强化学习假肢挑战赛](examples/NeurIPS2018-AI-for-Prosthetics-Challenge/)
- [冠军解决方案:NIPS2019强化学习仿生人控制赛事](examples/NeurIPS2019-Learn-to-Move-Challenge/) - [冠军解决方案:NIPS2019强化学习仿生人控制赛事](examples/NeurIPS2019-Learn-to-Move-Challenge/)
<img src="examples/NeurIPS2019-Learn-to-Move-Challenge/image/performance.gif" width = "300" height ="200" alt="NeurlIPS2018"/> <img src=".github/Half-Cheetah.gif" width = "300" height ="200" alt="Half-Cheetah"/> <img src=".github/Breakout.gif" width = "200" height ="200" alt="Breakout"/> <img src="examples/NeurIPS2019-Learn-to-Move-Challenge/image/performance.gif" width = "300" height ="200" alt="NeurlIPS2018"/> <img src=".github/Half-Cheetah.gif" width = "300" height ="200" alt="Half-Cheetah"/> <img src=".github/Breakout.gif" width = "200" height ="200" alt="Breakout"/>
<br> <br>
<img src=".github/Aircraft.gif" width = "808" height ="300" alt="NeurlIPS2018"/> <img src=".github/Aircraft.gif" width = "808" height ="300" alt="NeurlIPS2018"/>
...@@ -3,7 +3,7 @@ ...@@ -3,7 +3,7 @@
</p> </p>
English | [简体中文](./README.cn.md) English | [简体中文](./README.cn.md)
[**Documentation**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md) [**Documentation**](https://parl.readthedocs.io/en/stable/index.html)
> PARL is a flexible and high-efficient reinforcement learning framework. > PARL is a flexible and high-efficient reinforcement learning framework.
...@@ -64,7 +64,7 @@ For users, they can write code in a simple way, just like writing multi-thread c ...@@ -64,7 +64,7 @@ For users, they can write code in a simple way, just like writing multi-thread c
# Install: # Install:
### Dependencies ### Dependencies
- Python 2.7 or 3.5+. - Python 2.7 or 3.5+ (On **Windows**, PARL only supports environments with python3.6+).
- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**Optional**, if you only want to use APIs related to parallelization alone) - [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**Optional**, if you only want to use APIs related to parallelization alone)
......
# Third party code
#
# The following code is copied or modified from:
# https://github.com/suragnair/alpha-zero-general
from tqdm import tqdm
from parl.utils import logger
class Arena():
    """
    Referee that lets two agents play a game against each other.
    """

    def __init__(self, player1, player2, game, display=None):
        """
        Args:
            player1, player2: callables that receive a canonical board and
                return the action to play.
            game: game object; this class uses its getInitBoard,
                getGameEnded, getCanonicalForm, getValidMoves and
                getNextState methods.
            display: optional callable that receives a board and shows it.
                It must be provided when games are played with verbose=True.
        """
        self.player1, self.player2 = player1, player2
        self.game = game
        self.display = display

    def playGame(self, verbose=False):
        """
        Play one game from the initial board, player1 (id 1) moving first.

        Returns:
            The id of the player to move at the end multiplied by the game's
            end value for that player: 1 when player1 won, -1 when player2
            won, otherwise the (non-zero) draw value reported by the game.
        """
        game = self.game
        # Indexed with (player id + 1): -1 -> player2, 1 -> player1.
        movers = [self.player2, None, self.player1]
        board = game.getInitBoard()
        curPlayer = 1
        turn = 0

        while game.getGameEnded(board, curPlayer) == 0:
            turn += 1
            if verbose:
                assert self.display
                print("Turn ", str(turn), "Player ", str(curPlayer))
                self.display(board)

            mover = movers[curPlayer + 1]
            action = mover(game.getCanonicalForm(board, curPlayer))

            legal = game.getValidMoves(
                game.getCanonicalForm(board, curPlayer), 1)
            if legal[action] == 0:
                # Log the offending move before failing the assertion.
                logger.error('Action {} is not valid!'.format(action))
                logger.debug('valids = {}'.format(legal))
                assert legal[action] > 0

            board, curPlayer = game.getNextState(board, curPlayer, action)

        if verbose:
            assert self.display
            print("Game over: Turn ", str(turn), "Result ",
                  str(game.getGameEnded(board, 1)))
            self.display(board)

        return curPlayer * game.getGameEnded(board, curPlayer)

    def playGames(self, num, verbose=False):
        """
        Play int(num / 2) games with player1 moving first, then the same
        number with the two players swapped.

        Note that the players stay swapped on the instance afterwards.

        Returns:
            oneWon: games won by the original player1
            twoWon: games won by the original player2
            draws: games won by nobody
        """
        half = int(num / 2)
        oneWon, twoWon, draws = 0, 0, 0

        for _ in tqdm(range(half), desc="Arena.playGames (1)"):
            outcome = self.playGame(verbose=verbose)
            if outcome == 1:
                oneWon += 1
            elif outcome == -1:
                twoWon += 1
            else:
                draws += 1

        # Swap sides: from here on a result of -1 is a win for the original
        # player1 and a result of 1 is a win for the original player2.
        first, second = self.player1, self.player2
        self.player1 = second
        self.player2 = first

        for _ in tqdm(range(half), desc="Arena.playGames (2)"):
            outcome = self.playGame(verbose=verbose)
            if outcome == -1:
                oneWon += 1
            elif outcome == 1:
                twoWon += 1
            else:
                draws += 1

        return oneWon, twoWon, draws
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import threading
import queue
import pickle
from pickle import Pickler, Unpickler
from random import shuffle
from parl.utils import tensorboard
import numpy as np
from tqdm import tqdm
import parl
from parl.utils import logger
from actor import Actor
from utils import split_group, get_test_dataset
from alphazero_agent import create_agent
class Coach():
    """
    This class executes the self-play, learning and evaluating.

    The heavy tasks are delegated to remote actors. Each actor is driven by
    its own local daemon thread, which receives task signals through a
    per-actor queue and pushes results to one shared return queue.
    """

    def __init__(self, game, args):
        """
        Args:
            game: game object handed to the agents and the remote actors.
            args: configuration object; the attributes read by this class are
                numIters, numEps, actors_num, arenaCompare, updateThreshold,
                numItersForTrainExamplesHistory, checkpoint, master_address
                and load_folder_file.
        """
        self.game = game
        self.args = args

        # neural network of current generation
        self.current_agent = create_agent(self.game)
        # neural network of previous generation
        self.previous_agent = create_agent(self.game)

        # history of examples from args.numItersForTrainExamplesHistory latest iterations
        self.trainExamplesHistory = []

        # one signal queue per remote actor, one shared queue for results
        self.remote_actors_signal_queues = []
        self.remote_actors_return_queue = queue.Queue()

        self.test_dataset = get_test_dataset()

    def _run_remote_tasks(self, signal_queue):
        """Serve task signals for one remote actor; runs forever in a thread.

        Args:
            signal_queue (queue.Queue): source of signal dicts. Each has a
                "task" key ("self-play", "pitting" or
                "evaluate_test_dataset"); the last kind also carries the data
                under "test_dataset".

        Raises:
            NotImplementedError: if the task name is not one of the above.
        """
        # The remote actor will actually run on the local machine or other machines of xparl cluster
        remote_actor = Actor(self.game, self.args)

        while True:
            # receive running task signal
            # signal: specify task type and task input data (optional)
            signal = signal_queue.get()

            if signal["task"] == "self-play":
                episode_num_each_actor = self.args.numEps // self.args.actors_num
                result = remote_actor.self_play(
                    self.current_agent.get_weights(), episode_num_each_actor)
                self.remote_actors_return_queue.put({"self-play": result})

            elif signal["task"] == "pitting":
                games_num_each_actor = self.args.arenaCompare // self.args.actors_num
                result = remote_actor.pitting(
                    self.previous_agent.get_weights(),
                    self.current_agent.get_weights(), games_num_each_actor)
                self.remote_actors_return_queue.put({"pitting": result})

            elif signal["task"] == "evaluate_test_dataset":
                test_dataset = signal["test_dataset"]
                result = remote_actor.evaluate_test_dataset(
                    self.current_agent.get_weights(), test_dataset)
                self.remote_actors_return_queue.put({
                    "evaluate_test_dataset":
                    result
                })
            else:
                raise NotImplementedError

    def _create_remote_actors(self):
        """Connect to the xparl cluster and start one worker thread per actor."""
        # connect to xparl cluster to submit jobs
        parl.connect(self.args.master_address)

        for _ in range(self.args.actors_num):
            signal_queue = queue.Queue()
            self.remote_actors_signal_queues.append(signal_queue)

            # daemon=True replaces the deprecated Thread.setDaemon(True)
            remote_thread = threading.Thread(
                target=self._run_remote_tasks,
                args=(signal_queue, ),
                daemon=True)
            remote_thread.start()

    def learn(self):
        """Each iteration:
        1. Performs numEps episodes of self-play.
        2. Retrains neural network with examples in trainExamplesHistory
           (which has a maximum length of numItersForTrainExamplesHistory).
        3. Evaluates the new neural network with the test dataset.
        4. Pits the new neural network against the old one and accepts it
           only if it wins >= updateThreshold fraction of games.
        """

        # create remote actors to run tasks (self-play/pitting/evaluate_test_dataset) in parallel.
        self._create_remote_actors()

        for iteration in range(1, self.args.numIters + 1):
            logger.info('Starting Iter #{} ...'.format(iteration))

            ####################
            logger.info('Step1: self-play in parallel...')
            iterationTrainExamples = []
            # update weights of remote actors to the latest weights, and ask them to run self-play task
            for signal_queue in self.remote_actors_signal_queues:
                signal_queue.put({"task": "self-play"})
            # wait for all remote actors (a total of self.args.actors_num) to return the self-play results
            for _ in range(self.args.actors_num):
                result = self.remote_actors_return_queue.get()
                iterationTrainExamples.extend(result["self-play"])

            # save the iteration examples to the history
            self.trainExamplesHistory.append(iterationTrainExamples)
            if len(self.trainExamplesHistory
                   ) > self.args.numItersForTrainExamplesHistory:
                logger.warning("Removing the oldest entry in trainExamples.")
                self.trainExamplesHistory.pop(0)
            self.saveTrainExamples(iteration)  # backup history to a file

            ####################
            logger.info('Step2: train neural network...')
            # shuffle examples before training
            trainExamples = []
            for e in self.trainExamplesHistory:
                trainExamples.extend(e)
            shuffle(trainExamples)

            # training new network, keeping a copy of the old one
            self.current_agent.save(
                os.path.join(self.args.checkpoint, 'temp.pth.tar'))
            self.previous_agent.restore(
                os.path.join(self.args.checkpoint, 'temp.pth.tar'))

            self.current_agent.learn(trainExamples)

            ####################
            logger.info('Step3: evaluate test dataset in parallel...')
            cnt = 0
            dispatched = 0
            # NOTE(review): split_group is defined elsewhere; it presumably
            # yields consecutive chunks of the given size, so it can produce
            # more chunks than actors when the dataset size is not a multiple
            # of actors_num. Chunks are therefore handed out round-robin and
            # the number of tasks actually sent is counted, instead of
            # assuming exactly one chunk per actor.
            group_size = max(
                1,
                len(self.test_dataset) // self.args.actors_num)
            # update weights of remote actors to the latest weights, and ask them to evaluate assigned test dataset
            for i, data in enumerate(
                    split_group(self.test_dataset, group_size)):
                actor_queue = self.remote_actors_signal_queues[
                    i % self.args.actors_num]
                actor_queue.put({
                    "task":
                    "evaluate_test_dataset",
                    "test_dataset":
                    data
                })
                cnt += len(data)
                dispatched += 1

            perfect_moves_cnt, good_moves_cnt = 0, 0
            # wait for every dispatched evaluation task to return its result
            for _ in range(dispatched):
                (perfect_moves,
                 good_moves) = self.remote_actors_return_queue.get(
                 )["evaluate_test_dataset"]
                perfect_moves_cnt += perfect_moves
                good_moves_cnt += good_moves

            logger.info('perfect moves rate: {}, good moves rate: {}'.format(
                perfect_moves_cnt / cnt, good_moves_cnt / cnt))
            tensorboard.add_scalar('perfect_moves_rate',
                                   perfect_moves_cnt / cnt, iteration)
            tensorboard.add_scalar('good_moves_rate', good_moves_cnt / cnt,
                                   iteration)

            ####################
            logger.info(
                'Step4: pitting against previous generation in parallel...')
            # transfer weights of previous generation and current generation to the remote actors, and ask them to pit.
            for signal_queue in self.remote_actors_signal_queues:
                signal_queue.put({"task": "pitting"})
            previous_wins, current_wins, draws = 0, 0, 0
            for _ in range(self.args.actors_num):
                (pwins_, cwins_,
                 draws_) = self.remote_actors_return_queue.get()["pitting"]
                previous_wins += pwins_
                current_wins += cwins_
                draws += draws_

            logger.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
                        (current_wins, previous_wins, draws))
            # reject when no game was decided or the win ratio is too low
            if previous_wins + current_wins == 0 or float(current_wins) / (
                    previous_wins + current_wins) < self.args.updateThreshold:
                logger.info('REJECTING NEW MODEL')
                self.current_agent.restore(
                    os.path.join(self.args.checkpoint, 'temp.pth.tar'))
            else:
                logger.info('ACCEPTING NEW MODEL')
                self.current_agent.save(
                    os.path.join(self.args.checkpoint, 'best.pth.tar'))
            self.current_agent.save(
                os.path.join(self.args.checkpoint,
                             self.getCheckpointFile(iteration)))

    def getCheckpointFile(self, iteration):
        """Return the checkpoint file name used for the given iteration."""
        return 'checkpoint_' + str(iteration) + '.pth.tar'

    def saveTrainExamples(self, iteration):
        """Pickle trainExamplesHistory to
        <args.checkpoint>/<checkpoint file name>.examples, creating the
        folder when it does not exist yet.
        """
        folder = self.args.checkpoint
        # exist_ok avoids the check-then-create race of os.path.exists()
        os.makedirs(folder, exist_ok=True)
        filename = os.path.join(
            folder,
            self.getCheckpointFile(iteration) + ".examples")
        with open(filename, "wb+") as f:
            Pickler(f).dump(self.trainExamplesHistory)

    def loadModel(self):
        """Restore the current agent from args.load_folder_file (folder, file)."""
        self.current_agent.restore(
            os.path.join(self.args.load_folder_file[0],
                         self.args.load_folder_file[1]))

    def loadTrainExamples(self):
        """Load trainExamplesHistory from the ".examples" file stored next to
        the model named by args.load_folder_file.

        When the file is missing the user is asked whether to continue; any
        answer other than "y" exits the program.
        """
        modelFile = os.path.join(self.args.load_folder_file[0],
                                 self.args.load_folder_file[1])
        examplesFile = modelFile + ".examples"
        if not os.path.isfile(examplesFile):
            logger.warning(
                "File {} with trainExamples not found!".format(examplesFile))
            r = input("Continue? [y|n]")
            if r != "y":
                sys.exit()
        else:
            logger.info("File with trainExamples found. Loading it...")
            # SECURITY: unpickling can execute arbitrary code; only load
            # example files that come from a trusted source.
            with open(examplesFile, "rb") as f:
                self.trainExamplesHistory = Unpickler(f).load()
            logger.info('Loading done!')
# Third party code
#
# The following code is copied or modified from:
# https://github.com/suragnair/alpha-zero-general
import math
import time
import numpy as np
EPS = 1e-8


class MCTS():
    """
    Monte Carlo tree search over game states, guided by a neural network.
    """

    def __init__(self, game, nn_agent, args, dirichlet_noise=False):
        """
        Args:
            game: game object providing the rules.
            nn_agent: object whose predict(board) returns (policy, value).
            args: configuration; numMCTSSims, cpuct and dirichletAlpha are
                read by this class.
            dirichlet_noise: when True, Dirichlet noise is applied to the
                root priors on the first simulation of every
                getActionProb call.
        """
        self.game = game
        self.nn_agent = nn_agent
        self.args = args
        self.dirichlet_noise = dirichlet_noise

        # Search statistics, keyed by the board's string representation s
        # (or by the pair (s, a) for edge statistics).
        self.Qsa = {}  # Q value of edge (s, a)
        self.Nsa = {}  # visit count of edge (s, a)
        self.Ns = {}  # visit count of board s
        self.Ps = {}  # prior policy of board s (from the neural net)
        self.Es = {}  # cached game.getGameEnded result of board s
        self.Vs = {}  # cached game.getValidMoves result of board s

    def getActionProb(self, canonicalBoard, temp=1):
        """
        Run numMCTSSims simulations from canonicalBoard and turn the root
        visit counts into a policy.

        Returns:
            probs: list in which the probability of the ith action is
                   proportional to Nsa[(s,a)]**(1./temp); for temp == 0 it
                   is one-hot on a most visited action (ties are broken at
                   random).
        """
        for sim in range(self.args.numMCTSSims):
            # noise is only requested for the first simulation
            noisy = (sim == 0 and self.dirichlet_noise)
            self.search(canonicalBoard, dirichlet_noise=noisy)

        s = self.game.stringRepresentation(canonicalBoard)
        counts = [
            self.Nsa.get((s, a), 0) for a in range(self.game.getActionSize())
        ]

        if temp == 0:
            top = np.array(np.argwhere(counts == np.max(counts))).flatten()
            pick = np.random.choice(top)
            one_hot = [0] * len(counts)
            one_hot[pick] = 1
            return one_hot

        powered = [x**(1. / temp) for x in counts]
        total = float(sum(powered))
        return [x / total for x in powered]

    def search(self, canonicalBoard, dirichlet_noise=False):
        """
        Perform one MCTS simulation, recursing until a terminal state or an
        unexpanded (leaf) state is reached.

        At a leaf the neural network supplies the prior policy and a value;
        at a terminal state the game outcome is used. The value is passed
        back up the path, updating Ns, Nsa and Qsa on the way.

        NOTE: the returned number is negated at every level, because a value
        v for the player to move is worth -v to the other player.

        Returns:
            v: the negative of the value of the current canonicalBoard
        """
        s = self.game.stringRepresentation(canonicalBoard)

        if s not in self.Es:
            self.Es[s] = self.game.getGameEnded(canonicalBoard, 1)
        outcome = self.Es[s]
        if outcome != 0:
            # terminal node
            return -outcome

        if s not in self.Ps:
            # leaf node: expand it with the network's prediction
            self.Ps[s], v = self.nn_agent.predict(canonicalBoard)

            valids = self.game.getValidMoves(canonicalBoard, 1)
            self.Ps[s] = self.Ps[s] * valids  # masking invalid moves
            if dirichlet_noise:
                self.applyDirNoise(s, valids)
            prior_mass = np.sum(self.Ps[s])
            if prior_mass > 0:
                self.Ps[s] /= prior_mass  # renormalize
            else:
                # Every valid move was masked out: fall back to a uniform
                # prior over the valid moves. Seeing this message often
                # hints at a problem with the network or its training.
                print("All valid moves were masked, doing a workaround.")
                self.Ps[s] = self.Ps[s] + valids
                self.Ps[s] /= np.sum(self.Ps[s])

            self.Vs[s] = valids
            self.Ns[s] = 0
            return -v

        valids = self.Vs[s]
        if dirichlet_noise:
            self.applyDirNoise(s, valids)
            prior_mass = np.sum(self.Ps[s])
            self.Ps[s] /= prior_mass  # renormalize

        # pick the valid action with the highest upper confidence bound
        best_score = -float('inf')
        chosen = -1
        for a in range(self.game.getActionSize()):
            if not valids[a]:
                continue
            edge = (s, a)
            if edge in self.Qsa:
                score = self.Qsa[
                    edge] + self.args.cpuct * self.Ps[s][a] * math.sqrt(
                        self.Ns[s]) / (1 + self.Nsa[edge])
            else:
                # unvisited edge, its Q is taken as 0
                score = self.args.cpuct * self.Ps[s][a] * math.sqrt(
                    self.Ns[s] + EPS)
            if score > best_score:
                best_score = score
                chosen = a

        a = chosen
        next_board, next_player = self.game.getNextState(canonicalBoard, 1, a)
        next_board = self.game.getCanonicalForm(next_board, next_player)

        v = self.search(next_board)

        edge = (s, a)
        if edge in self.Qsa:
            # running mean of the values seen through this edge
            visits = self.Nsa[edge]
            self.Qsa[edge] = (visits * self.Qsa[edge] + v) / (visits + 1)
            self.Nsa[edge] += 1
        else:
            self.Qsa[edge] = v
            self.Nsa[edge] = 1

        self.Ns[s] += 1
        return -v

    def applyDirNoise(self, s, valids):
        """Blend Dirichlet noise into the non-zero priors of board s in place
        (0.75 * prior + 0.25 * noise); the caller renormalizes afterwards.

        One noise value is drawn per non-zero entry of valids.
        """
        noise = np.random.dirichlet(
            [self.args.dirichletAlpha] * np.count_nonzero(valids))

        used = 0
        priors = self.Ps[s]
        for idx in range(len(priors)):
            if priors[idx]:
                priors[idx] = (0.75 * priors[idx]) + (0.25 * noise[used])
                used += 1
## AlphaZero baseline for Connect4 game (distributed version)
- In this example, we provide a fine-tuned AlphaZero baseline to solve the Connect4 game, based on the code of [alpha-zero-general](https://github.com/suragnair/alpha-zero-general) repo.
- We take advantage of the parallelism capacity of [PARL](https://github.com/PaddlePaddle/PARL) to support running self-play and evaluating tasks in parallel.
- We also provide scripts to pack your well-trained model to a submission file, which can be submitted to the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition directly.
### Dependencies
- python3
- [parl==1.3](https://github.com/PaddlePaddle/PARL)
- torch
- tqdm
### Training
1. Download the [1k connect4 validation set](https://www.kaggle.com/petercnudde/1k-connect4-validation-set) to the current directory. (filename: `refmoves1k_kaggle`)
2. Start xparl cluster
```bash
# You can change the following `cpu_num` and `args.actors_num` in main.py
# based on the CPU number of your machine.
xparl start --port 8010 --cpu_num 25
```
```bash
# [OPTIONAL] You can also run the following script on other machines to add more CPU resources
# to the xparl cluster, so you can increase the parallelism (args.actors_num).
xparl connect --address MASTER_IP:8010 --cpu_num [CPU_NUM]
```
3. Run training script
```bash
python main.py
```
4. Visualize (good moves rate and perfect moves rate)
```
tensorboard --logdir .
```
### Submitting
To submit the well-trained model to the Kaggle, you can use our provided script to generate `submission.py`, for example:
```bash
python gen_submission.py saved_model/best.pth.tar
```
### Performance
- The following are the `good moves rate` and `perfect moves rate` indicators in tensorboard; please refer to the [link](https://www.kaggle.com/petercnudde/scoring-connect-x-agents) for their specific meaning.
<img src=".pic/good_moves.png" width = "300" alt="good moves rate"/> <img src=".pic/perfect_moves.png" width = "300" alt="perfect moves rate"/>
> It takes about 1 day to run 25 iterations on the machine with 25 cpus.
- It can reach a score of about 1368 (rank 5 on 2020/06/04) in the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition.
### Reference
- [suragnair/alpha-zero-general](https://github.com/suragnair/alpha-zero-general)
- [Scoring connect-x agents](https://www.kaggle.com/petercnudde/scoring-connect-x-agents)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl
import os
from alphazero_agent import create_agent
from MCTS import MCTS
from Arena import Arena
from utils import win_loss_draw
@parl.remote_class
class Actor(object):
def __init__(self, game, args):
os.environ['OMP_NUM_THREADS'] = "1"
self.game = game
self.args = args
# neural network of previous generation
self.previous_agent = create_agent(self.game, cuda=False)
# neural network of current generation
self.current_agent = create_agent(self.game, cuda=False)
# MCTS of previous generation
self.previous_mcts = MCTS(
self.game, self.previous_agent, self.args, dirichlet_noise=True)
# MCTS of current generation
self.current_mcts = MCTS(
self.game, self.current_agent, self.args, dirichlet_noise=True)
def self_play(self, current_weights, game_num):
"""Collecting training data by self-play.
Args:
current_weights (numpy.array): latest weights of neural network
game_num (int): game number of self-play
Returns:
train_examples (list): examples of the form (canonicalBoard, currPlayer, pi,v)
"""
# update weights of current neural network with latest weights
self.current_agent.set_weights(current_weights)
train_examples = []
for _ in range(game_num):
# reset node state of MCTS
self.current_mcts = MCTS(
self.game, self.current_agent, self.args, dirichlet_noise=True)
train_examples.extend(self._executeEpisode())
return train_examples
def pitting(self, previous_weights, current_weights, games_num):
"""Fighting between previous generation agent and current generation agent
Args:
previous_weights (numpy.array): weights of previous generation neural network
current_weights (numpy.array): weights of current generation neural network
game_num (int): game number of fighting
Returns:
tuple of (game number of previous agent won, game number of current agent won, game number of draw)
"""
# update weights of previous and current neural network
self.previous_agent.set_weights(previous_weights)
self.current_agent.set_weights(current_weights)
# reset node state of MCTS
self.previous_mcts = MCTS(self.game, self.previous_agent, self.args)
self.current_mcts = MCTS(self.game, self.current_agent, self.args)
arena = Arena(
lambda x: np.argmax(self.previous_mcts.getActionProb(x, temp=0)),
lambda x: np.argmax(self.current_mcts.getActionProb(x, temp=0)),
self.game)
previous_wins, current_wins, draws = arena.playGames(games_num)
return (previous_wins, current_wins, draws)
def evaluate_test_dataset(self, current_weights, test_dataset):
"""Evaluate performance of latest neural nerwork
Args:
current_weights (numpy.array): latest weights of neural network
test_dataset (list): game number of self-play
Returns:
tuple of (number of perfect moves, number of good moves)
"""
# update weights of current neural network with latest weights
self.current_agent.set_weights(current_weights)
perfect_move_count, good_move_count = 0, 0
for data in test_dataset:
self.current_mcts = MCTS(self.game, self.current_agent, self.args)
x = self.game.getCanonicalForm(data['board'], data['player'])
agent_move = int(
np.argmax(self.current_mcts.getActionProb(x, temp=0)))
moves = data["move_score"]
perfect_score = max(moves)
perfect_moves = [i for i in range(7) if moves[i] == perfect_score]
if agent_move in perfect_moves:
perfect_move_count += 1
if win_loss_draw(
moves[agent_move]) == win_loss_draw(perfect_score):
good_move_count += 1
return (perfect_move_count, good_move_count)
def _executeEpisode(self):
    """Play one self-play game, starting with player 1, and collect examples.

    On every turn the canonical board and the MCTS policy are stored for
    each symmetric form returned by the game. The policy is computed with
    temp=1 while episodeStep < tempThresholdStep and with temp=0 afterwards.
    Once the game has ended, its outcome is attached to every stored turn.

    Returns:
        trainExamples: a list of (board, pi, v) tuples. pi is the MCTS
                       informed policy vector; v is the game result seen
                       from the player who was to move in that example
                       (sign-flipped for the opponent's turns).
    """
    recorded = []  # (board, player to move, pi) per stored turn
    board = self.game.getInitBoard()
    self.curPlayer = 1
    episodeStep = 0

    while True:
        episodeStep += 1
        canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
        exploring = episodeStep < self.args.tempThresholdStep
        pi = self.current_mcts.getActionProb(
            canonicalBoard, temp=int(exploring))

        for sym_board, sym_pi in self.game.getSymmetries(canonicalBoard, pi):
            recorded.append((sym_board, self.curPlayer, sym_pi))

        action = np.random.choice(len(pi), p=pi)
        board, self.curPlayer = self.game.getNextState(
            board, self.curPlayer, action)

        r = self.game.getGameEnded(board, self.curPlayer)
        if r == 0:
            continue

        # r is expressed for self.curPlayer (the side to move after the
        # final action); negate it for turns taken by the other player.
        return [(turn_board, turn_pi,
                 r if mover == self.curPlayer else -r)
                for turn_board, mover, turn_pi in recorded]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import parl
import torch
import torch.optim as optim
from tqdm import tqdm
from utils import *
from connect4_model import Connect4Model
# Training and network hyper-parameters read by AlphaZeroAgent and Connect4Model.
args = dotdict(
    lr=0.001,
    dropout=0.3,
    epochs=5,
    batch_size=64,
    num_channels=64,
)
class AlphaZero(parl.Algorithm):
    """Training and inference steps for a policy/value model."""

    def __init__(self, model):
        self.model = model

    def learn(self, boards, target_pis, target_vs, optimizer):
        """Run one optimisation step on a batch.

        Args:
            boards: batch of boards fed to the model
            target_pis: target policy for each board
            target_vs: target value for each board
            optimizer: optimizer whose ``zero_grad``/``step`` are called

        Returns:
            tuple of (total_loss, pi_loss, v_loss)
        """
        self.model.train()  # train mode

        # compute model output
        out_log_pi, out_v = self.model(boards)

        # Policy loss: cross-entropy against the target policy, averaged
        # over the batch. Value loss: squared error averaged over the batch.
        pi_loss = -(target_pis * out_log_pi).sum() / target_pis.shape[0]
        value_error = target_vs - out_v.view(-1)
        v_loss = value_error.pow(2).sum() / target_vs.shape[0]
        total_loss = pi_loss + v_loss

        # compute gradient and do SGD step
        optimizer.zero_grad()
        total_loss.backward()
        optimizer.step()

        return total_loss, pi_loss, v_loss

    def predict(self, board):
        """Return (pi, v) for ``board`` with the model in eval mode and
        gradient tracking disabled; pi is the exponentiated log-policy."""
        self.model.eval()  # eval mode

        with torch.no_grad():
            log_pi, v = self.model(board)

        return torch.exp(log_pi), v
def create_agent(game, cuda=True):
    """Build an AlphaZeroAgent for ``game``.

    Args:
        game: game object passed to Connect4Model and AlphaZeroAgent
        cuda: request GPU execution; only honoured when
            ``torch.cuda.is_available()`` is true

    Returns:
        AlphaZeroAgent wrapping an AlphaZero algorithm and a new Connect4Model
    """
    use_cuda = cuda and torch.cuda.is_available()

    net = Connect4Model(game, args)
    if use_cuda:
        net.cuda()

    return AlphaZeroAgent(AlphaZero(net), game, use_cuda)
class AlphaZeroAgent(parl.Agent):
    """Agent that converts numpy inputs to torch tensors for the algorithm."""

    def __init__(self, algorithm, game, cuda):
        super(AlphaZeroAgent, self).__init__(algorithm)
        self.cuda = cuda
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()

    def learn(self, examples):
        """Train on mini-batches drawn at random (with replacement).

        A new Adam optimizer is created on every call. Each epoch runs
        len(examples) / batch_size (rounded down) batches.

        Args:
            examples: list of examples, each example is of form (board, pi, v)
        """
        optimizer = optim.Adam(self.algorithm.model.parameters(), lr=args.lr)

        for epoch in range(args.epochs):
            print('EPOCH ::: ' + str(epoch + 1))

            batch_count = int(len(examples) / args.batch_size)
            progress = tqdm(range(batch_count), desc='Training Net')
            for _ in progress:
                picked = np.random.randint(
                    len(examples), size=args.batch_size)
                boards, pis, vs = zip(*(examples[i] for i in picked))

                boards = torch.FloatTensor(
                    np.array(boards).astype(np.float64))
                target_pis = torch.FloatTensor(np.array(pis))
                target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))

                if self.cuda:
                    boards = boards.contiguous().cuda()
                    target_pis = target_pis.contiguous().cuda()
                    target_vs = target_vs.contiguous().cuda()

                total_loss, pi_loss, v_loss = self.algorithm.learn(
                    boards, target_pis, target_vs, optimizer)

                # record loss with tqdm
                progress.set_postfix(
                    Loss_pi=pi_loss.item(), Loss_v=v_loss.item())

    def predict(self, board):
        """
        Args:
            board (np.array): input board

        Return:
            pi (np.array): probability of actions
            v (np.array): estimated value of input
        """
        # preparing input: a single board becomes a batch of one
        tensor = torch.FloatTensor(board.astype(np.float64))
        if self.cuda:
            tensor = tensor.contiguous().cuda()
        tensor = tensor.view(1, self.board_x, self.board_y)

        pi, v = self.algorithm.predict(tensor)

        # Strip the batch dimension again.
        return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0]
def create_agent(game, cuda=True):
    """Build an AlphaZeroAgent for ``game``.

    Args:
        game: game object passed to Connect4Model and AlphaZeroAgent
        cuda: request GPU execution; only honoured when
            ``torch.cuda.is_available()`` is true

    Returns:
        AlphaZeroAgent wrapping an AlphaZero algorithm and a new Connect4Model
    """
    use_cuda = cuda and torch.cuda.is_available()

    net = Connect4Model(game, args)
    if use_cuda:
        net.cuda()

    return AlphaZeroAgent(AlphaZero(net), game, use_cuda)
# Third party code
#
# The following code is copied or modified from:
# https://github.com/suragnair/alpha-zero-general
import numpy as np
from collections import namedtuple
# Board geometry used when a Board is created without explicit sizes.
DEFAULT_HEIGHT, DEFAULT_WIDTH = 6, 7
DEFAULT_WIN_LENGTH = 4

# Result of a win check: whether the game is over and, if so, who won.
WinState = namedtuple('WinState', ['is_ended', 'winner'])
class Board():
    """
    Connect4 Board.

    The position is held in ``np_pieces``, a (height, width) array in which
    0 marks an empty cell and any other value is the stone of that player.
    Row 0 is the top row.
    """

    def __init__(self,
                 height=None,
                 width=None,
                 win_length=None,
                 np_pieces=None):
        """Set up initial board configuration.

        Args:
            height, width, win_length: board geometry; a falsy value selects
                the module default.
            np_pieces: existing piece array to wrap (not copied); an empty
                board is created when None.
        """
        self.height = height or DEFAULT_HEIGHT
        self.width = width or DEFAULT_WIDTH
        self.win_length = win_length or DEFAULT_WIN_LENGTH

        if np_pieces is None:
            # Builtin int is what the np.int alias pointed to; the alias
            # itself was removed in NumPy 1.24.
            self.np_pieces = np.zeros([self.height, self.width], dtype=int)
        else:
            self.np_pieces = np_pieces
            assert self.np_pieces.shape == (self.height, self.width)

    def add_stone(self, column, player):
        """Drop a stone of ``player`` into ``column``, modifying this board.

        Raises:
            ValueError: if the column has no empty cell.
        """
        available_idx, = np.where(self.np_pieces[:, column] == 0)
        if len(available_idx) == 0:
            raise ValueError(
                "Can't play column %s on board %s" % (column, self))

        # The largest empty row index is the lowest free cell.
        self.np_pieces[available_idx[-1]][column] = player

    def get_valid_moves(self):
        "Any zero value in top row is a valid move"
        return self.np_pieces[0] == 0

    def get_win_state(self):
        """Return a WinState: a win for either player, a draw (ended with
        winner None) when no move is left, or not ended."""
        for player in [-1, 1]:
            player_pieces = self.np_pieces == -player
            # Check rows & columns for win
            if (self._is_straight_winner(player_pieces)
                    or self._is_straight_winner(player_pieces.transpose())
                    or self._is_diagonal_winner(player_pieces)):
                return WinState(True, -player)

        # draw has very little value.
        if not self.get_valid_moves().any():
            return WinState(True, None)

        # Game is not ended yet.
        return WinState(False, None)

    def with_np_pieces(self, np_pieces):
        """Create a board with this board's geometry and the given pieces.

        The array is wrapped, not copied; when ``np_pieces`` is None this
        board's own array is shared with the new board.
        """
        if np_pieces is None:
            np_pieces = self.np_pieces
        return Board(self.height, self.width, self.win_length, np_pieces)

    def _is_diagonal_winner(self, player_pieces):
        """Checks if player_pieces contains a diagonal win."""
        win_length = self.win_length
        n_rows = len(player_pieces)
        n_cols = len(player_pieces[0])
        for i in range(n_rows - win_length + 1):
            # Diagonals running down and to the right.
            for j in range(n_cols - win_length + 1):
                if all(player_pieces[i + x][j + x] for x in range(win_length)):
                    return True
            # Diagonals running down and to the left.
            for j in range(win_length - 1, n_cols):
                if all(player_pieces[i + x][j - x] for x in range(win_length)):
                    return True
        return False

    def _is_straight_winner(self, player_pieces):
        """Checks if any row of player_pieces holds win_length stones in a row.

        Pass the transposed array to check columns. ``player_pieces`` is a
        boolean mask of one player's stones.
        """
        # Slide the window across the columns. The number of start positions
        # depends on the column count (not the row count), so boards of any
        # shape are covered; a board narrower than win_length has none.
        n_cols = player_pieces.shape[1]
        for start in range(n_cols - self.win_length + 1):
            window = player_pieces[:, start:start + self.win_length]
            if window.all(axis=1).any():
                return True
        return False

    def __str__(self):
        return str(self.np_pieces)
class Connect4Game(object):
    """
    Connect4 Game class implementing the alpha-zero-general Game interface.

    Use 1 for player1 and -1 for player2.
    """

    def __init__(self,
                 height=None,
                 width=None,
                 win_length=None,
                 np_pieces=None):
        # Template board: supplies the geometry for every board built later.
        self._base_board = Board(height, width, win_length, np_pieces)

    def getInitBoard(self):
        """
        Returns:
            startBoard: a representation of the board (ideally this is the form
                        that will be the input to your neural network)
        """
        return self._base_board.np_pieces

    def getBoardSize(self):
        """
        Returns:
            (x,y): a tuple of board dimensions
        """
        return (self._base_board.height, self._base_board.width)

    def getActionSize(self):
        """
        Returns:
            actionSize: number of all possible actions
        """
        return self._base_board.width

    def getNextState(self, board, player, action):
        """Returns a copy of the board with updated move, original board is unmodified.

        Input:
            board: current board
            player: current player (1 or -1)
            action: action taken by current player

        Returns:
            nextBoard: board after applying action
            nextPlayer: player who plays in the next turn (should be -player)
        """
        # Work on a copy so the caller's array is left untouched.
        b = self._base_board.with_np_pieces(np_pieces=np.copy(board))
        b.add_stone(action, player)
        return b.np_pieces, -player

    def getValidMoves(self, board, player):
        """Any zero value in top row is a valid move.

        Input:
            board: current board
            player: current player

        Returns:
            validMoves: a boolean vector of length self.getActionSize(), True
                        for moves that are valid from the current board,
                        False for invalid moves
        """
        return self._base_board.with_np_pieces(
            np_pieces=board).get_valid_moves()

    def getGameEnded(self, board, player):
        """
        Input:
            board: current board
            player: current player (1 or -1)

        Returns:
            r: 0 if game has not ended. 1 if player won, -1 if player lost,
               small non-zero value for draw.
        """
        b = self._base_board.with_np_pieces(np_pieces=board)
        winstate = b.get_win_state()
        if winstate.is_ended:
            if winstate.winner is None:
                # draw has very little value.
                return 1e-4
            elif winstate.winner == player:
                return +1
            elif winstate.winner == -player:
                return -1
            else:
                raise ValueError('Unexpected winstate found: ', winstate)
        else:
            # 0 used to represent unfinished game.
            return 0

    def getCanonicalForm(self, board, player):
        """
        Input:
            board: current board
            player: current player (1 or -1)

        Returns:
            canonicalBoard: returns canonical form of board. The canonical form
                            should be independent of player. For e.g. in chess,
                            the canonical form can be chosen to be from the pov
                            of white. When the player is white, we can return
                            board as is. When the player is black, we can invert
                            the colors and return the board.
        """
        # Multiplying by -1 swaps the two players' stones.
        return board * player

    def getSymmetries(self, board, pi):
        """Board is left/right board symmetric

        Input:
            board: current board
            pi: policy vector of size self.getActionSize()

        Returns:
            symmForms: a list of [(board,pi)] where each tuple is a symmetrical
                       form of the board and the corresponding pi vector. This
                       is used when training the neural network from examples.
        """
        return [(board, pi),
                (np.array(board[:, ::-1], copy=True),
                 np.array(pi[::-1], copy=True))]

    def stringRepresentation(self, board):
        """
        Input:
            board: current board

        Returns:
            boardString: a quick conversion of board to a bytes object.
                         Required by MCTS for hashing.
        """
        # tobytes() returns exactly what the old tostring() alias returned;
        # the alias was deprecated in NumPy 1.19 and removed in NumPy 2.0.
        return board.tobytes()

    @staticmethod
    def display(board):
        print(" -----------------------")
        print(' '.join(map(str, range(len(board[0])))))
        print(board)
        print(" -----------------------")
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Connect4Model(parl.Model):
    """Policy/value network: four conv+batch-norm layers, two fully
    connected layers with dropout, then a policy head and a value head."""

    def __init__(self, game, args):
        # game params
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()
        self.args = args

        super(Connect4Model, self).__init__()

        channels = args.num_channels
        # conv1/conv2 are padded and keep the board size; conv3/conv4 are
        # unpadded 3x3 convolutions and each shrink both dimensions by 2.
        self.conv1 = nn.Conv2d(1, channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(channels, channels, 3, stride=1)
        self.conv4 = nn.Conv2d(channels, channels, 3, stride=1)

        self.bn1 = nn.BatchNorm2d(channels)
        self.bn2 = nn.BatchNorm2d(channels)
        self.bn3 = nn.BatchNorm2d(channels)
        self.bn4 = nn.BatchNorm2d(channels)

        flat_size = channels * (self.board_x - 4) * (self.board_y - 4)
        self.fc1 = nn.Linear(flat_size, 128)
        self.fc_bn1 = nn.BatchNorm1d(128)

        self.fc2 = nn.Linear(128, 64)
        self.fc_bn2 = nn.BatchNorm1d(64)

        self.fc3 = nn.Linear(64, self.action_size)  # policy head
        self.fc4 = nn.Linear(64, 1)  # value head

    def forward(self, s):
        """
        Args:
            s(torch.Tensor): batch_size x board_x x board_y

        Returns:
            tuple of (log-softmax policy: batch_size x action_size,
                      tanh value: batch_size x 1)
        """
        # batch_size x 1 x board_x x board_y
        x = s.view(-1, 1, self.board_x, self.board_y)

        # After the four stages: batch_size x num_channels x (board_x-4) x (board_y-4)
        conv_stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, bn in conv_stages:
            x = F.relu(bn(conv(x)))

        flat_size = (
            self.args.num_channels * (self.board_x - 4) * (self.board_y - 4))
        x = x.view(-1, flat_size)

        # batch_size x 128, then batch_size x 64
        for fc, bn in ((self.fc1, self.fc_bn1), (self.fc2, self.fc_bn2)):
            x = F.dropout(
                F.relu(bn(fc(x))),
                p=self.args.dropout,
                training=self.training)

        pi = self.fc3(x)  # batch_size x action_size
        v = self.fc4(x)  # batch_size x 1

        return F.log_softmax(pi, dim=1), torch.tanh(v)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import base64
import inspect
import os
# Usage: pass the path of the saved model as the only command-line argument.
assert len(sys.argv) == 2, "please specify model path."
model_path = sys.argv[1]

with open(model_path, 'rb') as model_file:
    raw_bytes = model_file.read()

# encode weights of model to byte string
encoded_weights = base64.encodebytes(raw_bytes)

# Header of the generated file: it embeds the encoded weights as a bytes
# literal and decodes them into `decoded` when the submission is run.
submission_file = """
import base64
decoded = base64.b64decode({})
""".format(encoded_weights)

# insert code snippet of loading weights
with open('submission_template.py', 'r') as template_file:
    submission_file += template_file.read()

# generate final submission file
with open('submission.py', 'w') as output_file:
    output_file.write(submission_file)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from Coach import Coach
from connect4_game import Connect4Game
from utils import *
from parl.utils import logger
args = dotdict(
    # master address of xparl cluster
    master_address='localhost:8010',
    # number of remote actors (execute tasks
    # [self-play/pitting/evaluate_test_dataset] in parallel).
    actors_num=25,
    # total number of iterations
    numIters=200,
    # Number of complete self-play games to simulate during a new iteration.
    numEps=500,
    # Number of games to play during arena (pitting) play to determine if
    # the new neural network will be accepted.
    arenaCompare=50,
    # Number of game moves for MCTS to simulate.
    numMCTSSims=800,
    # temp=1 (Temperature, tau) if episodeStep < tempThresholdStep, and
    # thereafter uses temp=0.
    tempThresholdStep=15,
    # During arena playoff, the new neural net will be accepted if this
    # fraction or more of the games are won.
    updateThreshold=0.6,
    # CPUCT parameter
    cpuct=4,
    # alpha parameter of the dirichlet noise which is added to the policy (pi)
    dirichletAlpha=1.0,
    # history of examples from the numItersForTrainExamplesHistory latest
    # iterations (training data)
    numItersForTrainExamplesHistory=20,
    # folder to save model and training examples
    checkpoint='./saved_model/',
    # whether to load saved model and training examples
    load_model=False,
    load_folder_file=('./saved_model', 'checkpoint_1.pth.tar'),
)

# Plays arenaCompare games in which player1 starts arenaCompare/2 games and
# player2 starts arenaCompare/2 games.
assert args.arenaCompare % 2 == 0

# make sure the tasks can be split evenly among different remote actors
assert args.numEps % args.actors_num == 0
assert (args.arenaCompare // 2) % args.actors_num == 0
# there are 1000 board states in test_dataset
assert 1000 % args.actors_num == 0
def main():
    """Create the Coach for Connect4, optionally restore a saved model and
    its training examples, then start learning."""
    coach = Coach(Connect4Game(), args)

    if args.load_model:
        checkpoint = args.load_folder_file
        logger.info('Loading checkpoint {}...'.format(checkpoint))
        coach.loadModel()
        logger.info(
            "Loading 'trainExamples' from file {}...".format(checkpoint))
        coach.loadTrainExamples()

    coach.learn()


if __name__ == "__main__":
    main()
# Third party code
#
# The following code is copied or modified from:
# https://github.com/suragnair/alpha-zero-general
import os
# Set OMP_NUM_THREADS to "1" before the numeric libraries below are imported.
# NOTE(review): presumably this keeps numpy/torch single-threaded inside the
# submission sandbox - confirm the intent.
os.environ['OMP_NUM_THREADS'] = "1"
# ===== utils.py =====
class dotdict(dict):
    """dict whose keys can also be read as attributes (``d.key`` is ``d['key']``)."""

    def __getattr__(self, name):
        # __getattr__ has to report a missing attribute with AttributeError.
        # Letting the KeyError escape breaks hasattr(), getattr() with a
        # default, and protocol lookups such as those done by copy/pickle.
        try:
            return self[name]
        except KeyError:
            raise AttributeError(name)
# ===== MCTS.py ======
import math
import time
import numpy as np
# Small constant added to the visit count under the square root for unvisited
# edges in MCTS.search, so their exploration term is non-zero when Ns[s] is 0.
EPS = 1e-8
class MCTS():
    """
    This class handles the MCTS tree.

    All statistics live in dictionaries keyed by the board's string
    representation ``s`` (from game.stringRepresentation) or by ``(s, a)``
    pairs, where ``a`` is an action index.
    """

    def __init__(self, game, nn_agent, args, dirichlet_noise=False):
        """
        Args:
            game: game object providing the rules and board helpers
            nn_agent: object whose predict(board) returns (policy, value)
            args: settings object; this class reads args.cpuct and, when
                  noise is applied, args.dirichletAlpha
            dirichlet_noise: whether getActionProb applies Dirichlet noise
                             to the prior of the root node
        """
        self.game = game
        self.nn_agent = nn_agent
        self.args = args
        self.dirichlet_noise = dirichlet_noise
        self.Qsa = {}  # stores Q values for s,a (as defined in the paper)
        self.Nsa = {}  # stores #times edge s,a was visited
        self.Ns = {}  # stores #times board s was visited
        self.Ps = {}  # stores initial policy (returned by neural net)

        self.Es = {}  # stores game.getGameEnded ended for board s
        self.Vs = {}  # stores game.getValidMoves for board s

    def getActionProb(self, canonicalBoard, temp=1, timelimit=4.9):
        """
        This function runs MCTS simulations starting from canonicalBoard
        until `timelimit` seconds have elapsed (the budget is wall-clock
        time, not a fixed number of simulations).

        Args:
            canonicalBoard: board in canonical form to search from
            temp: 0 returns a one-hot vector on the most visited action
                  (ties are broken at random); otherwise visit counts are
                  raised to 1/temp and normalised
            timelimit: search time budget in seconds

        Returns:
            probs: a policy vector where the probability of the ith action is
                   proportional to Nsa[(s,a)]**(1./temp)
        """
        dir_noise = self.dirichlet_noise
        start_time = time.time()
        while time.time() - start_time < timelimit:
            self.search(canonicalBoard, dirichlet_noise=dir_noise)

        s = self.game.stringRepresentation(canonicalBoard)
        # Visit count of every action from the root; 0 for unexplored edges.
        counts = [
            self.Nsa[(s, a)] if (s, a) in self.Nsa else 0
            for a in range(self.game.getActionSize())
        ]

        if temp == 0:
            bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten()
            bestA = np.random.choice(bestAs)
            probs = [0] * len(counts)
            probs[bestA] = 1
            return probs

        counts = [x**(1. / temp) for x in counts]
        counts_sum = float(sum(counts))
        probs = [x / counts_sum for x in counts]
        return probs

    def search(self, canonicalBoard, dirichlet_noise=False):
        """
        This function performs one iteration of MCTS. It is recursively called
        till a leaf node is found. The action chosen at each node is one that
        has the maximum upper confidence bound as in the paper.

        Once a leaf node is found, the neural network is called to return an
        initial policy P and a value v for the state. This value is propagated
        up the search path. In case the leaf node is a terminal state, the
        outcome is propagated up the search path. The values of Ns, Nsa, Qsa are
        updated.

        NOTE: the return values are the negative of the value of the current
        state. This is done since v is in [-1,1] and if v is the value of a
        state for the current player, then its value is -v for the other player.

        Args:
            canonicalBoard: board in canonical form
            dirichlet_noise: apply Dirichlet noise to this node's prior; the
                             recursive call below does not pass it on, so
                             only the node searched directly receives noise

        Returns:
            v: the negative of the value of the current canonicalBoard
        """

        s = self.game.stringRepresentation(canonicalBoard)

        # Cache the game-ended result for each board.
        if s not in self.Es:
            self.Es[s] = self.game.getGameEnded(canonicalBoard, 1)
        if self.Es[s] != 0:
            # terminal node
            return -self.Es[s]

        if s not in self.Ps:
            # leaf node
            self.Ps[s], v = self.nn_agent.predict(canonicalBoard)

            valids = self.game.getValidMoves(canonicalBoard, 1)
            self.Ps[s] = self.Ps[s] * valids  # masking invalid moves
            if dirichlet_noise:
                self.applyDirNoise(s, valids)
            sum_Ps_s = np.sum(self.Ps[s])
            if sum_Ps_s > 0:
                self.Ps[s] /= sum_Ps_s  # renormalize
            else:
                # if all valid moves were masked make all valid moves equally probable

                # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else.
                # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
                print("All valid moves were masked, doing a workaround.")
                self.Ps[s] = self.Ps[s] + valids
                self.Ps[s] /= np.sum(self.Ps[s])

            self.Vs[s] = valids
            self.Ns[s] = 0
            return -v

        # Already expanded node: reuse the cached valid-move mask.
        valids = self.Vs[s]
        if dirichlet_noise:
            self.applyDirNoise(s, valids)
            sum_Ps_s = np.sum(self.Ps[s])
            self.Ps[s] /= sum_Ps_s  # renormalize
        cur_best = -float('inf')
        best_act = -1

        # pick the action with the highest upper confidence bound
        for a in range(self.game.getActionSize()):
            if valids[a]:
                if (s, a) in self.Qsa:
                    u = self.Qsa[
                        (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt(
                            self.Ns[s]) / (1 + self.Nsa[(s, a)])
                else:
                    u = self.args.cpuct * self.Ps[s][a] * math.sqrt(
                        self.Ns[s] + EPS)  # Q = 0 ?

                if u > cur_best:
                    cur_best = u
                    best_act = a

        a = best_act
        next_s, next_player = self.game.getNextState(canonicalBoard, 1, a)
        # Re-express the child position from the point of view of its mover.
        next_s = self.game.getCanonicalForm(next_s, next_player)

        v = self.search(next_s)

        if (s, a) in self.Qsa:
            # Running average of the values backed up through this edge.
            self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[
                (s, a)] + v) / (self.Nsa[(s, a)] + 1)
            self.Nsa[(s, a)] += 1

        else:
            self.Qsa[(s, a)] = v
            self.Nsa[(s, a)] = 1

        self.Ns[s] += 1
        return -v

    def applyDirNoise(self, s, valids):
        """Mix Dirichlet noise into the stored prior Ps[s], in place.

        One noise value is drawn per non-zero entry of `valids`; each
        non-zero entry of Ps[s] becomes 0.75 * prior + 0.25 * noise, and
        zero entries are left untouched. The result is not renormalised here.
        """
        dir_values = np.random.dirichlet(
            [self.args.dirichletAlpha] * np.count_nonzero(valids))
        dir_idx = 0
        for idx in range(len(self.Ps[s])):
            if self.Ps[s][idx]:
                self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + (
                    0.25 * dir_values[dir_idx])
                dir_idx += 1
# ===== connect4_game.py ======
import numpy as np
from collections import namedtuple
# Board geometry used when a Board is created without explicit sizes.
DEFAULT_HEIGHT, DEFAULT_WIDTH = 6, 7
DEFAULT_WIN_LENGTH = 4

# Result of a win check: whether the game is over and, if so, who won.
WinState = namedtuple('WinState', ['is_ended', 'winner'])
class Board():
    """
    Connect4 Board.

    The position is held in ``np_pieces``, a (height, width) array in which
    0 marks an empty cell and any other value is the stone of that player.
    Row 0 is the top row.
    """

    def __init__(self,
                 height=None,
                 width=None,
                 win_length=None,
                 np_pieces=None):
        """Set up initial board configuration.

        Args:
            height, width, win_length: board geometry; a falsy value selects
                the module default.
            np_pieces: existing piece array to wrap (not copied); an empty
                board is created when None.
        """
        self.height = height or DEFAULT_HEIGHT
        self.width = width or DEFAULT_WIDTH
        self.win_length = win_length or DEFAULT_WIN_LENGTH

        if np_pieces is None:
            # Builtin int is what the np.int alias pointed to; the alias
            # itself was removed in NumPy 1.24.
            self.np_pieces = np.zeros([self.height, self.width], dtype=int)
        else:
            self.np_pieces = np_pieces
            assert self.np_pieces.shape == (self.height, self.width)

    def add_stone(self, column, player):
        """Drop a stone of ``player`` into ``column``, modifying this board.

        Raises:
            ValueError: if the column has no empty cell.
        """
        available_idx, = np.where(self.np_pieces[:, column] == 0)
        if len(available_idx) == 0:
            raise ValueError(
                "Can't play column %s on board %s" % (column, self))

        # The largest empty row index is the lowest free cell.
        self.np_pieces[available_idx[-1]][column] = player

    def get_valid_moves(self):
        "Any zero value in top row is a valid move"
        return self.np_pieces[0] == 0

    def get_win_state(self):
        """Return a WinState: a win for either player, a draw (ended with
        winner None) when no move is left, or not ended."""
        for player in [-1, 1]:
            player_pieces = self.np_pieces == -player
            # Check rows & columns for win
            if (self._is_straight_winner(player_pieces)
                    or self._is_straight_winner(player_pieces.transpose())
                    or self._is_diagonal_winner(player_pieces)):
                return WinState(True, -player)

        # draw has very little value.
        if not self.get_valid_moves().any():
            return WinState(True, None)

        # Game is not ended yet.
        return WinState(False, None)

    def with_np_pieces(self, np_pieces):
        """Create a board with this board's geometry and the given pieces.

        The array is wrapped, not copied; when ``np_pieces`` is None this
        board's own array is shared with the new board.
        """
        if np_pieces is None:
            np_pieces = self.np_pieces
        return Board(self.height, self.width, self.win_length, np_pieces)

    def _is_diagonal_winner(self, player_pieces):
        """Checks if player_pieces contains a diagonal win."""
        win_length = self.win_length
        n_rows = len(player_pieces)
        n_cols = len(player_pieces[0])
        for i in range(n_rows - win_length + 1):
            # Diagonals running down and to the right.
            for j in range(n_cols - win_length + 1):
                if all(player_pieces[i + x][j + x] for x in range(win_length)):
                    return True
            # Diagonals running down and to the left.
            for j in range(win_length - 1, n_cols):
                if all(player_pieces[i + x][j - x] for x in range(win_length)):
                    return True
        return False

    def _is_straight_winner(self, player_pieces):
        """Checks if any row of player_pieces holds win_length stones in a row.

        Pass the transposed array to check columns. ``player_pieces`` is a
        boolean mask of one player's stones.
        """
        # Slide the window across the columns. The number of start positions
        # depends on the column count (not the row count), so boards of any
        # shape are covered; a board narrower than win_length has none.
        n_cols = player_pieces.shape[1]
        for start in range(n_cols - self.win_length + 1):
            window = player_pieces[:, start:start + self.win_length]
            if window.all(axis=1).any():
                return True
        return False

    def __str__(self):
        return str(self.np_pieces)
class Connect4Game(object):
    """
    Connect4 Game class implementing the alpha-zero-general Game interface.

    Use 1 for player1 and -1 for player2.
    """

    def __init__(self,
                 height=None,
                 width=None,
                 win_length=None,
                 np_pieces=None):
        # Template board: supplies the geometry for every board built later.
        self._base_board = Board(height, width, win_length, np_pieces)

    def getInitBoard(self):
        """
        Returns:
            startBoard: a representation of the board (ideally this is the form
                        that will be the input to your neural network)
        """
        return self._base_board.np_pieces

    def getBoardSize(self):
        """
        Returns:
            (x,y): a tuple of board dimensions
        """
        return (self._base_board.height, self._base_board.width)

    def getActionSize(self):
        """
        Returns:
            actionSize: number of all possible actions
        """
        return self._base_board.width

    def getNextState(self, board, player, action):
        """Returns a copy of the board with updated move, original board is unmodified.

        Input:
            board: current board
            player: current player (1 or -1)
            action: action taken by current player

        Returns:
            nextBoard: board after applying action
            nextPlayer: player who plays in the next turn (should be -player)
        """
        # Work on a copy so the caller's array is left untouched.
        b = self._base_board.with_np_pieces(np_pieces=np.copy(board))
        b.add_stone(action, player)
        return b.np_pieces, -player

    def getValidMoves(self, board, player):
        """Any zero value in top row is a valid move.

        Input:
            board: current board
            player: current player

        Returns:
            validMoves: a boolean vector of length self.getActionSize(), True
                        for moves that are valid from the current board,
                        False for invalid moves
        """
        return self._base_board.with_np_pieces(
            np_pieces=board).get_valid_moves()

    def getGameEnded(self, board, player):
        """
        Input:
            board: current board
            player: current player (1 or -1)

        Returns:
            r: 0 if game has not ended. 1 if player won, -1 if player lost,
               small non-zero value for draw.
        """
        b = self._base_board.with_np_pieces(np_pieces=board)
        winstate = b.get_win_state()
        if winstate.is_ended:
            if winstate.winner is None:
                # draw has very little value.
                return 1e-4
            elif winstate.winner == player:
                return +1
            elif winstate.winner == -player:
                return -1
            else:
                raise ValueError('Unexpected winstate found: ', winstate)
        else:
            # 0 used to represent unfinished game.
            return 0

    def getCanonicalForm(self, board, player):
        """
        Input:
            board: current board
            player: current player (1 or -1)

        Returns:
            canonicalBoard: returns canonical form of board. The canonical form
                            should be independent of player. For e.g. in chess,
                            the canonical form can be chosen to be from the pov
                            of white. When the player is white, we can return
                            board as is. When the player is black, we can invert
                            the colors and return the board.
        """
        # Multiplying by -1 swaps the two players' stones.
        return board * player

    def getSymmetries(self, board, pi):
        """Board is left/right board symmetric

        Input:
            board: current board
            pi: policy vector of size self.getActionSize()

        Returns:
            symmForms: a list of [(board,pi)] where each tuple is a symmetrical
                       form of the board and the corresponding pi vector. This
                       is used when training the neural network from examples.
        """
        return [(board, pi),
                (np.array(board[:, ::-1], copy=True),
                 np.array(pi[::-1], copy=True))]

    def stringRepresentation(self, board):
        """
        Input:
            board: current board

        Returns:
            boardString: a quick conversion of board to a bytes object.
                         Required by MCTS for hashing.
        """
        # tobytes() returns exactly what the old tostring() alias returned;
        # the alias was deprecated in NumPy 1.19 and removed in NumPy 2.0.
        return board.tobytes()

    @staticmethod
    def display(board):
        print(" -----------------------")
        print(' '.join(map(str, range(len(board[0])))))
        print(board)
        print(" -----------------------")
# ===== connect4_model ======
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
# Derives from nn.Module instead of parl.Model because Kaggle doesn't support
# the parl package.
class Connect4Model(nn.Module):
    """Policy/value network: four conv+batch-norm layers, two fully
    connected layers with dropout, then a policy head and a value head."""

    def __init__(self, game, args):
        # game params
        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()
        self.args = args

        super(Connect4Model, self).__init__()

        channels = args.num_channels
        # conv1/conv2 are padded and keep the board size; conv3/conv4 are
        # unpadded 3x3 convolutions and each shrink both dimensions by 2.
        self.conv1 = nn.Conv2d(1, channels, 3, stride=1, padding=1)
        self.conv2 = nn.Conv2d(channels, channels, 3, stride=1, padding=1)
        self.conv3 = nn.Conv2d(channels, channels, 3, stride=1)
        self.conv4 = nn.Conv2d(channels, channels, 3, stride=1)

        self.bn1 = nn.BatchNorm2d(channels)
        self.bn2 = nn.BatchNorm2d(channels)
        self.bn3 = nn.BatchNorm2d(channels)
        self.bn4 = nn.BatchNorm2d(channels)

        flat_size = channels * (self.board_x - 4) * (self.board_y - 4)
        self.fc1 = nn.Linear(flat_size, 128)
        self.fc_bn1 = nn.BatchNorm1d(128)

        self.fc2 = nn.Linear(128, 64)
        self.fc_bn2 = nn.BatchNorm1d(64)

        self.fc3 = nn.Linear(64, self.action_size)  # policy head
        self.fc4 = nn.Linear(64, 1)  # value head

    def forward(self, s):
        """
        Args:
            s(torch.Tensor): batch_size x board_x x board_y

        Returns:
            tuple of (log-softmax policy: batch_size x action_size,
                      tanh value: batch_size x 1)
        """
        # batch_size x 1 x board_x x board_y
        x = s.view(-1, 1, self.board_x, self.board_y)

        # After the four stages: batch_size x num_channels x (board_x-4) x (board_y-4)
        conv_stages = (
            (self.conv1, self.bn1),
            (self.conv2, self.bn2),
            (self.conv3, self.bn3),
            (self.conv4, self.bn4),
        )
        for conv, bn in conv_stages:
            x = F.relu(bn(conv(x)))

        flat_size = (
            self.args.num_channels * (self.board_x - 4) * (self.board_y - 4))
        x = x.view(-1, flat_size)

        # batch_size x 128, then batch_size x 64
        for fc, bn in ((self.fc1, self.fc_bn1), (self.fc2, self.fc_bn2)):
            x = F.dropout(
                F.relu(bn(fc(x))),
                p=self.args.dropout,
                training=self.training)

        pi = self.fc3(x)  # batch_size x action_size
        v = self.fc4(x)  # batch_size x 1

        return F.log_softmax(pi, dim=1), torch.tanh(v)
# ===== simple agent ======
# Network settings read by Connect4Model when SimpleAgent builds it.
args = dotdict(
    dropout=0.3,
    num_channels=64,
)
class SimpleAgent():
    """Inference-only wrapper around Connect4Model."""

    def __init__(self, game, cuda=True):
        # Use the GPU only when it was requested and is actually available.
        self.cuda = cuda and torch.cuda.is_available()

        self.model = Connect4Model(game, args)
        if self.cuda:
            self.model.cuda()

        self.board_x, self.board_y = game.getBoardSize()
        self.action_size = game.getActionSize()

    def predict(self, board):
        """
        Args:
            board (np.array): input board

        Return:
            pi (np.array): probability of actions
            v (np.array): estimated value of input
        """
        # preparing input: a single board becomes a batch of one
        tensor = torch.FloatTensor(board.astype(np.float64))
        if self.cuda:
            tensor = tensor.contiguous().cuda()
        tensor = tensor.view(1, self.board_x, self.board_y)

        self.model.eval()  # eval mode
        with torch.no_grad():
            log_pi, v = self.model(tensor)

        # The model returns log-probabilities; convert them back.
        pi = torch.exp(log_pi)
        return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0]

    def load_checkpoint(self, buffer):
        """Load model weights from ``buffer`` (anything torch.load accepts);
        tensors are mapped to the CPU when CUDA is not in use."""
        if self.cuda:
            map_location = None
        else:
            map_location = 'cpu'
        state = torch.load(buffer, map_location=map_location)
        self.model.load_state_dict(state)
# ===== predict function ======
import base64
import io
# Module-level setup: everything below runs once, when the submission file is
# loaded. `game` and `mcts` are then reused by alphazero_agent on every call.
game = Connect4Game()

# AlphaZero players
agent = SimpleAgent(game)
# NOTE(review): `decoded` is not defined in this template; it is expected to
# come from the header that the submission generator prepends
# (`decoded = base64.b64decode(...)`) - confirm when changing either file.
buffer = io.BytesIO(decoded)
agent.load_checkpoint(buffer)

# The MCTS class in this file reads `cpuct` from these settings; its search
# loop is bounded by the `timelimit` argument rather than by `numMCTSSims`.
mcts_args = dotdict({'numMCTSSims': 800, 'cpuct': 1.0})
mcts = MCTS(game, agent, mcts_args)
def alphazero_agent(obs, config):
    """Choose a move for the given observation.

    Args:
        obs: observation providing `board` (flat cell sequence in which
             mark 2 denotes the second player) and `mark` (this agent's mark)
        config: configuration providing `timeout`; the search is given
                `timeout - 0.5` seconds

    Returns:
        int: index of the chosen action
    """
    # Reshape the flat board and recode mark 2 as -1, the internal
    # representation of the second player.
    board = np.reshape(obs.board.copy(), game.getBoardSize()).astype(int)
    board[board == 2] = -1

    player = -1 if obs.mark == 2 else 1
    canonical = game.getCanonicalForm(board, player)

    probs = mcts.getActionProb(
        canonical, temp=0, timelimit=config.timeout - 0.5)
    return int(np.argmax(probs))
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class dotdict(dict):
    """Dictionary whose items can also be read as attributes (``d.key``)."""

    def __getattr__(self, name):
        # Reached only after normal attribute lookup has failed; a missing
        # key is reported as AttributeError so hasattr()/getattr() behave.
        if name in self:
            return self[name]
        raise AttributeError(name)
def win_loss_draw(score):
    """Map a numeric score to 'win' (> 0), 'loss' (< 0) or 'draw'."""
    if score > 0:
        outcome = 'win'
    elif score < 0:
        outcome = 'loss'
    else:
        outcome = 'draw'
    return outcome
"""
split one list to multiple lists
"""
def split_group(the_list, group_size):
    """Split one iterable into consecutive groups of ``group_size`` items.

    Args:
        the_list: iterable to split.
        group_size (int): number of items per group.

    Returns:
        The result of ``zip`` over ``group_size`` references to one shared
        iterator, i.e. tuples of ``group_size`` consecutive items. Trailing
        items that do not fill a complete group are dropped.
    """
    # Every zip argument is the SAME iterator, so each output tuple consumes
    # group_size consecutive items.
    shared_iter = iter(the_list)
    return zip(*(shared_iter, ) * group_size)
import numpy as np
import json
from connect4_game import Connect4Game
def get_test_dataset():
    """Load the reference positions stored in the file ``refmoves1k_kaggle``.

    Each line of the file is parsed as a JSON object with a ``board`` entry
    and a ``move score`` entry.

    Returns:
        list of dict: one entry per line with the keys ``'board'`` (integer
        array reshaped to the game's board size, mark 2 re-encoded as -1),
        ``'player'`` (1 or -1) and ``'move_score'``.
    """
    game = Connect4Game()
    samples = []
    with open("refmoves1k_kaggle") as f:
        for line in f:
            record = json.loads(line)
            cells = record["board"]
            board = np.reshape(cells, game.getBoardSize()).astype(int)
            board[np.where(board == 2)] = -1
            # The number of occupied cells is the number of moves played so
            # far; its parity decides which player is to move.
            ply = sum(1 for cell in cells if cell > 0)
            player = -1 if ply & 1 else 1
            samples.append({
                'board': board,
                'player': player,
                'move_score': record['move score'],
            })
    return samples
...@@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind ...@@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind
from parl.utils.window_stat import WindowStat from parl.utils.window_stat import WindowStat
from parl.utils.time_stat import TimeStat from parl.utils.time_stat import TimeStat
from parl.utils import machine_info from parl.utils import machine_info
from parl.utils import logger, get_gpu_count, tensorboard from parl.utils import logger, get_gpu_count, summary
from parl.algorithms import A2C from parl.algorithms import A2C
from atari_model import ActorCritic from atari_model import ActorCritic
...@@ -205,19 +205,19 @@ class Learner(object): ...@@ -205,19 +205,19 @@ class Learner(object):
} }
if metric['mean_episode_rewards'] is not None: if metric['mean_episode_rewards'] is not None:
tensorboard.add_scalar('train/mean_reward', summary.add_scalar('train/mean_reward',
metric['mean_episode_rewards'], metric['mean_episode_rewards'],
self.sample_total_steps) self.sample_total_steps)
tensorboard.add_scalar('train/total_loss', metric['total_loss'], summary.add_scalar('train/total_loss', metric['total_loss'],
self.sample_total_steps) self.sample_total_steps)
tensorboard.add_scalar('train/pi_loss', metric['pi_loss'], summary.add_scalar('train/pi_loss', metric['pi_loss'],
self.sample_total_steps) self.sample_total_steps)
tensorboard.add_scalar('train/vf_loss', metric['vf_loss'], summary.add_scalar('train/vf_loss', metric['vf_loss'],
self.sample_total_steps) self.sample_total_steps)
tensorboard.add_scalar('train/entropy', metric['entropy'], summary.add_scalar('train/entropy', metric['entropy'],
self.sample_total_steps) self.sample_total_steps)
tensorboard.add_scalar('train/learn_rate', metric['lr'], summary.add_scalar('train/learn_rate', metric['lr'],
self.sample_total_steps) self.sample_total_steps)
logger.info(metric) logger.info(metric)
......
...@@ -16,16 +16,16 @@ import numpy as np ...@@ -16,16 +16,16 @@ import numpy as np
import copy import copy
from collections import deque, namedtuple from collections import deque, namedtuple
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver'])
class ReplayMemory(object): class ReplayMemory(object):
def __init__(self, max_size, state_shape, context_len): def __init__(self, max_size, obs_shape, context_len):
self.max_size = int(max_size) self.max_size = int(max_size)
self.state_shape = state_shape self.obs_shape = obs_shape
self.context_len = int(context_len) self.context_len = int(context_len)
self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8')
self.action = np.zeros((self.max_size, ), dtype='int32') self.action = np.zeros((self.max_size, ), dtype='int32')
self.reward = np.zeros((self.max_size, ), dtype='float32') self.reward = np.zeros((self.max_size, ), dtype='float32')
self.isOver = np.zeros((self.max_size, ), dtype='bool') self.isOver = np.zeros((self.max_size, ), dtype='bool')
...@@ -48,42 +48,41 @@ class ReplayMemory(object): ...@@ -48,42 +48,41 @@ class ReplayMemory(object):
else: else:
self._context.append(exp) self._context.append(exp)
def recent_state(self): def recent_obs(self):
""" maintain recent state for training""" """ maintain recent obs for training"""
lst = list(self._context) lst = list(self._context)
states = [np.zeros(self.state_shape, dtype='uint8')] * \ obs = [np.zeros(self.obs_shape, dtype='uint8')] * \
(self._context.maxlen - len(lst)) (self._context.maxlen - len(lst))
states.extend([k.state for k in lst]) obs.extend([k.obs for k in lst])
return states return obs
def sample(self, idx): def sample(self, idx):
""" return state, action, reward, isOver, """ return obs, action, reward, isOver,
note that some frames in state may be generated from last episode, note that some frames in obs may be generated from last episode,
they should be removed from state they should be removed from obs
""" """
state = np.zeros( obs = np.zeros(
(self.context_len + 1, ) + self.state_shape, dtype=np.uint8) (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8)
state_idx = np.arange(idx, obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size
idx + self.context_len + 1) % self._curr_size
# confirm that no frame was generated from last episode # confirm that no frame was generated from last episode
has_last_episode = False has_last_episode = False
for k in range(self.context_len - 2, -1, -1): for k in range(self.context_len - 2, -1, -1):
to_check_idx = state_idx[k] to_check_idx = obs_idx[k]
if self.isOver[to_check_idx]: if self.isOver[to_check_idx]:
has_last_episode = True has_last_episode = True
state_idx = state_idx[k + 1:] obs_idx = obs_idx[k + 1:]
state[k + 1:] = self.state[state_idx] obs[k + 1:] = self.obs[obs_idx]
break break
if not has_last_episode: if not has_last_episode:
state = self.state[state_idx] obs = self.obs[obs_idx]
real_idx = (idx + self.context_len - 1) % self._curr_size real_idx = (idx + self.context_len - 1) % self._curr_size
action = self.action[real_idx] action = self.action[real_idx]
reward = self.reward[real_idx] reward = self.reward[real_idx]
isOver = self.isOver[real_idx] isOver = self.isOver[real_idx]
return state, reward, action, isOver return obs, reward, action, isOver
def __len__(self): def __len__(self):
return self._curr_size return self._curr_size
...@@ -92,7 +91,7 @@ class ReplayMemory(object): ...@@ -92,7 +91,7 @@ class ReplayMemory(object):
return self._curr_size return self._curr_size
def _assign(self, pos, exp): def _assign(self, pos, exp):
self.state[pos] = exp.state self.obs[pos] = exp.obs
self.reward[pos] = exp.reward self.reward[pos] = exp.reward
self.action[pos] = exp.action self.action[pos] = exp.action
self.isOver[pos] = exp.isOver self.isOver[pos] = exp.isOver
...@@ -107,8 +106,8 @@ class ReplayMemory(object): ...@@ -107,8 +106,8 @@ class ReplayMemory(object):
return self._process_batch(batch_exp) return self._process_batch(batch_exp)
def _process_batch(self, batch_exp): def _process_batch(self, batch_exp):
state = np.asarray([e[0] for e in batch_exp], dtype='uint8') obs = np.asarray([e[0] for e in batch_exp], dtype='uint8')
reward = np.asarray([e[1] for e in batch_exp], dtype='float32') reward = np.asarray([e[1] for e in batch_exp], dtype='float32')
action = np.asarray([e[2] for e in batch_exp], dtype='int8') action = np.asarray([e[2] for e in batch_exp], dtype='int8')
isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool')
return [state, action, reward, isOver] return [obs, action, reward, isOver]
...@@ -22,11 +22,11 @@ import parl ...@@ -22,11 +22,11 @@ import parl
import numpy as np import numpy as np
from tqdm import tqdm from tqdm import tqdm
from parl.utils import tensorboard, logger from parl.utils import summary, logger
from parl.algorithms import DQN, DDQN from parl.algorithms import DQN, DDQN
from agent import AtariAgent from agent import AtariAgent
from atari_wrapper import FireResetEnv, FrameStack, LimitLength, MapState from atari_wrapper import FireResetEnv, FrameStack, LimitLength
from model import AtariModel from model import AtariModel
from replay_memory import ReplayMemory, Experience from replay_memory import ReplayMemory, Experience
from utils import get_player from utils import get_player
...@@ -43,57 +43,57 @@ GAMMA = 0.99 ...@@ -43,57 +43,57 @@ GAMMA = 0.99
def run_train_episode(env, agent, rpm): def run_train_episode(env, agent, rpm):
total_reward = 0 total_reward = 0
all_cost = [] all_cost = []
state = env.reset() obs = env.reset()
steps = 0 steps = 0
while True: while True:
steps += 1 steps += 1
context = rpm.recent_state() context = rpm.recent_obs()
context.append(state) context.append(obs)
context = np.stack(context, axis=0) context = np.stack(context, axis=0)
action = agent.sample(context) action = agent.sample(context)
next_state, reward, isOver, _ = env.step(action) next_obs, reward, isOver, _ = env.step(action)
rpm.append(Experience(state, action, reward, isOver)) rpm.append(Experience(obs, action, reward, isOver))
if rpm.size() > MEMORY_WARMUP_SIZE: if rpm.size() > MEMORY_WARMUP_SIZE:
if steps % UPDATE_FREQ == 0: if steps % UPDATE_FREQ == 0:
batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
args.batch_size) args.batch_size)
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
batch_next_state = batch_all_state[:, 1:, :, :] batch_next_obs = batch_all_obs[:, 1:, :, :]
cost = agent.learn(batch_state, batch_action, batch_reward, cost = agent.learn(batch_obs, batch_action, batch_reward,
batch_next_state, batch_isOver) batch_next_obs, batch_isOver)
all_cost.append(cost) all_cost.append(cost)
total_reward += reward total_reward += reward
state = next_state obs = next_obs
if isOver: if isOver:
mean_loss = np.mean(all_cost) if all_cost else None mean_loss = np.mean(all_cost) if all_cost else None
return total_reward, steps, mean_loss return total_reward, steps, mean_loss
def run_evaluate_episode(env, agent): def run_evaluate_episode(env, agent):
state = env.reset() obs = env.reset()
total_reward = 0 total_reward = 0
while True: while True:
pred_Q = agent.predict(state) pred_Q = agent.predict(obs)
action = pred_Q.max(1)[1].item() action = pred_Q.max(1)[1].item()
state, reward, isOver, _ = env.step(action) obs, reward, isOver, _ = env.step(action)
total_reward += reward total_reward += reward
if isOver: if isOver:
return total_reward return total_reward
def get_fixed_states(rpm, batch_size): def get_fixed_obs(rpm, batch_size):
states = [] obs = []
for _ in range(3): for _ in range(3):
batch_all_state = rpm.sample_batch(batch_size)[0] batch_all_obs = rpm.sample_batch(batch_size)[0]
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
states.append(batch_state) obs.append(batch_obs)
fixed_states = np.concatenate(states, axis=0) fixed_obs = np.concatenate(obs, axis=0)
return fixed_states return fixed_obs
def evaluate_fixed_Q(agent, states): def evaluate_fixed_Q(agent, obs):
with torch.no_grad(): with torch.no_grad():
max_pred_Q = agent.alg.model(states).max(1)[0].mean() max_pred_Q = agent.alg.model(obs).max(1)[0].mean()
return max_pred_Q.item() return max_pred_Q.item()
...@@ -131,9 +131,9 @@ def main(): ...@@ -131,9 +131,9 @@ def main():
total_reward, steps, _ = run_train_episode(env, agent, rpm) total_reward, steps, _ = run_train_episode(env, agent, rpm)
pbar.update(steps) pbar.update(steps)
# Get fixed states to check value function. # Get fixed obs to check value function.
fixed_states = get_fixed_states(rpm, args.batch_size) fixed_obs = get_fixed_obs(rpm, args.batch_size)
fixed_states = torch.tensor(fixed_states, dtype=torch.float, device=device) fixed_obs = torch.tensor(fixed_obs, dtype=torch.float, device=device)
# train # train
test_flag = 0 test_flag = 0
...@@ -152,18 +152,17 @@ def main(): ...@@ -152,18 +152,17 @@ def main():
for _ in range(3): for _ in range(3):
eval_rewards.append(run_evaluate_episode(test_env, agent)) eval_rewards.append(run_evaluate_episode(test_env, agent))
tensorboard.add_scalar('dqn/eval', np.mean(eval_rewards), summary.add_scalar('dqn/eval', np.mean(eval_rewards),
total_steps) total_steps)
tensorboard.add_scalar('dqn/score', total_reward, total_steps) summary.add_scalar('dqn/score', total_reward, total_steps)
tensorboard.add_scalar('dqn/loss', loss, total_steps) summary.add_scalar('dqn/loss', loss, total_steps)
tensorboard.add_scalar('dqn/exploration', agent.exploration, summary.add_scalar('dqn/exploration', agent.exploration,
total_steps) total_steps)
tensorboard.add_scalar('dqn/Q value', summary.add_scalar('dqn/Q value',
evaluate_fixed_Q(agent, fixed_states), evaluate_fixed_Q(agent, fixed_obs),
total_steps) total_steps)
tensorboard.add_scalar('dqn/grad_norm', summary.add_scalar('dqn/grad_norm',
get_grad_norm(agent.alg.model), get_grad_norm(agent.alg.model), total_steps)
total_steps)
if __name__ == '__main__': if __name__ == '__main__':
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import torch
def get_args():
    """Parse the command-line options of the PPO training script.

    Returns:
        argparse.Namespace: the parsed options, plus an extra ``cuda``
        attribute set to ``torch.cuda.is_available()``.
    """
    parser = argparse.ArgumentParser(description='RL')
    parser.add_argument(
        '--lr', type=float, default=3e-4, help='learning rate (default: 3e-4)')
    parser.add_argument(
        '--eps',
        type=float,
        default=1e-5,
        help='RMSprop optimizer epsilon (default: 1e-5)')
    parser.add_argument(
        '--gamma',
        type=float,
        default=0.99,
        help='discount factor for rewards (default: 0.99)')
    parser.add_argument(
        '--gae-lambda',
        type=float,
        default=0.95,
        help='gae lambda parameter (default: 0.95)')
    parser.add_argument(
        '--entropy-coef',
        type=float,
        default=0.,
        help='entropy term coefficient (default: 0.)')
    parser.add_argument(
        '--value-loss-coef',
        type=float,
        default=0.5,
        help='value loss coefficient (default: 0.5)')
    parser.add_argument(
        '--max-grad-norm',
        type=float,
        default=0.5,
        help='max norm of gradients (default: 0.5)')
    parser.add_argument(
        '--seed', type=int, default=1, help='random seed (default: 1)')
    parser.add_argument(
        '--num-steps',
        type=int,
        default=2048,
        help='number of maximum forward steps in ppo (default: 2048)')
    parser.add_argument(
        '--ppo-epoch',
        type=int,
        default=10,
        help='number of ppo epochs (default: 10)')
    parser.add_argument(
        '--num-mini-batch',
        type=int,
        default=32,
        help='number of batches for ppo (default: 32)')
    parser.add_argument(
        '--clip-param',
        type=float,
        default=0.2,
        help='ppo clip parameter (default: 0.2)')
    parser.add_argument(
        '--log-interval',
        type=int,
        default=1,
        help='log interval, one log per n updates (default: 1)')
    parser.add_argument(
        '--eval-interval',
        type=int,
        default=10,
        help='eval interval, one eval per n updates (default: 10)')
    # argparse applies `type` only to string values, so a float literal used
    # as the default would leak through as a float; convert it explicitly.
    parser.add_argument(
        '--num-env-steps',
        type=int,
        default=int(10e5),
        help='number of environment steps to train (default: 10e5)')
    parser.add_argument(
        '--env-name',
        default='Hopper-v2',
        help='environment to train on (default: Hopper-v2)')
    parser.add_argument(
        '--use-linear-lr-decay',
        action='store_true',
        default=False,
        help='use a linear schedule on the learning rate')
    args = parser.parse_args()

    # Not a command-line flag: derived from the hardware available at runtime.
    args.cuda = torch.cuda.is_available()

    return args
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import torch
import utils
from wrapper import make_env
def evaluate(agent, ob_rms, env_name, seed, device):
    """Run the agent until 10 evaluation episodes finish and report the mean.

    Args:
        agent: object whose ``predict(obs)`` returns the action to execute.
        ob_rms: observation statistics copied onto the evaluation
            environment's normalizer, when it has one.
        env_name: name passed to ``make_env``.
        seed: base random seed or None; when given, ``seed + 1`` is used for
            the evaluation environment.
        device: unused; kept so existing callers keep working.

    Returns:
        Mean episode reward over the collected episodes.
    """
    if seed is not None:
        seed += 1
    eval_envs = make_env(env_name, seed, None)

    vec_norm = utils.get_vec_normalize(eval_envs)
    if vec_norm is not None:
        vec_norm.eval()
        vec_norm.ob_rms = ob_rms

    eval_episode_rewards = []
    try:
        obs = eval_envs.reset()
        while len(eval_episode_rewards) < 10:
            with torch.no_grad():
                action = agent.predict(obs)

            # Observe the next obs; the per-step reward is not needed because
            # the episode total is reported through info['episode'].
            obs, _, _, infos = eval_envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    eval_episode_rewards.append(info['episode']['r'])
    finally:
        # Release the environment even if a step or prediction raises.
        eval_envs.close()

    mean_reward = np.mean(eval_episode_rewards)
    print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
        len(eval_episode_rewards), mean_reward))
    return mean_reward
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import torch
class MujocoAgent(parl.Agent):
    """Agent that converts numpy data to tensors on ``device`` and delegates
    prediction, sampling, learning and value estimation to ``algorithm``.
    """

    def __init__(self, algorithm, device):
        """
        Args:
            algorithm: object providing ``predict``, ``sample``, ``learn``
                and ``value``.
            device: torch device every input tensor is moved to.
        """
        self.alg = algorithm
        self.device = device

    def predict(self, obs):
        """Return ``alg.predict`` for a numpy observation, as a numpy array."""
        obs = torch.from_numpy(obs).float().to(self.device)
        action = self.alg.predict(obs)
        return action.cpu().numpy()

    def sample(self, obs):
        """Return ``(value, action, action_log_probs)`` from ``alg.sample``
        for a numpy observation, each converted to a numpy array.
        """
        obs = torch.from_numpy(obs).to(self.device)
        value, action, action_log_probs = self.alg.sample(obs)
        return value.cpu().numpy(), action.cpu().numpy(), \
            action_log_probs.cpu().numpy()

    def learn(self, next_value, gamma, gae_lambda, ppo_epoch, num_mini_batch,
              rollouts):
        """Run ``ppo_epoch`` passes over the mini-batches of ``rollouts``.

        Args:
            next_value, gamma, gae_lambda, num_mini_batch: forwarded
                unchanged to ``rollouts.sample_batch``.
            ppo_epoch (int): number of passes over the rollout data.
            rollouts: storage whose ``sample_batch`` yields 6-tuples of
                numpy arrays (obs, actions, value predictions, returns, old
                action log-probs, advantages).

        Returns:
            tuple: value loss, action loss and distribution entropy, each
            summed over all updates and divided by
            ``ppo_epoch * num_mini_batch``.
        """
        value_loss_epoch = 0
        action_loss_epoch = 0
        dist_entropy_epoch = 0

        for e in range(ppo_epoch):
            data_generator = rollouts.sample_batch(next_value, gamma,
                                                   gae_lambda, num_mini_batch)

            for sample in data_generator:
                # Move the whole mini-batch to the agent's own device rather
                # than a hard-coded 'cuda', so CPU-only training works too.
                (obs_batch, actions_batch, value_preds_batch, return_batch,
                 old_action_log_probs_batch, adv_targ) = [
                     torch.from_numpy(array).to(self.device)
                     for array in sample
                 ]

                value_loss, action_loss, dist_entropy = self.alg.learn(
                    obs_batch, actions_batch, value_preds_batch, return_batch,
                    old_action_log_probs_batch, adv_targ)

                value_loss_epoch += value_loss
                action_loss_epoch += action_loss
                dist_entropy_epoch += dist_entropy

        num_updates = ppo_epoch * num_mini_batch

        value_loss_epoch /= num_updates
        action_loss_epoch /= num_updates
        dist_entropy_epoch /= num_updates

        return value_loss_epoch, action_loss_epoch, dist_entropy_epoch

    def value(self, obs):
        """Return ``alg.value`` for a numpy observation, as a numpy array."""
        obs = torch.from_numpy(obs).to(self.device)
        return self.alg.value(obs).cpu().numpy()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
class MujocoModel(parl.Model):
    """Container pairing an ``Actor`` and a ``Critic`` built for the same
    observation size."""

    def __init__(self, obs_dim, act_dim):
        super(MujocoModel, self).__init__()
        # Creation order (actor first, then critic) is kept on purpose: it
        # determines the order in which parameters are registered.
        self.actor = Actor(obs_dim, act_dim)
        self.critic = Critic(obs_dim)

    def policy(self, obs):
        """Return the actor's output for ``obs``."""
        actor_out = self.actor(obs)
        return actor_out

    def value(self, obs):
        """Return the critic's output for ``obs``."""
        critic_out = self.critic(obs)
        return critic_out
class Actor(parl.Model):
    """Two tanh hidden layers of width 64 followed by a linear layer that
    produces the action mean, plus a learnable ``log_std`` vector."""

    def __init__(self, obs_dim, act_dim):
        super(Actor, self).__init__()

        self.fc1 = nn.Linear(obs_dim, 64)
        self.fc2 = nn.Linear(64, 64)

        self.fc_mean = nn.Linear(64, act_dim)
        # One log-std entry per action dimension, initialised to zero; it
        # does not depend on the observation.
        self.log_std = nn.Parameter(torch.zeros(act_dim))

    def forward(self, obs):
        """Return ``(mean, log_std)`` for ``obs``."""
        hidden = torch.tanh(self.fc1(obs))
        hidden = torch.tanh(self.fc2(hidden))
        return self.fc_mean(hidden), self.log_std
class Critic(parl.Model):
    """Two tanh hidden layers of width 64 followed by a linear layer with a
    single output."""

    def __init__(self, obs_dim):
        super(Critic, self).__init__()

        self.fc1 = nn.Linear(obs_dim, 64)
        self.fc2 = nn.Linear(64, 64)
        self.fc3 = nn.Linear(64, 1)

    def forward(self, obs):
        """Return the single-output value estimate for ``obs``."""
        features = torch.tanh(self.fc2(torch.tanh(self.fc1(obs))))
        return self.fc3(features)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
class RolloutStorage(object):
    """Fixed-size numpy buffers holding one rollout of ``num_steps``
    transitions.

    ``obs``, ``value_preds``, ``returns``, ``masks`` and ``bad_masks`` have
    ``num_steps + 1`` rows; ``actions``, ``action_log_probs`` and ``rewards``
    have ``num_steps`` rows.
    """

    def __init__(self, num_steps, obs_dim, act_dim):
        self.num_steps = num_steps
        self.obs_dim = obs_dim
        self.act_dim = act_dim

        self.obs = np.zeros((num_steps + 1, obs_dim), dtype='float32')
        self.actions = np.zeros((num_steps, act_dim), dtype='float32')
        self.value_preds = np.zeros((num_steps + 1, ), dtype='float32')
        self.returns = np.zeros((num_steps + 1, ), dtype='float32')
        self.action_log_probs = np.zeros((num_steps, ), dtype='float32')
        self.rewards = np.zeros((num_steps, ), dtype='float32')

        self.masks = np.ones((num_steps + 1, ), dtype='bool')
        self.bad_masks = np.ones((num_steps + 1, ), dtype='bool')

        # Index of the next transition to be written.
        self.step = 0

    def append(self, obs, actions, action_log_probs, value_preds, rewards,
               masks, bad_masks):
        """Store one transition at the current step and advance the cursor.

        ``obs``, ``masks`` and ``bad_masks`` are written at ``step + 1``; the
        remaining values are written at ``step``. The cursor wraps around
        after ``num_steps`` calls.
        """
        self.obs[self.step + 1] = obs
        self.actions[self.step] = actions
        self.rewards[self.step] = rewards
        self.action_log_probs[self.step] = action_log_probs
        self.value_preds[self.step] = value_preds
        self.masks[self.step + 1] = masks
        self.bad_masks[self.step + 1] = bad_masks

        self.step = (self.step + 1) % self.num_steps

    def sample_batch(self,
                     next_value,
                     gamma,
                     gae_lambda,
                     num_mini_batch,
                     mini_batch_size=None):
        """Yield shuffled mini-batches of the stored rollout.

        Returns and normalized advantages are computed first (when the
        generator is first advanced).

        Args:
            next_value: value estimate for the observation after the last
                stored step.
            gamma: discount factor.
            gae_lambda: lambda of the advantage recursion.
            num_mini_batch (int): number of mini-batches; only used to derive
                the batch size when ``mini_batch_size`` is None.
            mini_batch_size (int, optional): explicit mini-batch size.
                Defaults to ``num_steps // num_mini_batch``.

        Yields:
            tuple of numpy arrays: observations, actions, value predictions,
            returns, old action log-probs and advantages. All but the first
            two are reshaped to ``(batch, 1)``.
        """
        # calculate return and advantage first
        self.compute_returns(next_value, gamma, gae_lambda)
        advantages = self.returns[:-1] - self.value_preds[:-1]
        advantages = (advantages - advantages.mean()) / (
            advantages.std() + 1e-5)

        # generate sample batch
        if mini_batch_size is None:
            mini_batch_size = self.num_steps // num_mini_batch
        # drop_last=True: every yielded batch has exactly mini_batch_size rows.
        sampler = BatchSampler(
            SubsetRandomSampler(range(self.num_steps)),
            mini_batch_size,
            drop_last=True)
        for indices in sampler:
            obs_batch = self.obs[:-1][indices]
            actions_batch = self.actions[indices]
            value_preds_batch = self.value_preds[:-1][indices]
            returns_batch = self.returns[:-1][indices]
            old_action_log_probs_batch = self.action_log_probs[indices]

            value_preds_batch = value_preds_batch.reshape(-1, 1)
            returns_batch = returns_batch.reshape(-1, 1)
            old_action_log_probs_batch = old_action_log_probs_batch.reshape(
                -1, 1)

            adv_targ = advantages[indices]
            adv_targ = adv_targ.reshape(-1, 1)

            yield (obs_batch, actions_batch, value_preds_batch, returns_batch,
                   old_action_log_probs_batch, adv_targ)

    def after_update(self):
        """Copy the last row of obs/masks/bad_masks into row 0 so the next
        rollout starts from where this one ended."""
        self.obs[0] = np.copy(self.obs[-1])
        self.masks[0] = np.copy(self.masks[-1])
        self.bad_masks[0] = np.copy(self.bad_masks[-1])

    def compute_returns(self, next_value, gamma, gae_lambda):
        """Fill ``returns[:-1]`` by a backward advantage recursion.

        ``masks[step + 1]`` cuts both the bootstrap value and the running
        advantage; ``bad_masks[step + 1]`` zeroes the advantage of that step,
        so the return falls back to the stored value prediction.
        """
        self.value_preds[-1] = next_value
        gae = 0
        for step in reversed(range(self.rewards.size)):
            delta = self.rewards[step] + gamma * self.value_preds[
                step + 1] * self.masks[step + 1] - self.value_preds[step]
            gae = delta + gamma * gae_lambda * self.masks[step + 1] * gae
            gae = gae * self.bad_masks[step + 1]
            self.returns[step] = gae + self.value_preds[step]
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# modified from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail
import copy
import os
from collections import deque
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import utils
from arguments import get_args
from wrapper import make_env
from mujoco_model import MujocoModel
from parl.algorithms import PPO
from mujoco_agent import MujocoAgent
from storage import RolloutStorage
from evaluation import evaluate
def main():
    """Train a PPO agent on the configured environment.

    Each update collects ``args.num_steps`` transitions, runs the PPO
    optimisation on them, then optionally logs training statistics and runs
    an evaluation.
    """
    args = get_args()

    torch.manual_seed(args.seed)
    torch.cuda.manual_seed_all(args.seed)

    torch.set_num_threads(1)
    device = torch.device("cuda:0" if args.cuda else "cpu")

    envs = make_env(args.env_name, args.seed, args.gamma)

    model = MujocoModel(envs.observation_space.shape[0],
                        envs.action_space.shape[0])
    model.to(device)

    algorithm = PPO(
        model,
        args.clip_param,
        args.value_loss_coef,
        args.entropy_coef,
        initial_lr=args.lr,
        eps=args.eps,
        max_grad_norm=args.max_grad_norm)

    agent = MujocoAgent(algorithm, device)

    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
                              envs.action_space.shape[0])

    obs = envs.reset()
    rollouts.obs[0] = np.copy(obs)

    episode_rewards = deque(maxlen=10)

    num_updates = int(args.num_env_steps) // args.num_steps
    for j in range(num_updates):

        if args.use_linear_lr_decay:
            # decrease learning rate linearly
            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
                                         args.lr)

        for step in range(args.num_steps):
            # Sample actions. The observation is read back from the storage:
            # row 0 holds the initial observation and every append() writes
            # the next observation at row step + 1.
            with torch.no_grad():
                value, action, action_log_prob = agent.sample(
                    rollouts.obs[step])

            # Observe reward and next obs
            obs, reward, done, infos = envs.step(action)

            for info in infos:
                if 'episode' in info.keys():
                    episode_rewards.append(info['episode']['r'])

            # If done then clean the history of observations.
            masks = torch.FloatTensor(
                [[0.0] if done_ else [1.0] for done_ in done])
            bad_masks = torch.FloatTensor(
                [[0.0] if 'bad_transition' in info.keys() else [1.0]
                 for info in infos])
            rollouts.append(obs, action, action_log_prob, value, reward, masks,
                            bad_masks)

        with torch.no_grad():
            next_value = agent.value(rollouts.obs[-1])

        value_loss, action_loss, dist_entropy = agent.learn(
            next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
            args.num_mini_batch, rollouts)

        rollouts.after_update()

        if j % args.log_interval == 0 and len(episode_rewards) > 1:
            total_num_steps = (j + 1) * args.num_steps
            # The three losses get their own placeholders so they are
            # actually shown in the log line.
            print(
                "Updates {}, num timesteps {},\n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n Dist entropy {:.4f}, value loss {:.4f}, action loss {:.4f}\n"
                .format(j, total_num_steps, len(episode_rewards),
                        np.mean(episode_rewards), np.median(episode_rewards),
                        np.min(episode_rewards), np.max(episode_rewards),
                        dist_entropy, value_loss, action_loss))

        if (args.eval_interval is not None and len(episode_rewards) > 1
                and j % args.eval_interval == 0):
            # Evaluate with the observation statistics gathered in training.
            ob_rms = utils.get_vec_normalize(envs).ob_rms
            evaluate(agent, ob_rms, args.env_name, args.seed, device)
if __name__ == "__main__":
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
import torch
import torch.nn as nn
from wrapper import VecNormalize
def get_vec_normalize(venv):
    """Follow the chain of ``.venv`` attributes starting at ``venv`` and
    return the first ``VecNormalize`` found, or None if the chain ends
    without one."""
    current = venv
    while True:
        if isinstance(current, VecNormalize):
            return current
        if not hasattr(current, 'venv'):
            return None
        current = current.venv
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
    """Decreases the learning rate linearly.

    The rate goes from ``initial_lr`` at epoch 0 towards 0 at
    ``total_num_epochs`` and is written into every param group of
    ``optimizer``.
    """
    progress = epoch / float(total_num_epochs)
    decayed_lr = initial_lr - (initial_lr * progress)
    for group in optimizer.param_groups:
        group['lr'] = decayed_lr
def init(module, weight_init, bias_init, gain=1):
    """Initialise ``module`` in place and return it.

    ``weight_init`` is called as ``weight_init(module.weight.data, gain=gain)``
    and ``bias_init`` as ``bias_init(module.bias.data)``.
    """
    weights = module.weight.data
    weight_init(weights, gain=gain)
    biases = module.bias.data
    bias_init(biases)
    return module
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Simplified version of https://github.com/ShangtongZhang/DeepRL/blob/master/deep_rl/component/envs.py
import numpy as np
import gym
from gym.core import Wrapper
import time
class TimeLimitMask(gym.Wrapper):
    """Flags episodes that end exactly when the wrapped env's elapsed step
    count equals its maximum episode length."""

    def step(self, action):
        obs, rew, done, info = self.env.step(action)
        if done:
            # The counters are only inspected on terminal steps.
            inner = self.env
            if inner._max_episode_steps == inner._elapsed_steps:
                info['bad_transition'] = True
        return obs, rew, done, info

    def reset(self, **kwargs):
        """Forward the reset to the wrapped environment."""
        first_obs = self.env.reset(**kwargs)
        return first_obs
class MonitorEnv(gym.Wrapper):
    """Records per-episode statistics and restarts the env automatically.

    When an episode finishes, ``info['episode']`` receives the total reward
    (``r``), the length (``l``) and the time since this wrapper was created
    (``t``); the environment is then reset and ``step`` returns the first
    observation of the new episode.
    """

    def __init__(self, env):
        Wrapper.__init__(self, env=env)
        self.tstart = time.time()
        # Rewards of the running episode; created by reset().
        self.rewards = None

    def step(self, action):
        ob, rew, done, info = self.env.step(action)
        reset_ob = self.update(ob, rew, done, info)
        if done:
            # The env has already been restarted inside update(); hand the
            # caller the observation that matches its new state rather than
            # the stale terminal observation.
            ob = reset_ob
        return (ob, rew, done, info)

    def update(self, ob, rew, done, info):
        """Record ``rew``; on ``done`` publish the episode summary in
        ``info`` and reset.

        Returns:
            The first observation of the new episode when ``done`` is true,
            otherwise None.
        """
        self.rewards.append(rew)
        if not done:
            return None
        eprew = sum(self.rewards)
        eplen = len(self.rewards)
        epinfo = {
            "r": round(eprew, 6),
            "l": eplen,
            "t": round(time.time() - self.tstart, 6)
        }
        assert isinstance(info, dict)
        info['episode'] = epinfo
        return self.reset()

    def reset(self, **kwargs):
        """Clear the reward history and reset the wrapped environment."""
        self.rewards = []
        return self.env.reset(**kwargs)
class VectorEnv(gym.Wrapper):
    """Presents the results of ``step`` of a single environment in batched
    form with a batch size of one."""

    def step(self, action):
        ob, rew, done, info = self.env.step(action)
        # Add a leading axis of length 1 to the observation and wrap the
        # scalars and the info dict in one-element containers.
        # NOTE(review): reset() is not overridden in this class, so its
        # observation is returned without the extra axis -- confirm intended.
        batched_ob = np.array(ob)[np.newaxis, :]
        return (batched_ob, np.array([rew]), np.array([done]), [info])
class RunningMeanStd(object):
    """Running mean, variance and sample count, updated batch by batch."""

    # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
    def __init__(self, epsilon=1e-4, shape=()):
        self.mean = np.zeros(shape, dtype='float64')
        self.var = np.ones(shape, dtype='float64')
        # The count starts at `epsilon` rather than 0.
        self.count = epsilon

    def update(self, x):
        """Merge the batch ``x`` (samples along axis 0) into the statistics."""
        self.update_from_moments(
            np.mean(x, axis=0), np.var(x, axis=0), x.shape[0])

    def update_from_moments(self, batch_mean, batch_var, batch_count):
        """Merge precomputed batch moments into the statistics."""
        merged = update_mean_var_count_from_moments(
            self.mean, self.var, self.count, batch_mean, batch_var,
            batch_count)
        self.mean, self.var, self.count = merged
def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var,
                                       batch_count):
    """Combine running moments with a batch's moments.

    Args:
        mean, var, count: current running mean, variance and sample count.
        batch_mean, batch_var, batch_count: moments and size of the new batch.

    Returns:
        Tuple ``(new_mean, new_var, new_count)`` for the combined samples.
    """
    total = count + batch_count
    shift = batch_mean - mean
    merged_mean = mean + shift * batch_count / total
    # Sum of squared deviations of both groups plus the between-group term.
    sum_sq = (var * count + batch_var * batch_count +
              np.square(shift) * count * batch_count / total)
    return merged_mean, sum_sq / total, total
class VecNormalize(gym.Wrapper):
    """Normalize and clip observations and rewards with running statistics.

    Args:
        env: wrapped environment; its ``observation_space.shape[0]`` sizes the
            observation statistics.
        ob (bool): track and apply observation normalization.
        ret (bool): track discounted-return statistics and scale rewards.
        clipob (float): normalized observations are clipped to [-clipob, clipob].
        cliprew (float): scaled rewards are clipped to [-cliprew, cliprew].
        gamma (float): discount used for the running return.
        epsilon (float): added to the variance before taking the square root.
    """

    def __init__(self,
                 env,
                 ob=True,
                 ret=True,
                 clipob=10.,
                 cliprew=10.,
                 gamma=0.99,
                 epsilon=1e-8):
        Wrapper.__init__(self, env=env)
        observation_space = env.observation_space.shape[0]
        self.ob_rms = RunningMeanStd(shape=observation_space) if ob else None
        self.ret_rms = RunningMeanStd(shape=()) if ret else None
        self.clipob = clipob
        self.cliprew = cliprew
        self.gamma = gamma
        self.epsilon = epsilon
        # Running discounted return (one entry, matching the single env).
        self.ret = np.zeros(1)
        self.training = True

    def step(self, action):
        """Step the env and return the normalized observation and reward."""
        ob, rew, new, info = self.env.step(action)
        self.ret = self.ret * self.gamma + rew
        # normalize observation
        ob = self._obfilt(ob)
        # normalize reward
        if self.ret_rms:
            self.ret_rms.update(self.ret)
            rew = np.clip(rew / np.sqrt(self.ret_rms.var + self.epsilon),
                          -self.cliprew, self.cliprew)
        # Restart the running return wherever the episode just ended.
        self.ret[new] = 0.
        return ob, rew, new, info

    def reset(self):
        """Reset the running return and the env; return the filtered observation."""
        self.ret = np.zeros(1)
        ob = self.env.reset()
        return self._obfilt(ob)

    def _obfilt(self, ob, update=True):
        """Normalize and clip `ob`; update the statistics only in training mode."""
        if self.ob_rms:
            if self.training and update:
                self.ob_rms.update(ob)
            ob = np.clip((ob - self.ob_rms.mean) /
                         np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob,
                         self.clipob)
            return ob
        else:
            return ob

    def train(self):
        """Enable updates of the observation statistics."""
        self.training = True

    def eval(self):
        """Freeze the observation statistics (evaluation mode)."""
        # Fixed typo: this used to set `self.trainint`, so `self.training`
        # stayed True and evaluation kept updating the statistics.
        self.training = False
def make_env(env_name, seed, gamma):
    """Create a seeded gym env wrapped for monitoring and normalization.

    The env is wrapped, in order, by TimeLimitMask, MonitorEnv, VectorEnv and
    VecNormalize. When `gamma` is None, VecNormalize is built with
    ``ret=False``; otherwise `gamma` is passed as its discount.
    """
    base_env = gym.make(env_name)
    base_env.seed(seed)
    wrapped = VectorEnv(MonitorEnv(TimeLimitMask(base_env)))
    if gamma is None:
        return VecNormalize(wrapped, ret=False)
    return VecNormalize(wrapped, gamma=gamma)
...@@ -15,7 +15,7 @@ ...@@ -15,7 +15,7 @@
import gym import gym
import argparse import argparse
import numpy as np import numpy as np
from parl.utils import logger, tensorboard, ReplayMemory from parl.utils import logger, summary, ReplayMemory
from mujoco_model import MujocoModel from mujoco_model import MujocoModel
from mujoco_agent import MujocoAgent from mujoco_agent import MujocoAgent
...@@ -103,8 +103,7 @@ def main(): ...@@ -103,8 +103,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm) train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
tensorboard.add_scalar('train/episode_reward', train_reward, summary.add_scalar('train/episode_reward', train_reward, total_steps)
total_steps)
if total_steps // args.test_every_steps >= test_flag: if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag:
...@@ -112,8 +111,8 @@ def main(): ...@@ -112,8 +111,8 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent) evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format( logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward)) total_steps, evaluate_reward))
tensorboard.add_scalar('eval/episode_reward', evaluate_reward, summary.add_scalar('eval/episode_reward', evaluate_reward,
total_steps) total_steps)
if __name__ == '__main__': if __name__ == '__main__':
......
minimal example
---------------------
``本教程的目标:
演示如何通过EvoKit库来解决经典的CartPole 问题。``
*本教程假定读者曾经使用过PaddlePaddle, 了解基本的进化算法迭代流程。*
CartPole 介绍
#############
CartPole又叫倒立摆。小车上放了一根杆,杆会因重力而倒下。为了不让杆倒下,我们要通过移动小车,来保持其是直立的。如下图所示。
在每一个时间步,模型的输入是一个4维的向量,表示当前小车和杆的状态,模型输出的信号用于控制小车往左或者右移动。当杆没有倒下的时候,每个时间步,环境会给1分的奖励;当杆倒下后,环境不会给任何的奖励,游戏结束。
.. image:: ../../examples/QuickStart/performance.gif
:width: 300px
step1: 生成预测网络
########################
根据上面的环境介绍,我们需要构造一个神经网络,输入为4维的向量,输出为2维的概率分布向量(表示左/右)移动的概率。
在这里,我们使用Paddle来实现预测网络,并保存到本地。
.. code-block:: python
from paddle import fluid
def net(obs, act_dim):
hid1 = fluid.layers.fc(obs, size=20)
prob = fluid.layers.fc(hid1, size=act_dim, act='softmax')
return prob
if __name__ == '__main__':
obs_dim = 4
act_dim = 2
obs = fluid.layers.data(name="obs", shape=[obs_dim], dtype='float32')
prob = net(obs, act_dim)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
fluid.io.save_inference_model(
dirname='init_model',
feeded_var_names=['obs'],
target_vars=[prob],
params_filename='params',
model_filename='model',
executor=exe)
step2: 构造ESAgent
###################
- 调用 ``load_config`` 加载配置文件。
- 调用 ``load_inference_model`` 函数加载模型参数。
- 调用 ``init_solver`` 初始化solver。
配置文件主要是用于指定进化算法类型(比如Gaussian或者CMA),使用的optimizer类型(Adam或者SGD)。
.. code-block:: c++
ESAgent agent = ESAgent();
agent.load_config(config);
agent.load_inference_model(model_dir);
agent.init_solver();
// 附:EvoKit配置项示范
solver {
type: BASIC_ES
optimizer { // 线下Adam更新
type: ADAM
base_lr: 0.05
adam {
beta1: 0.9
beta2: 0.999
epsilon: 1e-08
}
}
sampling { // 线上高斯采样
type: GAUSSIAN_SAMPLING
gaussian_sampling {
std: 0.5
cached: true
seed: 1024
cache_size : 100000
}
}
}
step3: 生成用于采样的Agent
##############################
主要关注三个接口:
- 调用 ``clone`` 生成一个用于sampling的agent。
- 调用 ``add_noise`` 给这个agent的参数空间增加噪声,同时返回该噪声对应的唯一信息,这个信息得记录在log中,用于线下更新。
- 调用 ``predict`` 提供预测接口。
.. code-block:: c++
auto sampling_agent = agent.clone();
auto sampling_info = sampling_agent.add_noise();
sampling_agent.predict(feature);
step4: 用采样的数据更新模型参数
##################################
用户提供两组数据:
- 采样参数过程中用于线下复现采样噪声的sampling_info
- 扰动参数后,新参数的评估结果
.. code-block:: c++
agent.update(sampling_infos, rewards);
主代码以及注释
#################
以下的代码演示通过多线程同时采样, 提升解决问题的效率。
.. code-block:: c++
int main(int argc, char* argv[]) {
std::vector<CartPole> envs;
// 构造10个环境,用于多线程训练
for (int i = 0; i < ITER; ++i) {
envs.push_back(CartPole());
}
// 初始化ESAgent
std::string model_dir = "./demo/cartpole/init_model";
std::string config_path = "./demo/cartpole/config.prototxt";
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(config_path); // 加载配置
agent->load_inference_model(model_dir); // 加载初始预测模型
agent->init_solver(); // 初始化solver,注意要在load_inference_model后执行
// 生成10个agent用于同时采样
std::vector<std::shared_ptr<ESAgent>> sampling_agents;
for (int i = 0; i < ITER; ++i) {
sampling_agents.push_back(agent->clone());
}
std::vector<SamplingInfo> sampling_infos;
std::vector<float> rewards(ITER, 0.0f);
sampling_infos.resize(ITER);
omp_set_num_threads(10);
// 共迭代100轮
for (int epoch = 0; epoch < 100; ++epoch) {
#pragma omp parallel for schedule(dynamic, 1)
for (int i = 0; i < ITER; ++i) {
std::shared_ptr<ESAgent> sampling_agent = sampling_agents[i];
SamplingInfo sampling_info;
sampling_agent->add_noise(sampling_info);
float reward = evaluate(envs[i], sampling_agent);
// 保存采样的sampling_info以及对应的评估结果reward
sampling_infos[i] = sampling_info;
rewards[i] = reward;
}
// 更新模型参数,注意:参数更新后会自动同步到sampling_agent中
agent->update(sampling_infos, rewards);
int reward = evaluate(envs[0], agent);
LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; // 打印每一轮reward
}
}
如何运行demo
#################
- 下载代码
在icode上clone代码,我们的仓库路径是: ``baidu/nlp/deep-es`` ``TO DO: 修改库路径``
- 编译demo
通过bcloud的云端集群编译即可,命令为: ``bb``
- 运行demo
编译完成后,我们需要增加动态库查找路径:
``export LD_LIBRARY_PATH=./output/so/:$LD_LIBRARY_PATH``
运行demo: ``./output/bin/cartpole/train``
问题解决
####################
在使用过程中有任何问题,请加hi群: 1692822 (PARL官方答疑群)进行咨询,开发同学会直接回答任何的使用问题。
Example for Online Products
###########################
``本教程的目标: 演示通过EvoKit库上线后,如何迭代算法,更新模型参数。``
在产品线中,线上无法实时拿到用户日志,经常是通过保存用户点击/时长日志,在线下根据用户数据更新模型,然后再推送到线上,完成算法的更新。
本教程继续围绕经典的CartPole环境,展示如何通过在线采样/离线更新的方式,来更新迭代ES算法。
demo的完整代码示例放在demo/online_example文件夹中。
``TO DO: 文件夹``
初始化solver
---------------------
构造solver,对它初始化,并保存到文件。初始化solver仅需在开始时调用一次。
.. code-block:: c++
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(FLAGS_config_path);
agent->load_inference_model(FLAGS_model_dir);
agent->init_solver();
agent->save_solver(FLAGS_model_dir);
线上采样
---------------------
加载模型和solver,记录线上采样返回的sampling_info以及评估的reward,并通过二进制的方式记录到log文件中。
.. code-block:: c++
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(FLAGS_config_path);
agent->load_inference_model(FLAGS_model_dir);
agent->load_solver(FLAGS_model_dir);
#pragma omp parallel for schedule(dynamic, 1)
for (int i = 0; i < ITER; ++i) {
std::shared_ptr<ESAgent> sampling_agent = sampling_agents[i];
SamplingInfo sampling_info;
sampling_agent->add_noise(sampling_info);
float reward = evaluate(envs[i], sampling_agent);
sampling_infos[i] = sampling_info;
rewards[i] = reward;
}
// save sampling information and log in binary format
std::ofstream log_stream(FLAGS_log_path, std::ios::binary);
for (int i = 0; i < ITER; ++i) {
std::string data;
sampling_infos[i].SerializeToString(&data);
int size = data.size();
log_stream.write((char*) &rewards[i], sizeof(float));
log_stream.write((char*) &size, sizeof(int));
log_stream.write(data.c_str(), size);
}
log_stream.close();
线下更新
-----------------------
在加载好之前记录的log之后,调用 ``update`` 函数进行更新,然后通过 ``save_inference_model`` 和 ``save_solver`` 函数保存更新后的参数到本地,推送到线上。
.. code-block:: c++
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(FLAGS_config_path);
agent->load_inference_model(FLAGS_model_dir);
agent->load_solver(FLAGS_model_dir);
// load training data
std::vector<SamplingInfo> sampling_infos;
std::vector<float> rewards(ITER, 0.0f);
sampling_infos.resize(ITER);
std::ifstream log_stream(FLAGS_log_path);
CHECK(log_stream.good()) << "[EvoKit] cannot open log: " << FLAGS_log_path;
char buffer[1000];
for (int i = 0; i < ITER; ++i) {
int size;
log_stream.read((char*) &rewards[i], sizeof(float));
log_stream.read((char*) &size, sizeof(int));
log_stream.read(buffer, size);
buffer[size] = 0;
std::string data(buffer);
sampling_infos[i].ParseFromString(data);
}
// update model and save parameter
agent->update(sampling_infos, rewards);
agent->save_inference_model(FLAGS_updated_model_dir);
agent->save_solver(FLAGS_updated_model_dir);
主代码
-----------------------
将以上代码分别编译成可执行文件。
- 初始化solver: ``init_solver`` 。
- 线上采样: ``online_sampling`` 。
- 线下更新: ``offline_update`` 。
.. code-block:: shell
#------------------------init solver------------------------
./init_solver \
--model_dir="./model_warehouse/model_dir_0" \
--config_path="config.prototxt"
for ((epoch=0;epoch<200;++epoch));do
#------------------------online sampling------------------------
./online_sampling \
--log_path="./sampling_log" \
--model_dir="./model_warehouse/model_dir_$epoch" \
--config_path="./config.prototxt"
#------------------------offline update------------------------
next_epoch=$((epoch+1))
./offline_update \
--log_path='./sampling_log' \
--model_dir="./model_warehouse/model_dir_$epoch" \
--updated_model_dir="./model_warehouse/model_dir_${next_epoch}" \
--config_path="./config.prototxt"
done
Overview
------------------
``EvoKit`` 是一个集合了多种进化算法、兼容多种类预测框架的进化算法库,主打 **快速上线验证** 。
.. image:: ../../evo_kit/DeepES.gif
:align: center
:width: 400px
特性
#########
**1. 多种进化算法支持。** 支持高斯采样、CMA、GA等算法,更多算法持续接入中。
**2. 主流优化器支持。** 支持SGD/Momentum/Adam等多个主流优化器,有效提升算法收敛效率。
**3. 一站式上线。** 整合了线上采样和线下更新流程, 提供Bcloud/Cmake等编译方式, 助力快速上线。
**4. 深度学习框架全系列兼容。** 裸写的网络,paddle/lego/Torch等深度学习框架,EvoKit都支持。
**5. 同步/异步更新方式。** 支持多个采样模型/多份采样数据异步更新,完美契合业务场景。
...@@ -101,3 +101,37 @@ def setup(app): ...@@ -101,3 +101,37 @@ def setup(app):
add_module_names = False add_module_names = False
# LaTeX (PDF) output settings for the Sphinx build.
latex_engine = 'xelatex'
latex_use_xindy = False
# NOTE(review): this dict is replaced by the second `latex_elements`
# assignment below, so the ctex preamble set here is not used — confirm which
# of the two preambles is intended.
latex_elements = {
'preamble': '\\usepackage[UTF8]{ctex}\n',
}
# Effective LaTeX settings: the preamble loads CJKutf8, declares a handful of
# Unicode characters, and opens a CJK environment that is closed at the end of
# the document.
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
'preamble':
r'''
\hypersetup{unicode=true}
\usepackage{CJKutf8}
\DeclareUnicodeCharacter{00A0}{\nobreakspace}
\DeclareUnicodeCharacter{2203}{\ensuremath{\exists}}
\DeclareUnicodeCharacter{2200}{\ensuremath{\forall}}
\DeclareUnicodeCharacter{2286}{\ensuremath{\subseteq}}
\DeclareUnicodeCharacter{2713}{x}
\DeclareUnicodeCharacter{27FA}{\ensuremath{\Longleftrightarrow}}
\DeclareUnicodeCharacter{221A}{\ensuremath{\sqrt{}}}
\DeclareUnicodeCharacter{221B}{\ensuremath{\sqrt[3]{}}}
\DeclareUnicodeCharacter{2295}{\ensuremath{\oplus}}
\DeclareUnicodeCharacter{2297}{\ensuremath{\otimes}}
\begin{CJK}{UTF8}{gbsn}
\AtEndDocument{\end{CJK}}
''',
}
...@@ -46,7 +46,7 @@ Abstractions ...@@ -46,7 +46,7 @@ Abstractions
:maxdepth: 1 :maxdepth: 1
:caption: Installation :caption: Installation
installation.rst installation.rst
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
...@@ -58,9 +58,10 @@ Abstractions ...@@ -58,9 +58,10 @@ Abstractions
:maxdepth: 1 :maxdepth: 1
:caption: Tutorial :caption: Tutorial
getting_started.rst tutorial/getting_started.rst
new_alg.rst tutorial/new_alg.rst
save_param.rst tutorial/save_param.rst
tutorial/tensorboard.rst
.. toctree:: .. toctree::
:maxdepth: 2 :maxdepth: 2
...@@ -83,3 +84,11 @@ Abstractions ...@@ -83,3 +84,11 @@ Abstractions
model.rst model.rst
algorithm.rst algorithm.rst
agent.rst agent.rst
.. toctree::
:maxdepth: 2
:caption: EvoKit
EvoKit/overview.rst
EvoKit/minimal_example.rst
EvoKit/online_example.rst
...@@ -178,9 +178,9 @@ Then we use this agent to interact with the environment, and run around 1000 epi ...@@ -178,9 +178,9 @@ Then we use this agent to interact with the environment, and run around 1000 epi
Summary Summary
----------- -----------
.. image:: ../examples/QuickStart/performance.gif .. image:: ../../examples/QuickStart/performance.gif
:width: 300px :width: 300px
.. image:: ./images/quickstart.png .. image:: ../images/quickstart.png
:width: 300px :width: 300px
In this tutorial, we have shown how to build an agent step-by-step to solve the `Cartpole` problem. In this tutorial, we have shown how to build an agent step-by-step to solve the `Cartpole` problem.
......
...@@ -59,7 +59,6 @@ Within class ``DQN(Algorithm)``, we define the following methods: ...@@ -59,7 +59,6 @@ Within class ``DQN(Algorithm)``, we define the following methods:
Args: Args:
model (parl.Model): model defining forward network of Q function model (parl.Model): model defining forward network of Q function
hyperparas (dict): (deprecated) dict of hyper parameters.
act_dim (int): dimension of the action space act_dim (int): dimension of the action space
gamma (float): discounted factor for reward computation. gamma (float): discounted factor for reward computation.
lr (float): learning rate. lr (float): learning rate.
......
...@@ -22,5 +22,5 @@ Here is a demonstration of usage: ...@@ -22,5 +22,5 @@ Here is a demonstration of usage:
agent.restore('./model.ckpt') agent.restore('./model.ckpt')
# restore the parameters from ./model.ckpt to another_agent # restore the parameters from ./model.ckpt to another_agent
another_agent = AtariAgent() another_agent = AtariAgent()
another_agent.restore('./model.ckpt') another_agent.restore('./model.ckpt')
summary
===============
Visualize the results with tensorboard.
add_scalar
-------------
Commonly used arguments:
* summary.add_scalar(tag, scalar_value, global_step=None)
* tag *(string)* – Data identifier
* scalar_value *(float or string/blobname)* – Value to save
* global_step *(int)* – Global step value to record
Example:
.. code-block:: python
from parl.utils import summary
x = range(100)
for i in x:
summary.add_scalar('y=2x', i * 2, i)
Expected result:
.. image:: add_scalar.jpg
:scale: 50 %
add_histogram
----------------
Commonly used arguments:
* summary.add_histogram(tag, values, global_step=None)
* tag *(string)* – Data identifier
* values *(torch.Tensor, numpy.array, or string/blobname)* – Values to build histogram
* global_step *(int)* – Global step value to record
Example:
.. code-block:: python
from parl.utils import summary
import numpy as np
for i in range(10):
x = np.random.random(1000)
summary.add_histogram('distribution centers', x + i, i)
Expected result:
.. image:: add_histogram.jpg
:scale: 50 %
# Top-level build script for the EvoKit static library.
cmake_minimum_required (VERSION 2.6)
project (EvoKit)

########## options ##########
# Exactly one inference framework must be selected at configure time.
option(WITH_PADDLE "Compile EvoKit with PaddleLite framework." OFF)
option(WITH_TORCH "Compile EvoKit with Torch framework." OFF)
message("WITH_PADDLE: "${WITH_PADDLE})
message("WITH_TORCH: "${WITH_TORCH})

# Fail the configure step on an invalid selection. A plain message() followed
# by return() ended processing without an error status, leaving a build tree
# with no targets.
if (NOT (WITH_PADDLE OR WITH_TORCH))
  message(FATAL_ERROR "ERROR: You should choose at least one framework to compile EvoKit.")
elseif(WITH_PADDLE AND WITH_TORCH)
  message(FATAL_ERROR "ERROR: You cannot choose more than one framework to compile EvoKit.")
endif()
# Require C++11 and disable compiler-specific language extensions.
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
# OpenMP is optional: when found, its flags are appended to the C/C++ compile
# flags and to the executable linker flags.
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
# Framework-independent sources and include paths.
file(GLOB src "core/src/*.cc" "core/proto/evo_kit/*.cc")
include_directories("core/include")
include_directories("core/proto")
include_directories("benchmark")
########## PaddleLite config ##########
# Expects the PaddleLite package in ./inference_lite_lib under the source dir.
if (WITH_PADDLE)
add_definitions(-g -O3 -pthread)
include_directories("paddle/include")
include_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/include"
"${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/include")
link_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/lib"
"${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/lib")
file(GLOB framework_src "paddle/src/*.cc")
set(TARGET EvoKit_paddle)
########## Torch config ##########
elseif (WITH_TORCH)
# list(APPEND CMAKE_PREFIX_PATH "./libtorch")
# find_package(Torch REQUIRED ON) # TODO: not necessary for now
include_directories("torch/include")
file(GLOB framework_src "torch/src/*.cc")
set(TARGET EvoKit_torch)
else ()
# NOTE(review): only reached when neither framework option is set; TARGET is
# not defined on this path, so the add_library() call below would have no name.
message("ERROR: You should choose at least one framework to compile EvoKit.")
endif()
# Build one static library from the core sources plus the framework sources.
add_library(${TARGET} STATIC ${src} ${framework_src})
target_link_libraries(${TARGET} gflags protobuf pthread glog)
# ########## PaddleLite libraries ##########
# if (WITH_PADDLE)
# target_link_libraries(${TARGET} -lpaddle_full_api_shared)
# target_link_libraries(${TARGET} -lmklml_intel)
# target_link_libraries(${TARGET} -ldl)
# ########## Torch libraries ##########
# elseif (WITH_TORCH)
# target_link_libraries(${TARGET} "${TORCH_LIBRARIES}")
# endif()
# Header and helper file lists used by the install rules below.
file(GLOB include "core/include/evo_kit/*.h")
file(GLOB proto_include "core/proto/evo_kit/*.h")
file(GLOB torch_include "torch/include/evo_kit/*.h")
file(GLOB paddle_include "paddle/include/evo_kit/*.h")
file(GLOB benchmark_include "benchmark/*.h")
file(GLOB findcmake "cmake/Torch/*.cmake")
# The install prefix is set to <source dir>/libevokit, overriding any value
# given on the command line.
set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}/libevokit")
install(TARGETS ${TARGET} ARCHIVE DESTINATION "lib")
install(FILES ${include} ${proto_include} DESTINATION "include/evo_kit")
# Torch and Paddle headers are both installed, whichever framework was built.
install(FILES ${torch_include} DESTINATION "torch/evo_kit")
install(FILES ${paddle_include} DESTINATION "paddle/evo_kit")
install(FILES ${benchmark_include} DESTINATION "include")
install(FILES ${findcmake} DESTINATION "cmake/Torch")
# EvoKit
EvoKit 是一个集合了多种进化算法、兼容多种类预测框架的进化算法库,主打快速上线验证 。
<p align="center">
<img src="DeepES.gif" alt="PARL" width="500"/>
</p>
## 使用示范
```c++
//实例化一个预测,根据配置文件加载模型,采样方式(Gaussian\CMA sampling..)、更新方式(SGD\Adam)等
auto agent = ESAgent(config);
for (int i = 0; i < 10; ++i) {
auto sampling_agent = agent->clone(); // clone出一个sampling agent
SamplingInfo info;
sampling_agent->add_noise(info); // 参数扰动,同时保存随机种子到info中
int reward = evaluate(env, sampling_agent); //评估参数
noisy_info.push_back(info); // 记录随机噪声对应种子
noisy_rewards.push_back(reward); // 记录评估结果
}
//根据评估结果、随机种子更新参数,然后重复以上过程,直到收敛。
agent->update(noisy_info, noisy_rewards);
```
## 一键运行demo列表
- **PaddleLite**: sh ./scripts/build.sh paddle
- **Torch**: sh ./scripts/build.sh torch
- **裸写网络**
## 相关依赖:
- Protobuf2
- OpenMP
- [glog](https://github.com/google/glog)
- [gflags](https://github.com/gflags/gflags/blob/master/INSTALL.md)
## 额外依赖:
### 使用PaddleLite
下载PaddleLite的X86预编译库,或者编译PaddleLite源码,得到inference_lite_lib文件夹,放在当前目录中。(可参考:[PaddleLite使用X86预测部署](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html))
### 使用torch
下载[libtorch](https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip)或者编译torch源码,得到libtorch文件夹,放在当前目录中。
// Third party code
// This code is copied or modified from openai/gym's cartpole.py
#include <cassert>
#include <cmath>
#include <iostream>
#include <random>
#include <vector>
const double kPi = 3.1415926535898;

/* CartPole environment (adapted from openai/gym's cartpole.py).
 *
 * The state is {x, x_dot, theta, theta_dot}. step() advances the state by
 * one update of `tau` seconds and refreshes `reward` and `done`.
 */
class CartPole {
public:
    double gravity = 9.8;
    double masscart = 1.0;
    double masspole = 0.1;
    double total_mass = (masspole + masscart);
    double length = 0.5; // actually half the pole's length;
    double polemass_length = (masspole * length);
    double force_mag = 10.0;
    double tau = 0.02; // seconds between state updates;

    // Angle at which to fail the episode (12 degrees, expressed in radians).
    double theta_threshold_radians = 12 * 2 * kPi / 360;
    double x_threshold = 2.4;
    // -1 while the episode is running; set to 0 by the step that ends it.
    int steps_beyond_done = -1;

    std::vector<float> state = {0, 0, 0, 0};
    // Initialised so that getReward()/isDone() are well defined before the
    // first step(); previously both members were left uninitialised.
    double reward = 0.0;
    bool done = false;
    int step_ = 0;

    // Pointer to the 4 state values; refers to storage owned by this object.
    const float* getState() {
        return state.data();
    }

    // Reward produced by the most recent step() (0 right after reset()).
    double getReward() {
        return reward;
    }

    // Non-zero once the episode has ended. The return type is kept as
    // double for compatibility with existing callers.
    double isDone() {
        return done;
    }

    // Start a new episode: each state component is drawn uniformly from
    // [-0.05, 0.05) and the step counter, reward and done flag are cleared.
    void reset() {
        std::random_device rd;
        std::default_random_engine generator(rd());
        std::uniform_real_distribution<float> distribution(-0.05, 0.05);

        for (int i = 0; i < 4; ++i) {
            state[i] = distribution(generator);
        }

        steps_beyond_done = -1;
        step_ = 0;
        // Clear the previous episode's outcome so a stale `done == true`
        // is not reported after a reset.
        reward = 0.0;
        done = false;
    }

    CartPole() {
        reset();
    }

    // Apply one action: 1 pushes with +force_mag, any other value with
    // -force_mag. Calling step() again after the episode has ended trips
    // the assert below.
    void step(int action) {
        float x = state[0];
        float x_dot = state[1];
        float theta = state[2];
        float theta_dot = state[3];

        auto force = (action == 1) ? force_mag : -force_mag;
        auto costheta = std::cos(theta);
        auto sintheta = std::sin(theta);
        auto temp = (force + polemass_length * theta_dot * theta_dot * sintheta) /
                    total_mass;
        auto thetaacc = (gravity * sintheta - costheta * temp) /
                        (length * (4.0 / 3.0 - masspole * costheta * costheta / total_mass));
        auto xacc = temp - polemass_length * thetaacc * costheta / total_mass;

        x = x + tau * x_dot;
        x_dot = x_dot + tau * xacc;
        theta = theta + tau * theta_dot;
        theta_dot = theta_dot + tau * thetaacc;
        state = {x, x_dot, theta, theta_dot};

        // The episode ends when the cart or the pole leaves its allowed
        // range, or when the step counter exceeds 200.
        done = x < -x_threshold || x > x_threshold ||
               theta < -theta_threshold_radians || theta > theta_threshold_radians ||
               step_ > 200;

        if (!done) {
            reward = 1.0;
        } else if (steps_beyond_done == -1) {
            // Pole just fell!
            steps_beyond_done = 0;
            reward = 0;
        } else {
            if (steps_beyond_done == 0) {
                assert(false); // Can't do this
            }
        }

        step_++;
    }
};
# FindEvoKit
# ----------
#
# Locates the EvoKit library (Torch build).
#
# Variables defined by this module:
#
#   EVOKIT_FOUND         -- True if the system has the EvoKit library
#   EVOKIT_INCLUDE_DIRS  -- The include directories for EvoKit
#   EVOKIT_LIBRARY       -- Libraries to link against
#
# The install prefix is taken from the EVOKIT_INSTALL_PREFIX environment
# variable when it is defined; otherwise it is derived from this file's
# location.
include(FindPackageHandleStandardArgs)

if (DEFINED ENV{EVOKIT_INSTALL_PREFIX})
  set(EVOKIT_INSTALL_PREFIX $ENV{EVOKIT_INSTALL_PREFIX})
else()
  # Assume we are in <install-prefix>/cmake/Torch/EvoKitConfig.cmake
  get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
  get_filename_component(EVOKIT_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE)
endif()

# Include directories: the same two paths are used whether or not
# <prefix>/include exists.
set(EVOKIT_INCLUDE_DIRS
  ${EVOKIT_INSTALL_PREFIX}/include
  ${EVOKIT_INSTALL_PREFIX}/torch)

find_library(EVOKIT_LIBRARY libEvoKit_torch.a PATHS "${EVOKIT_INSTALL_PREFIX}/lib")

include_directories("${EVOKIT_INSTALL_PREFIX}/torch")
include_directories("${EVOKIT_INSTALL_PREFIX}/include")

find_package_handle_standard_args(EvoKit DEFAULT_MSG EVOKIT_LIBRARY EVOKIT_INCLUDE_DIRS)

message(STATUS "EVOKIT_FOUND: ${EVOKIT_FOUND}")
message(STATUS "EVOKIT_INCLUDE_DIRS: ${EVOKIT_INCLUDE_DIRS}")
message(STATUS "EVOKIT_LIBRARY: ${EVOKIT_LIBRARY}")
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_ADAM_OPTIMIZER_H
#define EVO_KIT_ADAM_OPTIMIZER_H
#include <cmath>
#include <unordered_map>
#include "evo_kit/optimizer.h"
namespace evo_kit {
/*@brief AdamOptimizer.
* Implements Adam algorithm.
*
*@Args:
* base_lr: learning rate (required; this constructor gives it no default).
* beta1: coefficients used for computing running averages of gradient (default: 0.9).
* beta2: coefficients used for computing running averages of gradient's square (default: 0.999).
* epsilon: term added to the denominator to improve numerical stability (default: 1e-8).
*/
class AdamOptimizer: public Optimizer {
public:
// Stores the hyper-parameters and forwards base_lr to Optimizer.
AdamOptimizer(float base_lr, float beta1 = 0.9, float beta2 = 0.999,
float epsilon = 1e-8): Optimizer(base_lr), \
_beta1(beta1), _beta2(beta2), _epsilon(epsilon) {}
// Defined out of line.
~AdamOptimizer();
protected:
// Implements Optimizer::compute_step for Adam; defined out of line.
void compute_step(float* gradient, int size, std::string param_name);
private:
float _beta1;
float _beta2;
float _epsilon;
// Per-parameter buffers keyed by parameter name.
// NOTE(review): presumably Adam's first/second moment estimates, and the raw
// pointers are presumably released in the destructor — confirm in the .cc file.
std::unordered_map<std::string, float*> _momentum;
std::unordered_map<std::string, float*> _velocity;
};
}//namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H
#define EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H
#include <glog/logging.h>
#include <random>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "sampling_method.h"
#include "utils.h"
namespace evo_kit {
/* Gaussian sampling method that keeps a noise cache (`_noise_cache`).
 * NOTE(review): how keys map into the cache is defined in the .cc file —
 * confirm there before relying on it.
 */
class CachedGaussianSampling: public SamplingMethod {
public:
CachedGaussianSampling();
~CachedGaussianSampling();
/*Initialize the sampling algorithm given the config with the protobuf format.
*EvoKit library uses only one configuration file for all sampling algorithms.
A default configuration file can be found at: . // TODO: where?
Usually you won't have to modify the configuration items of other algorithms
if you are not using them.
*/
bool load_config(const EvoKitConfig& config);
/*@brief generate Gaussian noise and the related key.
*
*@Args:
* key: a unique key associated with the sampled noise.
* noise: a pointer to the memory that stores the noise
* size: the number of floats to be sampled.
*
*@return:
* success: generate Gaussian successfully or not.
*/
bool sampling(int* key, float* noise, int64_t size);
/*@brief reconstruct the Gaussian noise given the key.
* This function is often used for updating the neural network parameters in the offline environment.
*
*@Args:
* key: a unique key associated with the sampled noise.
* noise: a pointer to the memory that stores the noise
* size: the number of floats to be sampled.
*
*@return:
* success: reconstruct Gaussian successfully or not.
*/
bool resampling(int key, float* noise, int64_t size);
private:
// NOTE(review): presumably the standard deviation and cache length read from
// the config in load_config — confirm in the .cc file.
float _std;
int _cache_size;
// Noise cache storage; nullptr until it is created.
float* _noise_cache = nullptr;
// Builds the noise cache; returns whether it succeeded (defined in the .cc file).
bool _create_noise_cache();
};
}
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef EVO_KIT_GAUSSIAN_SAMPLING_H
#define EVO_KIT_GAUSSIAN_SAMPLING_H
#include <random>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "evo_kit/sampling_method.h"
#include "evo_kit/utils.h"
namespace evo_kit {
/* Gaussian sampling method without a noise cache. */
class GaussianSampling: public SamplingMethod {
public:
GaussianSampling() {}
~GaussianSampling() {}
/*Initialize the sampling algorithm given the config with the protobuf format.
*EvoKit library uses only one configuration file for all sampling algorithms.
A default configuration file can be found at: . // TODO: where?
Usually you won't have to modify the configuration items of other algorithms
if you are not using them.
*/
bool load_config(const EvoKitConfig& config);
/*@brief generate Gaussian noise and the related key.
*
*@Args:
* key: a unique key associated with the sampled noise.
* noise: a pointer to the memory that stores the noise
* size: the number of floats to be sampled.
*
*@return:
* success: generate Gaussian successfully or not.
*/
bool sampling(int* key, float* noise, int64_t size);
/*@brief reconstruct the Gaussian noise given the key.
* This function is often used for updating the neural network parameters in the offline environment.
*
*@Args:
* key: a unique key associated with the sampled noise.
* noise: a pointer to the memory that stores the noise
* size: the number of floats to be sampled.
*
*@return:
* success: reconstruct Gaussian successfully or not.
*/
bool resampling(int key, float* noise, int64_t size);
private:
// NOTE(review): presumably the standard deviation read from the config in
// load_config — confirm in the .cc file.
float _std;
};
}
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_OPTIMIZER_H
#define EVO_KIT_OPTIMIZER_H
#include <glog/logging.h>
#include <unordered_map>
namespace evo_kit {

/*@brief Optimizer. Base class for optimizers.
 *
 *@Args:
 *    base_lr: learning rate (default: 1e-3).
 *
 * .. warning: update() works at the parameter level,
 *             so it has to be called once for each parameter.
 *
 * Subclasses are required to implement the following functions:
 * 1. compute_step
 */
class Optimizer {
public:
    Optimizer() : _base_lr(1e-3), _update_times(0) {}
    Optimizer(float base_lr) : _base_lr(base_lr), _update_times(0) {}
    virtual ~Optimizer() {
        _params_size.clear();
    }

    /*@brief Performs a single optimization step (parameter update) at the parameter level.
     *
     *@Args:
     *    weights (array): parameter weights, updated in place.
     *    gradient (array): gradient for updating weights; passed to compute_step
     *                      before it is applied.
     *    size: size of gradient.
     *    param_name: the name corresponding to the weights.
     *
     *@return:
     *    false (after logging a warning) when `size` differs from the size first
     *    recorded for `param_name`; true otherwise.
     */
    template<typename T>
    bool update(T weights, float* gradient, int size, std::string param_name = "") {
        auto recorded = _params_size.find(param_name);

        if (recorded == _params_size.end()) {
            // First update for this parameter: remember its size.
            _params_size[param_name] = size;
        } else if (recorded->second != size) {
            LOG(WARNING) << "[Warning] Update times: " << int(_update_times / _params_size.size()) \
                         << ". Size of weights[" << param_name << "] is " << recorded->second << ", not " << size;
            return false;
        }

        ++_update_times;
        compute_step(gradient, size, param_name);

        for (int idx = 0; idx < size; ++idx) {
            weights[idx] -= _base_lr * gradient[idx];
        }

        return true;
    } // template function

protected:
    // Called by update() with the gradient buffer before the weights are changed.
    virtual void compute_step(float* graident, int size, std::string param_name = "") = 0;
    float _base_lr;
    // Number of successful update() calls so far (kept as a float).
    float _update_times;
    // Size first seen for each parameter name.
    std::unordered_map<std::string, int> _params_size;
};

}//namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_OPTIMIZER_FACTORY_H
#define EVO_KIT_OPTIMIZER_FACTORY_H
#include <algorithm>
#include <glog/logging.h>
#include <memory>
#include "evo_kit/adam_optimizer.h"
#include "evo_kit/evo_kit.pb.h"
#include "evo_kit/optimizer.h"
#include "evo_kit/sgd_optimizer.h"
namespace evo_kit {
/* @brief: create an optimizer according to the configuration.
 * @args:
 *    optimizer_config: configuration for the optimizer.
 * @return:
 *    an SGDOptimizer or AdamOptimizer depending on optimizer_config.type()
 *    (compared case-insensitively); an empty pointer for any other type.
 */
std::shared_ptr<Optimizer> create_optimizer(const OptimizerConfig& optimizer_config);
} // namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_SAMPLING_FACTORY_H
#define EVO_KIT_SAMPLING_FACTORY_H
#include <algorithm>
#include <glog/logging.h>
#include <memory>
#include "evo_kit/cached_gaussian_sampling.h"
#include "evo_kit/evo_kit.pb.h"
#include "evo_kit/gaussian_sampling.h"
#include "evo_kit/sampling_method.h"
namespace evo_kit {
/* @brief: create a sampling method according to the configuration.
 * @args:
 *    config: configuration for the EvoKit.
 * @return:
 *    a CachedGaussianSampling when config.gaussian_sampling().cached() is set,
 *    a GaussianSampling otherwise; nullptr when load_config() of the new object fails.
 */
std::shared_ptr<SamplingMethod> create_sampling_method(const EvoKitConfig& config);
} // namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_SAMPLING_METHOD_H
#define EVO_KIT_SAMPLING_METHOD_H
#include <string>
#include <random>
#include "evo_kit/evo_kit.pb.h"
namespace evo_kit {
/*Base class for sampling algorithms. All algorithms are required to override the following functions:
 *
 * 1. load_config
 * 2. sampling
 * 3. resampling
 *
 * View a demonstrative algorithm in gaussian_sampling.h
 * */
class SamplingMethod {
public:
    SamplingMethod(): _seed(0) {}

    virtual ~SamplingMethod() {}

    /*Initialize the sampling algorithm given the config with the protobuf format.
     *EvoKit library uses only one configuration file for all sampling algorithms.
     *A default configuration file can be found at: . // TODO: where?
     *Usually you won't have to modify the configuration items of other algorithms
     *if you are not using them.
     */
    virtual bool load_config(const EvoKitConfig& config) = 0;

    /*@brief generate Gaussian noise and the related key.
     *
     *@Args:
     *     key: a unique key associated with the sampled noise.
     *     noise: a pointer to the memory that stores the noise.
     *     size: the number of floats to be sampled.
     *
     *@return:
     *     success: generate Gaussian noise successfully or not.
     */
    virtual bool sampling(int* key, float* noise, int64_t size) = 0;

    /*@brief reconstruct the Gaussian noise given the key.
     * This function is often used for updating the neural network parameters in the offline environment.
     *
     *@Args:
     *     key: a unique key associated with the sampled noise.
     *     noise: a pointer to the memory that stores the noise.
     *     size: the number of floats to be sampled.
     *
     *@return:
     *     success: reconstruct Gaussian noise successfully or not.
     */
    virtual bool resampling(int key, float* noise, int64_t size) = 0;

    /*@brief remember the seed and reseed the C library generator with it (srand).
     *
     *@return:
     *     always true.
     */
    bool set_seed(int seed) {
        _seed = seed;
        srand(_seed);
        return true;
    }

    /*@brief return the seed stored by the last set_seed() call (0 if never set).*/
    int get_seed() const {
        return _seed;
    }

protected:
    int _seed;
};
}
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_SGD_OPTIMIZER_H
#define EVO_KIT_SGD_OPTIMIZER_H
#include <cmath>
#include <unordered_map>
#include "evo_kit/optimizer.h"
namespace evo_kit {
/*@brief SGDOptimizer.
* Implements stochastic gradient descent (optionally with momentum).
*
*@Args:
* base_lr: learning rate (default: 1e-3).
* momentum: momentum factor (default: 0.9).
*/
class SGDOptimizer: public Optimizer {
public:
SGDOptimizer(float base_lr, float momentum = 0.9): Optimizer(base_lr), _momentum(momentum) {}
~SGDOptimizer();
protected:
void compute_step(float* gradient, int size, std::string param_name);
private:
float _momentum;
std::unordered_map<std::string, float*> _velocity;
};
} // namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_UTILS_H
#define EVO_KIT_UTILS_H
#include <algorithm>
#include <fstream>
#include <glog/logging.h>
#include <google/protobuf/text_format.h>
#include <string>
#include "evo_kit/evo_kit.pb.h"
namespace evo_kit {
/* Return ranks that are normalized to [-0.5, 0.5] with the rewards as input.
 * Args:
 *    reward: an array of rewards. NOTE(review): taken by non-const reference, so the
 *            ranks are presumably written back in place - confirm against the definition.
 * Return:
 *    NOTE(review): presumably whether the computation succeeded - confirm.
 */
bool compute_centered_ranks(std::vector<float>& reward);
/* NOTE(review): presumably returns the content of `filename` as a string; the definition
 * is not visible from this header - confirm. */
std::string read_file(const std::string& filename);
/* Load a protobuf-based configuration from a text-format (prototxt) file.
 * Args:
 *    config_file: file path.
 *    proto_config: protobuf message that receives the parsed configuration.
 * Return:
 *    true when the file could be opened, read and parsed, false otherwise
 *    (the reason is logged).
 */
template<typename T>
bool load_proto_conf(const std::string& config_file, T& proto_config) {
    std::ifstream fin(config_file);

    if (!fin || fin.fail()) {
        LOG(ERROR) << "open prototxt config failed: " << config_file;
        return false;
    }

    // Measure the file. tellg() reports -1 on failure, which must not be used as a size.
    fin.seekg(0, std::ios::end);
    const std::streamoff file_size = fin.tellg();
    fin.seekg(0, std::ios::beg);

    if (file_size < 0 || !fin) {
        LOG(ERROR) << "Failed to load config: " << config_file;
        return false;
    }

    // Read straight into the string: no manual buffer to free on any path.
    std::string proto_str(static_cast<size_t>(file_size), '\0');

    if (file_size > 0) {
        fin.read(&proto_str[0], file_size);
        // Keep only the bytes actually delivered (text-mode translation can shorten the read).
        proto_str.resize(static_cast<size_t>(fin.gcount()));
    }

    if (!google::protobuf::TextFormat::ParseFromString(proto_str, &proto_config)) {
        LOG(ERROR) << "Failed to load config: " << config_file;
        return false;
    }

    return true;
}
template<typename T>
bool save_proto_conf(const std::string& config_file, T& proto_config) {
bool success = true;
std::ofstream ofs(config_file, std::ofstream::out);
if (!ofs || ofs.fail()) {
LOG(ERROR) << "open prototxt config failed: " << config_file;
success = false;
} else {
std::string config_str;
success = google::protobuf::TextFormat::PrintToString(proto_config, &config_str);
if (!success) {
return success;
}
ofs << config_str;
}
return success;
}
/* NOTE(review): presumably lists the model directories found under `path`; the definition
 * is not visible from this header - confirm. */
std::vector<std::string> list_all_model_dirs(std::string path);
}
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package evo_kit;
// Top-level EvoKit configuration: sampling, optimizer and AsyncES settings in one message.
message EvoKitConfig {
//sampling configuration
// Seed passed to set_seed() by load_config() of the sampling methods.
optional int32 seed = 1 [default = 18];
// NOTE(review): the consumer of buffer_size is not visible here - confirm its meaning.
optional int32 buffer_size = 2 [default = 100000];
// Gaussian sampling settings; its `cached` flag selects the sampling implementation.
optional GaussianSamplingConfig gaussian_sampling = 3;
// Optimizer Configuration
optional OptimizerConfig optimizer = 4;
// AsyncESAgent Configuration
optional AsyncESConfig async_es = 5;
}
// Settings of the Gaussian noise samplers.
message GaussianSamplingConfig {
// Factor every standard-normal sample is multiplied by.
optional float std = 1 [default = 1.0];
// true: create_sampling_method() builds a CachedGaussianSampling; false: a GaussianSampling.
optional bool cached = 2 [default = false];
// Number of floats in the pre-generated noise cache of CachedGaussianSampling.
optional int32 cache_size = 3 [default = 100000];
}
// Settings consumed by create_optimizer().
message OptimizerConfig{
// Optimizer kind; matched case-insensitively against "sgd" and "adam".
optional string type = 1 [default = "SGD"];
optional float base_lr = 2 [default = 1e-3]; // The base learning rate.
optional float momentum = 3 [default = 0.9]; // The momentum value for SGD.
// ------------Adam Optimizer---------
// beta1, beta2 and epsilon are only passed to the Adam optimizer.
optional float beta1 = 4 [default = 0.9];
optional float beta2 = 5 [default = 0.999];
optional float epsilon = 6 [default = 1e-8];
}
// NOTE(review): presumably carries the sampling keys produced for one model iteration;
// the code using this message is not visible here - confirm.
message SamplingInfo{
repeated int32 key = 1;
optional int32 model_iter_id = 2;
}
// AsyncES agent settings.
// NOTE(review): the code using these fields is not visible here; the per-field notes
// below are inferred from the names and defaults - confirm.
message AsyncESConfig{
// Presumably the directory where models are stored.
optional string model_warehouse = 1 [default = "./model_warehouse"];
repeated string model_md5 = 2;
// Presumably the number of stored models to retain.
optional int32 max_to_keep = 3 [default = 5];
optional int32 model_iter_id = 4 [default = 0];
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/adam_optimizer.h"
namespace evo_kit {
// Frees the per-parameter moment buffers allocated by compute_step().
AdamOptimizer::~AdamOptimizer() {
    for (auto& entry : _momentum) {
        delete[] entry.second;
    }

    for (auto& entry : _velocity) {
        delete[] entry.second;
    }

    _momentum.clear();
    _velocity.clear();
}
void AdamOptimizer::compute_step(float* gradient, int size, std::string param_name = "") {
if (_momentum.count(param_name) == 0) {
_momentum[param_name] = new float [size];
memset(_momentum[param_name], 0, size * sizeof(float));
}
if (_velocity.count(param_name) == 0) {
_velocity[param_name] = new float [size];
memset(_velocity[param_name], 0, size * sizeof(float));
}
int true_update_times = int(_update_times / _velocity.size());
float alpha = std::sqrt(1 - std::pow(_beta2, _update_times)) / (1 - std::pow(_beta1,
_update_times));
for (int i = 0; i < size; ++i) {
_momentum[param_name][i] = _beta1 * _momentum[param_name][i] + (1 - _beta1) * gradient[i];
_velocity[param_name][i] = _beta2 * _velocity[param_name][i] + (1 - _beta2) * gradient[i] *
gradient[i];
gradient[i] = alpha * _momentum[param_name][i] / (std::sqrt(_velocity[param_name][i]) + _epsilon);
}
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/cached_gaussian_sampling.h"
namespace evo_kit {
// Starts without a noise cache. The destructor and the nullptr checks in
// sampling()/resampling() depend on _noise_cache being null until load_config() runs.
CachedGaussianSampling::CachedGaussianSampling() : _noise_cache(nullptr) {}
// Releases the noise cache allocated by load_config().
// NOTE(review): relies on _noise_cache being nullptr when load_config() was never
// called - confirm the member is initialized.
CachedGaussianSampling::~CachedGaussianSampling() {
delete[] _noise_cache;
}
bool CachedGaussianSampling::load_config(const EvoKitConfig& config) {
bool success = true;
_std = config.gaussian_sampling().std();
success = set_seed(config.seed());
CHECK(success) << "[EvoKit] Fail to set seed while load config.";
_cache_size = config.gaussian_sampling().cache_size();
_noise_cache = new float [_cache_size];
memset(_noise_cache, 0, _cache_size * sizeof(float));
success = _create_noise_cache();
CHECK(success) << "[EvoKit] Fail to create noise_cache while load config.";
return success;
}
bool CachedGaussianSampling::sampling(int* key, float* noise, int64_t size) {
bool success = true;
if (_noise_cache == nullptr) {
LOG(ERROR) << "[EvoKit] Please use load_config() first.";
success = false;
return success;
}
if (noise == nullptr) {
LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
success = false;
return success;
}
if ((size >= _cache_size) || (size < 0)) {
LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size <<
"), cache_size: " << _cache_size;
success = false;
return success;
}
int rand_key = rand();
std::default_random_engine generator(rand_key);
std::uniform_int_distribution<unsigned int> uniform(0, _cache_size - size);
int index = uniform(generator);
*key = index;
for (int64_t i = 0; i < size; ++i) {
*(noise + i) = *(_noise_cache + index + i);
}
return success;
}
/*@brief Copy the `size` cached floats that start at index `key` into `noise`.
 *
 *@Args:
 *    key: start index inside the cache, as produced by sampling().
 *    noise: destination array with room for `size` floats.
 *    size: number of floats to copy; must lie in [0, cache_size).
 *
 *@return:
 *    false (with an error log) when the cache is not loaded, `noise` is null,
 *    or `size` / `key` are out of bounds; true otherwise.
 */
bool CachedGaussianSampling::resampling(int key, float* noise, int64_t size) {
    if (_noise_cache == nullptr) {
        LOG(ERROR) << "[EvoKit] Please use load_config() first.";
        return false;
    }

    if (noise == nullptr) {
        LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
        return false;
    }

    if ((size < 0) || (size >= _cache_size)) {
        LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size <<
                   "), cache_size: " << _cache_size;
        return false;
    }

    if ((key < 0) || (key > _cache_size - size)) {
        LOG(ERROR) << "[EvoKit] Resampling key " << key << " is out of bounds [0, "
                   << _cache_size - size <<
                   "], cache_size: " << _cache_size << ", size: " << size;
        return false;
    }

    const float* cached = _noise_cache + key;

    for (int64_t offset = 0; offset < size; ++offset) {
        noise[offset] = cached[offset];
    }

    return true;
}
// Fills the whole cache with standard-normal samples scaled by _std, drawn from an
// engine seeded with _seed. Always returns true.
bool CachedGaussianSampling::_create_noise_cache() {
    std::default_random_engine engine(_seed);
    std::normal_distribution<float> gaussian;

    int64_t slot = 0;

    while (slot < _cache_size) {
        _noise_cache[slot] = gaussian(engine) * _std;
        ++slot;
    }

    return true;
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/gaussian_sampling.h"
namespace evo_kit {
// Reads the noise scale from the config and installs the configured seed.
// Returns the result of set_seed().
bool GaussianSampling::load_config(const EvoKitConfig& config) {
    _std = config.gaussian_sampling().std();
    return set_seed(config.seed());
}
bool GaussianSampling::sampling(int* key, float* noise, int64_t size) {
bool success = true;
if (noise == nullptr) {
LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
success = false;
return success;
}
int rand_key = rand();
*key = rand_key;
std::default_random_engine generator(rand_key);
std::normal_distribution<float> norm;
for (int64_t i = 0; i < size; ++i) {
*(noise + i) = norm(generator) * _std;
}
return success;
}
/*@brief Regenerate the noise that belongs to `key`.
 *
 * Seeds a fresh engine with `key` and writes `size` Gaussian samples scaled by _std.
 *
 *@return:
 *    false (with an error log) when `noise` is null, true otherwise.
 */
bool GaussianSampling::resampling(int key, float* noise, int64_t size) {
    if (noise == nullptr) {
        LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
        return false;
    }

    std::default_random_engine engine(key);
    std::normal_distribution<float> gaussian;

    for (int64_t idx = 0; idx < size; ++idx) {
        noise[idx] = gaussian(engine) * _std;
    }

    return true;
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/optimizer_factory.h"
namespace evo_kit {
/*@brief Build the optimizer selected by optimizer_config.type().
 *
 * The type is matched case-insensitively: "sgd" yields an SGDOptimizer built from
 * base_lr and momentum, "adam" an AdamOptimizer built from base_lr, beta1, beta2
 * and epsilon.
 *
 *@return:
 *    the new optimizer, or an empty pointer (after an error log) for any other type.
 */
std::shared_ptr<Optimizer> create_optimizer(const OptimizerConfig& optimizer_config) {
    std::string opt_type = optimizer_config.type();
    // tolower() is only defined for values representable as unsigned char (or EOF),
    // so each character is converted before the call.
    std::transform(opt_type.begin(), opt_type.end(), opt_type.begin(),
                   [](unsigned char c) { return static_cast<char>(::tolower(c)); });

    std::shared_ptr<Optimizer> optimizer;

    if (opt_type == "sgd") {
        optimizer = std::make_shared<SGDOptimizer>(optimizer_config.base_lr(),
                                                   optimizer_config.momentum());
    } else if (opt_type == "adam") {
        optimizer = std::make_shared<AdamOptimizer>(optimizer_config.base_lr(),
                                                    optimizer_config.beta1(),
                                                    optimizer_config.beta2(),
                                                    optimizer_config.epsilon());
    } else {
        LOG(ERROR) << "type of OptimizerConfig must be SGD or Adam."; // NotImplementedError
    }

    return optimizer;
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/sampling_factory.h"
namespace evo_kit {
/*@brief Build and configure the sampling method selected by the config.
 *
 * A CachedGaussianSampling is created when gaussian_sampling.cached is set,
 * a GaussianSampling otherwise; the new object then loads `config`.
 *
 *@return:
 *    the configured sampling method, or nullptr (after an error log) when
 *    load_config() fails.
 */
std::shared_ptr<SamplingMethod> create_sampling_method(const EvoKitConfig& config) {
    std::shared_ptr<SamplingMethod> method;

    if (config.gaussian_sampling().cached()) {
        method = std::make_shared<CachedGaussianSampling>();
    } else {
        method = std::make_shared<GaussianSampling>();
    }

    if (!method->load_config(config)) {
        LOG(ERROR) << "[EvoKit] Fail to create sampling_method";
        return nullptr;
    }

    return method;
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/sgd_optimizer.h"
namespace evo_kit {
// Frees the per-parameter velocity buffers allocated by compute_step().
SGDOptimizer::~SGDOptimizer() {
    for (auto& entry : _velocity) {
        delete[] entry.second;
    }

    _velocity.clear();
}
/*@brief Turn `gradient` into the momentum-smoothed step, in place.
 *
 * Keeps one velocity buffer per `param_name` (created zero-filled on first use),
 * updates it as velocity = momentum * velocity + (1 - momentum) * gradient and
 * overwrites `gradient` with the new velocity.
 *
 *@Args:
 *    gradient: array of `size` floats; input gradient, output step.
 *    size: number of elements in `gradient`.
 *    param_name: key selecting the velocity buffer.
 */
void SGDOptimizer::compute_step(float* gradient, int size, std::string param_name = "") {
    // operator[] creates a null entry for an unseen name; value-initialized arrays are all zero.
    float*& slot = _velocity[param_name];

    if (slot == nullptr) {
        slot = new float [size]();
    }

    // Resolve the buffer once instead of hashing param_name for every element.
    float* velocity = slot;

    for (int i = 0; i < size; ++i) {
        velocity[i] = _momentum * velocity[i] + (1 - _momentum) * gradient[i];
        gradient[i] = velocity[i];
    }
}
}//namespace
此差异已折叠。
seed: 1024
gaussian_sampling {
std: 0.5
cached: true
cache_size: 100000
}
optimizer {
type: "Adam"
base_lr: 0.05
momentum: 0.9
beta1: 0.9
beta2: 0.999
epsilon: 1e-08
}
async_es {
model_iter_id: 0
}
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
此差异已折叠。
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册