diff --git a/.copyright.hook b/.copyright.hook
index 3be6d0ae5bf352aa08ee44ab2144670f1bf03510..1b0acacb97a1b3059fcc88fb44b6168fa0419473 100644
--- a/.copyright.hook
+++ b/.copyright.hook
@@ -1,6 +1,5 @@
from __future__ import absolute_import
from __future__ import print_function
-from __future__ import unicode_literals
import argparse
import io, re
diff --git a/.teamcity/Dockerfile b/.teamcity/Dockerfile
index c3d1c209eb04bf7379969a28d0be4ce1bfe10c0d..99eec25ba86ed4d2acf77faf25f14d9092b09595 100644
--- a/.teamcity/Dockerfile
+++ b/.teamcity/Dockerfile
@@ -18,3 +18,7 @@
FROM parl/parl-test:cuda9.0-cudnn7-v2
COPY ./requirements.txt /root/
+
+RUN apt-get install -y libgflags-dev libgoogle-glog-dev libomp-dev unzip
+RUN apt-get install -y libgtest-dev && cd /usr/src/gtest && mkdir build \
+ && cd build && cmake .. && make && cp libgtest*.a /usr/local/lib
diff --git a/.teamcity/build.sh b/.teamcity/build.sh
index 6a33424797690bcd088381bd8173ae7d881c2dbc..1f3c0cd20e3dfc0fa3eb378d21d5e490d8afea33 100755
--- a/.teamcity/build.sh
+++ b/.teamcity/build.sh
@@ -69,7 +69,7 @@ function run_test_with_gpu() {
Running unit tests with GPU...
========================================
EOF
- ctest --output-on-failure -j10
+ ctest --output-on-failure -j20 --verbose
cd ${REPO_ROOT}
rm -rf ${REPO_ROOT}/build
}
@@ -90,7 +90,7 @@ function run_test_with_cpu() {
=====================================================
EOF
if [ $# -eq 1 ];then
- ctest --output-on-failure -j10
+ ctest --output-on-failure -j20 --verbose
else
ctest --output-on-failure
fi
@@ -145,7 +145,8 @@ function main() {
;;
test)
# test code compability in environments with various python versions
- declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37")
+ #declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37")
+ declare -a envs=("py27" "py36")
for env in "${envs[@]}";do
cd /work
source ~/.bashrc
@@ -158,7 +159,7 @@ function main() {
echo ========================================
pip install .
if [ \( $env == "py27" -o $env == "py36" -o $env == "py37" \) ]
- then
+ then
pip install -r .teamcity/requirements.txt
run_test_with_cpu $env
run_test_with_cpu $env "DIS_TESTING_SERIALLY"
@@ -169,6 +170,10 @@ function main() {
pip install -r .teamcity/requirements_torch.txt
run_test_with_cpu $env "DIS_TESTING_TORCH"
fi
+ # clean env
+ export LC_ALL=C.UTF-8
+ export LANG=C.UTF-8
+ xparl stop
done
run_test_with_gpu
diff --git a/.teamcity/requirements.txt b/.teamcity/requirements.txt
index 354e3632e02ce8e678df2024a6d16657281c1a0e..8ed94543532fee0c02b048a36dba05832ae3d161 100644
--- a/.teamcity/requirements.txt
+++ b/.teamcity/requirements.txt
@@ -3,4 +3,3 @@ paddlepaddle-gpu==1.6.1.post97
gym
details
parameterized
-timeout_decorator
diff --git a/.teamcity/requirements_torch.txt b/.teamcity/requirements_torch.txt
index dd2808a12eaab7e3158d09334ffb916917427417..5cdd9ea56ad6cc2db2ecd1fc6f7e046ff84507b7 100644
--- a/.teamcity/requirements_torch.txt
+++ b/.teamcity/requirements_torch.txt
@@ -2,4 +2,3 @@
gym
details
parameterized
-timeout_decorator
diff --git a/.scripts/update_readme_paddle_version.py b/.teamcity/update_readme_paddle_version.py
similarity index 94%
rename from .scripts/update_readme_paddle_version.py
rename to .teamcity/update_readme_paddle_version.py
index 56d56914c65956a2bb753bc58269d59034766b1c..901d2d672d9f3eff1021241ac80b6e9f75d0886a 100644
--- a/.scripts/update_readme_paddle_version.py
+++ b/.teamcity/update_readme_paddle_version.py
@@ -37,7 +37,8 @@ if __name__ == '__main__':
exclude_examples = [
'NeurIPS2019-Learn-to-Move-Challenge',
- 'NeurIPS2018-AI-for-Prosthetics-Challenge', 'EagerMode'
+ 'NeurIPS2018-AI-for-Prosthetics-Challenge', 'LiftSim_baseline',
+ 'EagerMode'
]
for example in os.listdir('../examples/'):
if example not in exclude_examples:
diff --git a/.teamcity/windows_test.sh b/.teamcity/windows_test.sh
new file mode 100755
index 0000000000000000000000000000000000000000..a6d12a6f6c9c212e406f8e900a03c3f4f0cfc44b
--- /dev/null
+++ b/.teamcity/windows_test.sh
@@ -0,0 +1,76 @@
+#!/usr/bin/env bash
+
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# NOTE: You need to install mingw-cmake.
+
+function init() {
+ RED='\033[0;31m'
+ BLUE='\033[0;34m'
+ BOLD='\033[1m'
+ NONE='\033[0m'
+
+ REPO_ROOT=`pwd`
+}
+
+
+function abort(){
+ echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
+ echo "Please use pre-commit to check what is wrong." 1>&2
+ exit 1
+}
+
+function run_test_with_cpu() {
+ export CUDA_VISIBLE_DEVICES="-1"
+
+ mkdir -p ${REPO_ROOT}/build
+ cd ${REPO_ROOT}/build
+ if [ $# -eq 1 ];then
+ cmake -G "MinGW Makefiles" ..
+ else
+ cmake -G "MinGW Makefiles" .. -$2=ON
+ fi
+ cat <
diff --git a/README.cn.md b/README.cn.md
--- a/README.cn.md
+++ b/README.cn.md
[English](./README.md) | 简体中文
-[**文档**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md)
+[**文档**](https://parl.readthedocs.io/en/stable/index.html)
> PARL 是一个高性能、灵活的强化学习框架。
# 特点
@@ -48,7 +48,7 @@ class Agent(object):
parl.connect('localhost:8037')
agent = Agent()
agent.say_hello()
-ans = agent.sum(1,5) # run remotely and not comsume any local computation resources
+ans = agent.sum(1,5) # run remotely without consuming any local computation resources
```
两步调度外部的计算资源:
1. 使用`parl.remote_class`修饰一个类,之后这个类就被转化为可以运行在其他CPU或者机器上的类。
@@ -61,8 +61,8 @@ ans = agent.sum(1,5) # run remotely and not comsume any local computation resour
# 安装:
### 依赖
-- Python 2.7 or 3.5+.
-- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**非必须的**,如果你只用并行部分的接口不需要安装paddle)
+- Python 2.7 or 3.5+. (**Windows系统**目前仅支持python3.6+以上的环境)
+- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**非必须的**,如果你只用并行部分的接口不需要安装paddle)
```
@@ -83,6 +83,6 @@ pip install parl
- [冠军解决方案:NIPS2018强化学习假肢挑战赛](examples/NeurIPS2018-AI-for-Prosthetics-Challenge/)
- [冠军解决方案:NIPS2019强化学习仿生人控制赛事](examples/NeurIPS2019-Learn-to-Move-Challenge/)
-
+
diff --git a/README.md b/README.md
index a5cbdd76a71c01a04c33f79fe701322a57795010..ed8ae1e28a6864e0a1d171a172d17dfe1bc03b8f 100644
--- a/README.md
+++ b/README.md
@@ -3,7 +3,7 @@
English | [简体中文](./README.cn.md)
-[**Documentation**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md)
+[**Documentation**](https://parl.readthedocs.io/en/stable/index.html)
> PARL is a flexible and high-efficient reinforcement learning framework.
@@ -64,7 +64,7 @@ For users, they can write code in a simple way, just like writing multi-thread c
# Install:
### Dependencies
-- Python 2.7 or 3.5+.
+- Python 2.7 or 3.5+ (on **Windows**, PARL only supports Python 3.6+).
- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**Optional**, if you only want to use APIs related to parallelization alone)
diff --git a/benchmark/torch/AlphaZero/.pic/good_moves.png b/benchmark/torch/AlphaZero/.pic/good_moves.png
new file mode 100644
index 0000000000000000000000000000000000000000..f007fc4a6f2dbc9df9a6a8163de08dcf59cb82dc
Binary files /dev/null and b/benchmark/torch/AlphaZero/.pic/good_moves.png differ
diff --git a/benchmark/torch/AlphaZero/.pic/perfect_moves.png b/benchmark/torch/AlphaZero/.pic/perfect_moves.png
new file mode 100644
index 0000000000000000000000000000000000000000..72c3913ea58498446e92d170255c71606e194fe0
Binary files /dev/null and b/benchmark/torch/AlphaZero/.pic/perfect_moves.png differ
diff --git a/benchmark/torch/AlphaZero/Arena.py b/benchmark/torch/AlphaZero/Arena.py
new file mode 100644
index 0000000000000000000000000000000000000000..a0791803eb1061485f2f6a647540d9bc9d4f45ee
--- /dev/null
+++ b/benchmark/torch/AlphaZero/Arena.py
@@ -0,0 +1,105 @@
+# Third party code
+#
+# The following code are copied or modified from:
+# https://github.com/suragnair/alpha-zero-general
+
+from tqdm import tqdm
+from parl.utils import logger
+
+
+class Arena():
+ """
+    An Arena class where any 2 agents can be pitted against each other.
+ """
+
+ def __init__(self, player1, player2, game, display=None):
+ """
+ Input:
+            player 1,2: two functions that take a board as input and return an action
+ game: Game object
+ display: a function that takes board as input and prints it (e.g.
+ display in othello/OthelloGame). Is necessary for verbose
+ mode.
+
+ see othello/OthelloPlayers.py for an example. See pit.py for pitting
+ human players/other baselines with each other.
+ """
+ self.player1 = player1
+ self.player2 = player2
+ self.game = game
+ self.display = display
+
+ def playGame(self, verbose=False):
+ """
+ Executes one episode of a game.
+
+ Returns:
+ either
+ winner: player who won the game (1 if player1, -1 if player2)
+ or
+ draw result returned from the game that is neither 1, -1, nor 0.
+ """
+ players = [self.player2, None, self.player1]
+ curPlayer = 1
+ board = self.game.getInitBoard()
+ it = 0
+ while self.game.getGameEnded(board, curPlayer) == 0:
+ it += 1
+ if verbose:
+ assert self.display
+ print("Turn ", str(it), "Player ", str(curPlayer))
+ self.display(board)
+ action = players[curPlayer + 1](self.game.getCanonicalForm(
+ board, curPlayer))
+
+ valids = self.game.getValidMoves(
+ self.game.getCanonicalForm(board, curPlayer), 1)
+
+ if valids[action] == 0:
+ logger.error('Action {} is not valid!'.format(action))
+ logger.debug('valids = {}'.format(valids))
+ assert valids[action] > 0
+ board, curPlayer = self.game.getNextState(board, curPlayer, action)
+ if verbose:
+ assert self.display
+ print("Game over: Turn ", str(it), "Result ",
+ str(self.game.getGameEnded(board, 1)))
+ self.display(board)
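+        # getGameEnded reports the result from curPlayer's perspective; multiplying
+        # by curPlayer converts it to player1's perspective (1 if player1 won, -1 if player2 won).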
+ return curPlayer * self.game.getGameEnded(board, curPlayer)
+
+ def playGames(self, num, verbose=False):
+ """
+ Plays num games in which player1 starts num/2 games and player2 starts
+ num/2 games.
+
+ Returns:
+ oneWon: games won by player1
+ twoWon: games won by player2
+ draws: games won by nobody
+ """
+
+ num = int(num / 2)
+ oneWon = 0
+ twoWon = 0
+ draws = 0
+ for _ in tqdm(range(num), desc="Arena.playGames (1)"):
+ gameResult = self.playGame(verbose=verbose)
+ if gameResult == 1:
+ oneWon += 1
+ elif gameResult == -1:
+ twoWon += 1
+ else:
+ draws += 1
+
+ self.player1, self.player2 = self.player2, self.player1
+
+ for _ in tqdm(range(num), desc="Arena.playGames (2)"):
+ gameResult = self.playGame(verbose=verbose)
+ if gameResult == -1:
+ oneWon += 1
+ elif gameResult == 1:
+ twoWon += 1
+ else:
+ draws += 1
+
+ return oneWon, twoWon, draws
diff --git a/benchmark/torch/AlphaZero/Coach.py b/benchmark/torch/AlphaZero/Coach.py
new file mode 100644
index 0000000000000000000000000000000000000000..01394b076db969db42a7277b5d95f82bd661db3d
--- /dev/null
+++ b/benchmark/torch/AlphaZero/Coach.py
@@ -0,0 +1,246 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import sys
+import threading
+import queue
+import pickle
+from pickle import Pickler, Unpickler
+from random import shuffle
+from parl.utils import tensorboard
+
+import numpy as np
+from tqdm import tqdm
+
+import parl
+from parl.utils import logger
+
+from actor import Actor
+from utils import split_group, get_test_dataset
+from alphazero_agent import create_agent
+
+
+class Coach():
+ """
+    This class executes self-play, learning and evaluation.
+ """
+
+ def __init__(self, game, args):
+ self.game = game
+ self.args = args
+
+ # neural network of current generation
+ self.current_agent = create_agent(self.game)
+ # neural network of previous generation
+ self.previous_agent = create_agent(self.game)
+
+ # history of examples from args.numItersForTrainExamplesHistory latest iterations
+ self.trainExamplesHistory = []
+
+ self.remote_actors_signal_queues = []
+ self.remote_actors_return_queue = queue.Queue()
+
+ self.test_dataset = get_test_dataset()
+
+ def _run_remote_tasks(self, signal_queue):
+ # The remote actor will actually run on the local machine or other machines of xparl cluster
+ remote_actor = Actor(self.game, self.args)
+
+ while True:
+ # receive running task signal
+ # signal: specify task type and task input data (optional)
+ signal = signal_queue.get()
+
+ if signal["task"] == "self-play":
+ episode_num_each_actor = self.args.numEps // self.args.actors_num
+ result = remote_actor.self_play(
+ self.current_agent.get_weights(), episode_num_each_actor)
+ self.remote_actors_return_queue.put({"self-play": result})
+
+ elif signal["task"] == "pitting":
+ games_num_each_actor = self.args.arenaCompare // self.args.actors_num
+ result = remote_actor.pitting(
+ self.previous_agent.get_weights(),
+ self.current_agent.get_weights(), games_num_each_actor)
+ self.remote_actors_return_queue.put({"pitting": result})
+
+ elif signal["task"] == "evaluate_test_dataset":
+ test_dataset = signal["test_dataset"]
+ result = remote_actor.evaluate_test_dataset(
+ self.current_agent.get_weights(), test_dataset)
+ self.remote_actors_return_queue.put({
+ "evaluate_test_dataset":
+ result
+ })
+ else:
+ raise NotImplementedError
+
+ def _create_remote_actors(self):
+ # connect to xparl cluster to submit jobs
+ parl.connect(self.args.master_address)
+
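+        # One signal queue and one daemon thread are created per remote actor; each
+        # thread instantiates its own remote Actor, and all task results are funnelled
+        # back through the single shared return queue.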
+ for i in range(self.args.actors_num):
+ signal_queue = queue.Queue()
+ self.remote_actors_signal_queues.append(signal_queue)
+
+ remote_thread = threading.Thread(
+ target=self._run_remote_tasks, args=(signal_queue, ))
+ remote_thread.setDaemon(True)
+ remote_thread.start()
+
+ def learn(self):
+ """Each iteration:
+ 1. Performs numEps episodes of self-play.
+ 2. Retrains neural network with examples in trainExamplesHistory
+ (which has a maximum length of numItersForTrainExamplesHistory).
+ 3. Evaluates the new neural network with the test dataset.
+ 4. Pits the new neural network against the old one and accepts it
+ only if it wins >= updateThreshold fraction of games.
+ """
+
+ # create remote actors to run tasks (self-play/pitting/evaluate_test_dataset) in parallel.
+ self._create_remote_actors()
+
+ for iteration in range(1, self.args.numIters + 1):
+ logger.info('Starting Iter #{} ...'.format(iteration))
+
+ ####################
+ logger.info('Step1: self-play in parallel...')
+ iterationTrainExamples = []
+ # update weights of remote actors to the latest weights, and ask them to run self-play task
+ for signal_queue in self.remote_actors_signal_queues:
+ signal_queue.put({"task": "self-play"})
+ # wait for all remote actors (a total of self.args.actors_num) to return the self-play results
+ for _ in range(self.args.actors_num):
+ result = self.remote_actors_return_queue.get()
+ iterationTrainExamples.extend(result["self-play"])
+
+ # save the iteration examples to the history
+ self.trainExamplesHistory.append(iterationTrainExamples)
+ if len(self.trainExamplesHistory
+ ) > self.args.numItersForTrainExamplesHistory:
+ logger.warning("Removing the oldest entry in trainExamples.")
+ self.trainExamplesHistory.pop(0)
+ self.saveTrainExamples(iteration) # backup history to a file
+
+ ####################
+ logger.info('Step2: train neural network...')
+ # shuffle examples before training
+ trainExamples = []
+ for e in self.trainExamplesHistory:
+ trainExamples.extend(e)
+ shuffle(trainExamples)
+
+ # training new network, keeping a copy of the old one
+ self.current_agent.save(
+ os.path.join(self.args.checkpoint, 'temp.pth.tar'))
+ self.previous_agent.restore(
+ os.path.join(self.args.checkpoint, 'temp.pth.tar'))
+
+ self.current_agent.learn(trainExamples)
+
+ ####################
+ logger.info('Step3: evaluate test dataset in parallel...')
+ cnt = 0
+ # update weights of remote actors to the latest weights, and ask them to evaluate assigned test dataset
+ for i, data in enumerate(
+ split_group(
+ self.test_dataset,
+ len(self.test_dataset) // self.args.actors_num)):
+ self.remote_actors_signal_queues[i].put({
+ "task":
+ "evaluate_test_dataset",
+ "test_dataset":
+ data
+ })
+ cnt += len(data)
+ perfect_moves_cnt, good_moves_cnt = 0, 0
+ # wait for all remote actors (a total of self.args.actors_num) to return the evaluating results
+ for _ in range(self.args.actors_num):
+ (perfect_moves,
+ good_moves) = self.remote_actors_return_queue.get(
+ )["evaluate_test_dataset"]
+ perfect_moves_cnt += perfect_moves
+ good_moves_cnt += good_moves
+ logger.info('perfect moves rate: {}, good moves rate: {}'.format(
+ perfect_moves_cnt / cnt, good_moves_cnt / cnt))
+ tensorboard.add_scalar('perfect_moves_rate',
+ perfect_moves_cnt / cnt, iteration)
+ tensorboard.add_scalar('good_moves_rate', good_moves_cnt / cnt,
+ iteration)
+
+ ####################
+ logger.info(
+ 'Step4: pitting against previous generation in parallel...')
+ # transfer weights of previous generation and current generation to the remote actors, and ask them to pit.
+ for signal_queue in self.remote_actors_signal_queues:
+ signal_queue.put({"task": "pitting"})
+ previous_wins, current_wins, draws = 0, 0, 0
+ for _ in range(self.args.actors_num):
+ (pwins_, cwins_,
+ draws_) = self.remote_actors_return_queue.get()["pitting"]
+ previous_wins += pwins_
+ current_wins += cwins_
+ draws += draws_
+
+ logger.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
+ (current_wins, previous_wins, draws))
+ if previous_wins + current_wins == 0 or float(current_wins) / (
+ previous_wins + current_wins) < self.args.updateThreshold:
+ logger.info('REJECTING NEW MODEL')
+ self.current_agent.restore(
+ os.path.join(self.args.checkpoint, 'temp.pth.tar'))
+ else:
+ logger.info('ACCEPTING NEW MODEL')
+ self.current_agent.save(
+ os.path.join(self.args.checkpoint, 'best.pth.tar'))
+ self.current_agent.save(
+ os.path.join(self.args.checkpoint,
+ self.getCheckpointFile(iteration)))
+
+ def getCheckpointFile(self, iteration):
+ return 'checkpoint_' + str(iteration) + '.pth.tar'
+
+ def saveTrainExamples(self, iteration):
+ folder = self.args.checkpoint
+ if not os.path.exists(folder):
+ os.makedirs(folder)
+ filename = os.path.join(
+ folder,
+ self.getCheckpointFile(iteration) + ".examples")
+ with open(filename, "wb+") as f:
+ Pickler(f).dump(self.trainExamplesHistory)
+ f.closed
+
+ def loadModel(self):
+ self.current_agent.restore(
+ os.path.join(self.args.load_folder_file[0],
+ self.args.load_folder_file[1]))
+
+ def loadTrainExamples(self):
+ modelFile = os.path.join(self.args.load_folder_file[0],
+ self.args.load_folder_file[1])
+ examplesFile = modelFile + ".examples"
+ if not os.path.isfile(examplesFile):
+ logger.warning(
+ "File {} with trainExamples not found!".format(examplesFile))
+ r = input("Continue? [y|n]")
+ if r != "y":
+ sys.exit()
+ else:
+ logger.info("File with trainExamples found. Loading it...")
+ with open(examplesFile, "rb") as f:
+ self.trainExamplesHistory = Unpickler(f).load()
+ logger.info('Loading done!')
diff --git a/benchmark/torch/AlphaZero/MCTS.py b/benchmark/torch/AlphaZero/MCTS.py
new file mode 100644
index 0000000000000000000000000000000000000000..b011efe15dbdc10ccbe2c07e6d30b2e2aaa82d9d
--- /dev/null
+++ b/benchmark/torch/AlphaZero/MCTS.py
@@ -0,0 +1,164 @@
+# Third party code
+#
+# The following code are copied or modified from:
+# https://github.com/suragnair/alpha-zero-general
+
+import math
+import time
+
+import numpy as np
+
+EPS = 1e-8
+
+
+class MCTS():
+ """
+ This class handles the MCTS tree.
+ """
+
+ def __init__(self, game, nn_agent, args, dirichlet_noise=False):
+ self.game = game
+ self.nn_agent = nn_agent
+ self.args = args
+ self.dirichlet_noise = dirichlet_noise
+ self.Qsa = {} # stores Q values for s,a (as defined in the paper)
+ self.Nsa = {} # stores #times edge s,a was visited
+ self.Ns = {} # stores #times board s was visited
+ self.Ps = {} # stores initial policy (returned by neural net)
+
+        self.Es = {}  # stores game.getGameEnded results for board s
+ self.Vs = {} # stores game.getValidMoves for board s
+
+ def getActionProb(self, canonicalBoard, temp=1):
+ """
+ This function performs numMCTSSims simulations of MCTS starting from
+ canonicalBoard.
+
+ Returns:
+ probs: a policy vector where the probability of the ith action is
+ proportional to Nsa[(s,a)]**(1./temp)
+ """
+ for i in range(self.args.numMCTSSims):
+ dir_noise = (i == 0 and self.dirichlet_noise)
+ self.search(canonicalBoard, dirichlet_noise=dir_noise)
+
+ s = self.game.stringRepresentation(canonicalBoard)
+ counts = [
+ self.Nsa[(s, a)] if (s, a) in self.Nsa else 0
+ for a in range(self.game.getActionSize())
+ ]
+
+ if temp == 0:
+ bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten()
+ bestA = np.random.choice(bestAs)
+ probs = [0] * len(counts)
+ probs[bestA] = 1
+ return probs
+
+ counts = [x**(1. / temp) for x in counts]
+ counts_sum = float(sum(counts))
+ probs = [x / counts_sum for x in counts]
+ return probs
+
+ def search(self, canonicalBoard, dirichlet_noise=False):
+ """
+ This function performs one iteration of MCTS. It is recursively called
+ till a leaf node is found. The action chosen at each node is one that
+ has the maximum upper confidence bound as in the paper.
+
+ Once a leaf node is found, the neural network is called to return an
+ initial policy P and a value v for the state. This value is propagated
+ up the search path. In case the leaf node is a terminal state, the
+ outcome is propagated up the search path. The values of Ns, Nsa, Qsa are
+ updated.
+
+ NOTE: the return values are the negative of the value of the current
+ state. This is done since v is in [-1,1] and if v is the value of a
+ state for the current player, then its value is -v for the other player.
+
+ Returns:
+ v: the negative of the value of the current canonicalBoard
+ """
+
+ s = self.game.stringRepresentation(canonicalBoard)
+
+ if s not in self.Es:
+ self.Es[s] = self.game.getGameEnded(canonicalBoard, 1)
+ if self.Es[s] != 0:
+ # terminal node
+ return -self.Es[s]
+
+ if s not in self.Ps:
+ # leaf node
+ self.Ps[s], v = self.nn_agent.predict(canonicalBoard)
+
+ valids = self.game.getValidMoves(canonicalBoard, 1)
+ self.Ps[s] = self.Ps[s] * valids # masking invalid moves
+ if dirichlet_noise:
+ self.applyDirNoise(s, valids)
+ sum_Ps_s = np.sum(self.Ps[s])
+ if sum_Ps_s > 0:
+ self.Ps[s] /= sum_Ps_s # renormalize
+ else:
+ # if all valid moves were masked make all valid moves equally probable
+
+                # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've got overfitting or something else.
+ # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
+ print("All valid moves were masked, doing a workaround.")
+ self.Ps[s] = self.Ps[s] + valids
+ self.Ps[s] /= np.sum(self.Ps[s])
+
+ self.Vs[s] = valids
+ self.Ns[s] = 0
+ return -v
+
+ valids = self.Vs[s]
+ if dirichlet_noise:
+ self.applyDirNoise(s, valids)
+ sum_Ps_s = np.sum(self.Ps[s])
+ self.Ps[s] /= sum_Ps_s # renormalize
+ cur_best = -float('inf')
+ best_act = -1
+
+ # pick the action with the highest upper confidence bound
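+        # PUCT rule: u(s, a) = Q(s, a) + cpuct * P(s, a) * sqrt(N(s)) / (1 + N(s, a))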
+ for a in range(self.game.getActionSize()):
+ if valids[a]:
+ if (s, a) in self.Qsa:
+ u = self.Qsa[
+ (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt(
+ self.Ns[s]) / (1 + self.Nsa[(s, a)])
+ else:
+ u = self.args.cpuct * self.Ps[s][a] * math.sqrt(
+ self.Ns[s] + EPS) # Q = 0 ?
+
+ if u > cur_best:
+ cur_best = u
+ best_act = a
+
+ a = best_act
+ next_s, next_player = self.game.getNextState(canonicalBoard, 1, a)
+ next_s = self.game.getCanonicalForm(next_s, next_player)
+
+ v = self.search(next_s)
+
+ if (s, a) in self.Qsa:
+ self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[
+ (s, a)] + v) / (self.Nsa[(s, a)] + 1)
+ self.Nsa[(s, a)] += 1
+
+ else:
+ self.Qsa[(s, a)] = v
+ self.Nsa[(s, a)] = 1
+
+ self.Ns[s] += 1
+ return -v
+
+ def applyDirNoise(self, s, valids):
+ dir_values = np.random.dirichlet(
+ [self.args.dirichletAlpha] * np.count_nonzero(valids))
+ dir_idx = 0
+ for idx in range(len(self.Ps[s])):
+ if self.Ps[s][idx]:
+ self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + (
+ 0.25 * dir_values[dir_idx])
+ dir_idx += 1
diff --git a/benchmark/torch/AlphaZero/README.md b/benchmark/torch/AlphaZero/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..72d9c807fb5066c51b49520b8aca3a5e666e133c
--- /dev/null
+++ b/benchmark/torch/AlphaZero/README.md
@@ -0,0 +1,58 @@
+## AlphaZero baseline for Connect4 game (distributed version)
+- In this example, we provide a fine-tuned AlphaZero baseline to solve the Connect4 game, based on the code of [alpha-zero-general](https://github.com/suragnair/alpha-zero-general) repo.
+- We take advantage of the parallelism capability of [PARL](https://github.com/PaddlePaddle/PARL) to run self-play and evaluation tasks in parallel.
+- We also provide scripts to pack your well-trained model into a submission file, which can be submitted directly to the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition.
+
+### Dependencies
+- python3
+- [parl==1.3](https://github.com/PaddlePaddle/PARL)
+- torch
+- tqdm
+
+### Training
+1. Download the [1k connect4 validation set](https://www.kaggle.com/petercnudde/1k-connect4-validation-set) to the current directory. (filename: `refmoves1k_kaggle`)
+
+2. Start xparl cluster
+```bash
+# You can change the following `cpu_num` and `args.actors_num` in main.py
+# based on the number of CPUs on your machine.
+
+xparl start --port 8010 --cpu_num 25
+```
+
+```bash
+# [OPTIONAL] You can also run the following script on other machines to add more CPU resources
+# to the xparl cluster, so that you can increase the parallelism (args.actors_num).
+
+xparl connect --address MASTER_IP:8010 --cpu_num [CPU_NUM]
+```
+
+3. Run training script
+```bash
+python main.py
+```
+
+4. Visualize (good moves rate and perfect moves rate)
+```
+tensorboard --logdir .
+```
+
+### Submitting
+To submit the well-trained model to Kaggle, you can use the provided script to generate `submission.py`, for example:
+```bash
+python gen_submission.py saved_model/best.pth.tar
+```
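+
+As a quick local sanity check (a minimal sketch, not part of this repo; it assumes the [kaggle-environments](https://github.com/Kaggle/kaggle-environments) package is installed), you can pit the generated `submission.py` against the built-in random agent:
+```python
+from kaggle_environments import make
+
+# create a ConnectX environment and play one game:
+# the generated agent vs. the built-in random agent
+env = make("connectx", debug=True)
+env.run(["submission.py", "random"])
+print(env.render(mode="ansi"))
+```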
+
+### Performance
+- The following are the `good moves rate` and `perfect moves rate` indicators in TensorBoard; please refer to the [link](https://www.kaggle.com/petercnudde/scoring-connect-x-agents) for their specific meaning.
+
+
+
+> It takes about 1 day to run 25 iterations on a machine with 25 CPUs.
+
+- It reaches a score of about 1368 (rank 5 as of 2020/06/04) in the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition.
+
+
+### Reference
+- [suragnair/alpha-zero-general](https://github.com/suragnair/alpha-zero-general)
+- [Scoring connect-x agents](https://www.kaggle.com/petercnudde/scoring-connect-x-agents)
diff --git a/benchmark/torch/AlphaZero/actor.py b/benchmark/torch/AlphaZero/actor.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ed719b92d292903f81f7c92a983927bf5c9cab5
--- /dev/null
+++ b/benchmark/torch/AlphaZero/actor.py
@@ -0,0 +1,165 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import parl
+import os
+from alphazero_agent import create_agent
+from MCTS import MCTS
+from Arena import Arena
+from utils import win_loss_draw
+
+
+@parl.remote_class
+class Actor(object):
+ def __init__(self, game, args):
+ os.environ['OMP_NUM_THREADS'] = "1"
+ self.game = game
+ self.args = args
+
+ # neural network of previous generation
+ self.previous_agent = create_agent(self.game, cuda=False)
+ # neural network of current generation
+ self.current_agent = create_agent(self.game, cuda=False)
+
+ # MCTS of previous generation
+ self.previous_mcts = MCTS(
+ self.game, self.previous_agent, self.args, dirichlet_noise=True)
+ # MCTS of current generation
+ self.current_mcts = MCTS(
+ self.game, self.current_agent, self.args, dirichlet_noise=True)
+
+ def self_play(self, current_weights, game_num):
+ """Collecting training data by self-play.
+
+ Args:
+ current_weights (numpy.array): latest weights of neural network
+            game_num (int): number of self-play games to run
+
+ Returns:
+ train_examples (list): examples of the form (canonicalBoard, currPlayer, pi,v)
+ """
+
+ # update weights of current neural network with latest weights
+ self.current_agent.set_weights(current_weights)
+
+ train_examples = []
+ for _ in range(game_num):
+ # reset node state of MCTS
+ self.current_mcts = MCTS(
+ self.game, self.current_agent, self.args, dirichlet_noise=True)
+ train_examples.extend(self._executeEpisode())
+ return train_examples
+
+ def pitting(self, previous_weights, current_weights, games_num):
+ """Fighting between previous generation agent and current generation agent
+
+ Args:
+ previous_weights (numpy.array): weights of previous generation neural network
+ current_weights (numpy.array): weights of current generation neural network
+            games_num (int): number of games to play
+
+ Returns:
+            tuple of (games won by previous agent, games won by current agent, number of draws)
+ """
+ # update weights of previous and current neural network
+ self.previous_agent.set_weights(previous_weights)
+ self.current_agent.set_weights(current_weights)
+
+ # reset node state of MCTS
+ self.previous_mcts = MCTS(self.game, self.previous_agent, self.args)
+ self.current_mcts = MCTS(self.game, self.current_agent, self.args)
+
+ arena = Arena(
+ lambda x: np.argmax(self.previous_mcts.getActionProb(x, temp=0)),
+ lambda x: np.argmax(self.current_mcts.getActionProb(x, temp=0)),
+ self.game)
+ previous_wins, current_wins, draws = arena.playGames(games_num)
+
+ return (previous_wins, current_wins, draws)
+
+ def evaluate_test_dataset(self, current_weights, test_dataset):
+ """Evaluate performance of latest neural nerwork
+
+ Args:
+ current_weights (numpy.array): latest weights of neural network
+            test_dataset (list): test dataset to evaluate (list of board states with labelled move scores)
+
+ Returns:
+ tuple of (number of perfect moves, number of good moves)
+ """
+ # update weights of current neural network with latest weights
+ self.current_agent.set_weights(current_weights)
+
+ perfect_move_count, good_move_count = 0, 0
+ for data in test_dataset:
+ self.current_mcts = MCTS(self.game, self.current_agent, self.args)
+
+ x = self.game.getCanonicalForm(data['board'], data['player'])
+ agent_move = int(
+ np.argmax(self.current_mcts.getActionProb(x, temp=0)))
+
+ moves = data["move_score"]
+ perfect_score = max(moves)
+ perfect_moves = [i for i in range(7) if moves[i] == perfect_score]
+
+ if agent_move in perfect_moves:
+ perfect_move_count += 1
+ if win_loss_draw(
+ moves[agent_move]) == win_loss_draw(perfect_score):
+ good_move_count += 1
+
+ return (perfect_move_count, good_move_count)
+
+ def _executeEpisode(self):
+ """
+
+ This function executes one episode of self-play, starting with player 1.
+ As the game goes on, each turn is added as a training example to
+ trainExamples. The game is played till the game ends. After the game
+ ends, the outcome of the game is used to assign values to each example
+ in trainExamples.
+
+ It uses a temp=1 if episodeStep < tempThresholdStep, and thereafter
+ uses temp=0.
+
+ Returns:
+ trainExamples: a list of examples of the form (canonicalBoard, currPlayer, pi,v)
+ pi is the MCTS informed policy vector, v is +1 if
+ the player eventually won the game, else -1.
+ """
+ trainExamples = []
+ board = self.game.getInitBoard()
+ self.curPlayer = 1
+ episodeStep = 0
+
+ while True:
+ episodeStep += 1
+ canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
+ temp = int(episodeStep < self.args.tempThresholdStep)
+
+ pi = self.current_mcts.getActionProb(canonicalBoard, temp=temp)
+ sym = self.game.getSymmetries(canonicalBoard, pi)
+ for b, p in sym: # board, pi
+ trainExamples.append([b, self.curPlayer, p, None])
+
+ action = np.random.choice(len(pi), p=pi)
+ board, self.curPlayer = self.game.getNextState(
+ board, self.curPlayer, action)
+
+ r = self.game.getGameEnded(board, self.curPlayer)
+
+ if r != 0:
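+                # The game has ended: label every stored example with the final outcome
+                # from the perspective of the player to move in that example
+                # (+1 if that player eventually won, -1 if it lost, a small value for a draw).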
+ return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer)))
+ for x in trainExamples]
diff --git a/benchmark/torch/AlphaZero/alphazero_agent.py b/benchmark/torch/AlphaZero/alphazero_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..9e7e497e4818f30ae8d71bee109f4ff6f9795962
--- /dev/null
+++ b/benchmark/torch/AlphaZero/alphazero_agent.py
@@ -0,0 +1,150 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import os
+import numpy as np
+import parl
+import torch
+import torch.optim as optim
+
+from tqdm import tqdm
+from utils import *
+from connect4_model import Connect4Model
+
+args = dotdict({
+ 'lr': 0.001,
+ 'dropout': 0.3,
+ 'epochs': 5,
+ 'batch_size': 64,
+ 'num_channels': 64,
+})
+
+
+class AlphaZero(parl.Algorithm):
+ def __init__(self, model):
+ self.model = model
+
+ def learn(self, boards, target_pis, target_vs, optimizer):
+ self.model.train() # train mode
+
+ # compute model output
+ out_log_pi, out_v = self.model(boards)
+
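+        # policy loss: cross-entropy between the MCTS visit-count targets and the
+        # predicted log-probabilities; value loss: MSE between the game outcome and
+        # the predicted value. Both are averaged over the batch.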
+ pi_loss = -torch.sum(target_pis * out_log_pi) / target_pis.size()[0]
+
+ v_loss = torch.sum(
+ (target_vs - out_v.view(-1))**2) / target_vs.size()[0]
+
+ total_loss = pi_loss + v_loss
+
+ # compute gradient and do SGD step
+ optimizer.zero_grad()
+ total_loss.backward()
+ optimizer.step()
+
+ return total_loss, pi_loss, v_loss
+
+ def predict(self, board):
+ self.model.eval() # eval mode
+
+ with torch.no_grad():
+ log_pi, v = self.model(board)
+
+ pi = torch.exp(log_pi)
+ return pi, v
+
+
+class AlphaZeroAgent(parl.Agent):
+ def __init__(self, algorithm, game, cuda):
+ super(AlphaZeroAgent, self).__init__(algorithm)
+ self.cuda = cuda
+ self.board_x, self.board_y = game.getBoardSize()
+ self.action_size = game.getActionSize()
+
+ def learn(self, examples):
+ """
+ Args:
+ examples: list of examples, each example is of form (board, pi, v)
+ """
+ optimizer = optim.Adam(self.algorithm.model.parameters(), lr=args.lr)
+
+ for epoch in range(args.epochs):
+ print('EPOCH ::: ' + str(epoch + 1))
+
+ batch_count = int(len(examples) / args.batch_size)
+
+ pbar = tqdm(range(batch_count), desc='Training Net')
+ for _ in pbar:
+ sample_ids = np.random.randint(
+ len(examples), size=args.batch_size)
+ boards, pis, vs = list(zip(*[examples[i] for i in sample_ids]))
+ boards = torch.FloatTensor(np.array(boards).astype(np.float64))
+ target_pis = torch.FloatTensor(np.array(pis))
+ target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))
+
+ if self.cuda:
+ boards, target_pis, target_vs = boards.contiguous().cuda(
+ ), target_pis.contiguous().cuda(), target_vs.contiguous(
+ ).cuda()
+
+ total_loss, pi_loss, v_loss = self.algorithm.learn(
+ boards, target_pis, target_vs, optimizer)
+
+ # record loss with tqdm
+ pbar.set_postfix(Loss_pi=pi_loss.item(), Loss_v=v_loss.item())
+
+ def predict(self, board):
+ """
+ Args:
+ board (np.array): input board
+
+ Return:
+ pi (np.array): probability of actions
+ v (np.array): estimated value of input
+ """
+ # preparing input
+ board = torch.FloatTensor(board.astype(np.float64))
+ if self.cuda:
+ board = board.contiguous().cuda()
+ board = board.view(1, self.board_x, self.board_y)
+
+ pi, v = self.algorithm.predict(board)
+
+ return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0]
+
+
+def create_agent(game, cuda=True):
+ cuda = cuda and torch.cuda.is_available()
+
+ model = Connect4Model(game, args)
+ if cuda:
+ model.cuda()
+
+ algorithm = AlphaZero(model)
+
+ alphazero_agent = AlphaZeroAgent(algorithm, game, cuda)
+ return alphazero_agent
diff --git a/benchmark/torch/AlphaZero/connect4_game.py b/benchmark/torch/AlphaZero/connect4_game.py
new file mode 100644
index 0000000000000000000000000000000000000000..c10e8ca4afbca839ef71b18fd8f39f7493f30a4d
--- /dev/null
+++ b/benchmark/torch/AlphaZero/connect4_game.py
@@ -0,0 +1,239 @@
+# Third party code
+#
+# The following code are copied or modified from:
+# https://github.com/suragnair/alpha-zero-general
+
+import numpy as np
+from collections import namedtuple
+
+DEFAULT_HEIGHT = 6
+DEFAULT_WIDTH = 7
+DEFAULT_WIN_LENGTH = 4
+
+WinState = namedtuple('WinState', 'is_ended winner')
+
+
+class Board():
+ """
+ Connect4 Board.
+ """
+
+ def __init__(self,
+ height=None,
+ width=None,
+ win_length=None,
+ np_pieces=None):
+ "Set up initial board configuration."
+ self.height = height or DEFAULT_HEIGHT
+ self.width = width or DEFAULT_WIDTH
+ self.win_length = win_length or DEFAULT_WIN_LENGTH
+
+ if np_pieces is None:
+ self.np_pieces = np.zeros([self.height, self.width], dtype=np.int)
+ else:
+ self.np_pieces = np_pieces
+ assert self.np_pieces.shape == (self.height, self.width)
+
+ def add_stone(self, column, player):
+ "Create copy of board containing new stone."
+ available_idx, = np.where(self.np_pieces[:, column] == 0)
+ if len(available_idx) == 0:
+ raise ValueError(
+ "Can't play column %s on board %s" % (column, self))
+
+ self.np_pieces[available_idx[-1]][column] = player
+
+ def get_valid_moves(self):
+ "Any zero value in top row in a valid move"
+ return self.np_pieces[0] == 0
+
+ def get_win_state(self):
+ for player in [-1, 1]:
+ player_pieces = self.np_pieces == -player
+ # Check rows & columns for win
+ if (self._is_straight_winner(player_pieces)
+ or self._is_straight_winner(player_pieces.transpose())
+ or self._is_diagonal_winner(player_pieces)):
+ return WinState(True, -player)
+
+        # No valid moves left: the game ends in a draw.
+ if not self.get_valid_moves().any():
+ return WinState(True, None)
+
+ # Game is not ended yet.
+ return WinState(False, None)
+
+ def with_np_pieces(self, np_pieces):
+ """Create copy of board with specified pieces."""
+ if np_pieces is None:
+ np_pieces = self.np_pieces
+ return Board(self.height, self.width, self.win_length, np_pieces)
+
+ def _is_diagonal_winner(self, player_pieces):
+ """Checks if player_pieces contains a diagonal win."""
+ win_length = self.win_length
+ for i in range(len(player_pieces) - win_length + 1):
+ for j in range(len(player_pieces[0]) - win_length + 1):
+ if all(player_pieces[i + x][j + x] for x in range(win_length)):
+ return True
+ for j in range(win_length - 1, len(player_pieces[0])):
+ if all(player_pieces[i + x][j - x] for x in range(win_length)):
+ return True
+ return False
+
+ def _is_straight_winner(self, player_pieces):
+ """Checks if player_pieces contains a vertical or horizontal win."""
+ run_lengths = [
+ player_pieces[:, i:i + self.win_length].sum(axis=1)
+ for i in range(len(player_pieces) - self.win_length + 2)
+ ]
+ return max([x.max() for x in run_lengths]) >= self.win_length
+
+ def __str__(self):
+ return str(self.np_pieces)
+
+
+class Connect4Game(object):
+ """
+ Connect4 Game class implementing the alpha-zero-general Game interface.
+
+ Use 1 for player1 and -1 for player2.
+ """
+
+ def __init__(self,
+ height=None,
+ width=None,
+ win_length=None,
+ np_pieces=None):
+ self._base_board = Board(height, width, win_length, np_pieces)
+
+ def getInitBoard(self):
+ """
+ Returns:
+ startBoard: a representation of the board (ideally this is the form
+ that will be the input to your neural network)
+ """
+ return self._base_board.np_pieces
+
+ def getBoardSize(self):
+ """
+ Returns:
+ (x,y): a tuple of board dimensions
+ """
+ return (self._base_board.height, self._base_board.width)
+
+ def getActionSize(self):
+ """
+ Returns:
+ actionSize: number of all possible actions
+ """
+ return self._base_board.width
+
+ def getNextState(self, board, player, action):
+ """Returns a copy of the board with updated move, original board is unmodified.
+
+ Input:
+ board: current board
+ player: current player (1 or -1)
+ action: action taken by current player
+
+ Returns:
+ nextBoard: board after applying action
+ nextPlayer: player who plays in the next turn (should be -player)
+
+ """
+ b = self._base_board.with_np_pieces(np_pieces=np.copy(board))
+ b.add_stone(action, player)
+ return b.np_pieces, -player
+
+ def getValidMoves(self, board, player):
+ """Any zero value in top row in a valid move.
+
+ Input:
+ board: current board
+ player: current player
+
+ Returns:
+ validMoves: a binary vector of length self.getActionSize(), 1 for
+ moves that are valid from the current board and player,
+ 0 for invalid moves
+ """
+ return self._base_board.with_np_pieces(
+ np_pieces=board).get_valid_moves()
+
+ def getGameEnded(self, board, player):
+ """
+ Input:
+ board: current board
+ player: current player (1 or -1)
+
+ Returns:
+ r: 0 if game has not ended. 1 if player won, -1 if player lost,
+ small non-zero value for draw.
+
+ """
+ b = self._base_board.with_np_pieces(np_pieces=board)
+ winstate = b.get_win_state()
+ if winstate.is_ended:
+ if winstate.winner is None:
+ # draw has very little value.
+ return 1e-4
+ elif winstate.winner == player:
+ return +1
+ elif winstate.winner == -player:
+ return -1
+ else:
+ raise ValueError('Unexpected winstate found: ', winstate)
+ else:
+ # 0 used to represent unfinished game.
+ return 0
+
+ def getCanonicalForm(self, board, player):
+ """
+ Input:
+ board: current board
+ player: current player (1 or -1)
+
+ Returns:
+ canonicalBoard: returns canonical form of board. The canonical form
+ should be independent of player. For e.g. in chess,
+ the canonical form can be chosen to be from the pov
+ of white. When the player is white, we can return
+ board as is. When the player is black, we can invert
+ the colors and return the board.
+ """
+ return board * player
+
+ def getSymmetries(self, board, pi):
+ """Board is left/right board symmetric
+
+ Input:
+ board: current board
+ pi: policy vector of size self.getActionSize()
+
+ Returns:
+ symmForms: a list of [(board,pi)] where each tuple is a symmetrical
+ form of the board and the corresponding pi vector. This
+ is used when training the neural network from examples.
+ """
+ return [(board, pi),
+ (np.array(board[:, ::-1], copy=True),
+ np.array(pi[::-1], copy=True))]
+
+ def stringRepresentation(self, board):
+ """
+ Input:
+ board: current board
+
+ Returns:
+ boardString: a quick conversion of board to a string format.
+ Required by MCTS for hashing.
+ """
+ return board.tostring()
+
+ @staticmethod
+ def display(board):
+ print(" -----------------------")
+ print(' '.join(map(str, range(len(board[0])))))
+ print(board)
+ print(" -----------------------")
diff --git a/benchmark/torch/AlphaZero/connect4_model.py b/benchmark/torch/AlphaZero/connect4_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..6c0f7705bfc40d1645d77c79ac7e47f1f721a317
--- /dev/null
+++ b/benchmark/torch/AlphaZero/connect4_model.py
@@ -0,0 +1,86 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import parl
+
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+
+class Connect4Model(parl.Model):
+ def __init__(self, game, args):
+ # game params
+ self.board_x, self.board_y = game.getBoardSize()
+ self.action_size = game.getActionSize()
+ self.args = args
+
+ super(Connect4Model, self).__init__()
+ self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1)
+ self.conv2 = nn.Conv2d(
+ args.num_channels, args.num_channels, 3, stride=1, padding=1)
+ self.conv3 = nn.Conv2d(
+ args.num_channels, args.num_channels, 3, stride=1)
+ self.conv4 = nn.Conv2d(
+ args.num_channels, args.num_channels, 3, stride=1)
+
+ self.bn1 = nn.BatchNorm2d(args.num_channels)
+ self.bn2 = nn.BatchNorm2d(args.num_channels)
+ self.bn3 = nn.BatchNorm2d(args.num_channels)
+ self.bn4 = nn.BatchNorm2d(args.num_channels)
+
+ self.fc1 = nn.Linear(
+ args.num_channels * (self.board_x - 4) * (self.board_y - 4), 128)
+ self.fc_bn1 = nn.BatchNorm1d(128)
+
+ self.fc2 = nn.Linear(128, 64)
+ self.fc_bn2 = nn.BatchNorm1d(64)
+
+ self.fc3 = nn.Linear(64, self.action_size)
+
+ self.fc4 = nn.Linear(64, 1)
+
+ def forward(self, s):
+ """
+ Args:
+ s(torch.Tensor): batch_size x board_x x board_y
+ """
+ # batch_size x 1 x board_x x board_y
+ s = s.view(-1, 1, self.board_x, self.board_y)
+ # batch_size x num_channels x board_x x board_y
+ s = F.relu(self.bn1(self.conv1(s)))
+ # batch_size x num_channels x board_x x board_y
+ s = F.relu(self.bn2(self.conv2(s)))
+ # batch_size x num_channels x (board_x-2) x (board_y-2)
+ s = F.relu(self.bn3(self.conv3(s)))
+ # batch_size x num_channels x (board_x-4) x (board_y-4)
+ s = F.relu(self.bn4(self.conv4(s)))
+ s = s.view(
+ -1,
+ self.args.num_channels * (self.board_x - 4) * (self.board_y - 4))
+
+ s = F.dropout(
+ F.relu(self.fc_bn1(self.fc1(s))),
+ p=self.args.dropout,
+ training=self.training) # batch_size x 128
+ s = F.dropout(
+ F.relu(self.fc_bn2(self.fc2(s))),
+ p=self.args.dropout,
+ training=self.training) # batch_size x 64
+
+ pi = self.fc3(s) # batch_size x action_size
+ v = self.fc4(s) # batch_size x 1
+
+ return F.log_softmax(pi, dim=1), torch.tanh(v)
diff --git a/parl/framework/model_base.py b/benchmark/torch/AlphaZero/gen_submission.py
similarity index 50%
rename from parl/framework/model_base.py
rename to benchmark/torch/AlphaZero/gen_submission.py
index e4057a7706c2e26e66db340128679919290cb1bd..03728ec2cda4f155229ba7b4d18c7f2a22734e05 100644
--- a/parl/framework/model_base.py
+++ b/benchmark/torch/AlphaZero/gen_submission.py
@@ -12,13 +12,29 @@
# See the License for the specific language governing permissions and
# limitations under the License.
-import warnings
+import sys
+import base64
+import inspect
+import os
-warnings.simplefilter('default')
+assert len(sys.argv) == 2, "please specify model path."
+model_path = sys.argv[1]
-warnings.warn(
- "module `parl.framework.model_base.Model` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.Model` instead.",
- DeprecationWarning,
- stacklevel=2)
+with open(model_path, 'rb') as f:
+ raw_bytes = f.read()
+ encoded_weights = base64.encodebytes(raw_bytes)
-from parl.core.fluid.model import *
+# embed the base64-encoded weights (as a byte string) into the submission file
+submission_file = """
+import base64
+decoded = base64.b64decode({})
+
+""".format(encoded_weights)
+
+# insert code snippet of loading weights
+with open('submission_template.py', 'r') as f:
+ submission_file += ''.join(f.readlines())
+
+# generate final submission file
+with open('submission.py', 'w') as f:
+ f.write(submission_file)
diff --git a/benchmark/torch/AlphaZero/main.py b/benchmark/torch/AlphaZero/main.py
new file mode 100644
index 0000000000000000000000000000000000000000..433e2ff0efb35e6a39df53a845a25a8110b20993
--- /dev/null
+++ b/benchmark/torch/AlphaZero/main.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from Coach import Coach
+from connect4_game import Connect4Game
+from utils import *
+
+from parl.utils import logger
+
+args = dotdict({
+ # master address of xparl cluster
+ 'master_address': 'localhost:8010',
+ # number of remote actors (execute tasks [self-play/pitting/evaluate_test_dataset] in parallel).
+ 'actors_num': 25,
+
+    # total number of iterations
+ 'numIters': 200,
+ # Number of complete self-play games to simulate during a new iteration.
+ 'numEps': 500,
+ # Number of games to play during arena (pitting) play to determine if new neural network will be accepted.
+ 'arenaCompare': 50,
+ # Number of games moves for MCTS to simulate.
+ 'numMCTSSims': 800,
+ # temp=1 (Temperature, τ (tau)) if episodeStep < tempThresholdStep, and thereafter uses temp=0.
+ 'tempThresholdStep': 15,
+ # During arena playoff, new neural net will be accepted if threshold or more of games are won.
+ 'updateThreshold': 0.6,
+ # CPUCT parameter
+ 'cpuct': 4,
+ # alpha parameter of dirichlet noise which is added to the policy (pi)
+ 'dirichletAlpha': 1.0,
+ # history of examples from numItersForTrainExamplesHistory latest iterations (training data)
+ 'numItersForTrainExamplesHistory': 20,
+
+ # folder to save model and training examples
+ 'checkpoint': './saved_model/',
+ # whether to load saved model and training examples
+ 'load_model': False,
+ 'load_folder_file': ('./saved_model', 'checkpoint_1.pth.tar'),
+})
+
+# Plays arenaCompare games in which player1 starts arenaCompare/2 games and player2 starts arenaCompare/2 games.
+assert args.arenaCompare % 2 == 0
+
+# make sure the tasks can be split evenly among different remote actors
+assert args.numEps % args.actors_num == 0
+assert (args.arenaCompare // 2) % args.actors_num == 0
+assert 1000 % args.actors_num == 0  # there are 1000 board states in the test dataset
+
+
+def main():
+ game = Connect4Game()
+
+ c = Coach(game, args)
+
+ if args.load_model:
+ logger.info('Loading checkpoint {}...'.format(args.load_folder_file))
+ c.loadModel()
+ logger.info("Loading 'trainExamples' from file {}...".format(
+ args.load_folder_file))
+ c.loadTrainExamples()
+
+ c.learn()
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmark/torch/AlphaZero/submission_template.py b/benchmark/torch/AlphaZero/submission_template.py
new file mode 100644
index 0000000000000000000000000000000000000000..d9ba9e7eb85b0815403d98ae015c80f07f068334
--- /dev/null
+++ b/benchmark/torch/AlphaZero/submission_template.py
@@ -0,0 +1,559 @@
+# Third party code
+#
+# The following code are copied or modified from:
+# https://github.com/suragnair/alpha-zero-general
+
+import os
+os.environ['OMP_NUM_THREADS'] = "1"
+
+
+# ===== utils.py =====
+class dotdict(dict):
+ def __getattr__(self, name):
+ return self[name]
+
+
+# ===== MCTS.py ======
+import math
+import time
+import numpy as np
+
+EPS = 1e-8
+
+
+class MCTS():
+ """
+ This class handles the MCTS tree.
+ """
+
+ def __init__(self, game, nn_agent, args, dirichlet_noise=False):
+ self.game = game
+ self.nn_agent = nn_agent
+ self.args = args
+ self.dirichlet_noise = dirichlet_noise
+ self.Qsa = {} # stores Q values for s,a (as defined in the paper)
+ self.Nsa = {} # stores #times edge s,a was visited
+ self.Ns = {} # stores #times board s was visited
+ self.Ps = {} # stores initial policy (returned by neural net)
+
+        self.Es = {}  # stores game.getGameEnded results for board s
+ self.Vs = {} # stores game.getValidMoves for board s
+
+ def getActionProb(self, canonicalBoard, temp=1, timelimit=4.9):
+ """
+        This function runs MCTS simulations starting from canonicalBoard until
+        the wall-clock time limit (`timelimit` seconds) is reached.
+
+ Returns:
+ probs: a policy vector where the probability of the ith action is
+ proportional to Nsa[(s,a)]**(1./temp)
+ """
+ dir_noise = self.dirichlet_noise
+ start_time = time.time()
+ while time.time() - start_time < timelimit:
+ self.search(canonicalBoard, dirichlet_noise=dir_noise)
+
+ s = self.game.stringRepresentation(canonicalBoard)
+ counts = [
+ self.Nsa[(s, a)] if (s, a) in self.Nsa else 0
+ for a in range(self.game.getActionSize())
+ ]
+
+ if temp == 0:
+ bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten()
+ bestA = np.random.choice(bestAs)
+ probs = [0] * len(counts)
+ probs[bestA] = 1
+ return probs
+
+ counts = [x**(1. / temp) for x in counts]
+ counts_sum = float(sum(counts))
+ probs = [x / counts_sum for x in counts]
+ return probs
+
+ def search(self, canonicalBoard, dirichlet_noise=False):
+ """
+ This function performs one iteration of MCTS. It is recursively called
+ till a leaf node is found. The action chosen at each node is one that
+ has the maximum upper confidence bound as in the paper.
+
+ Once a leaf node is found, the neural network is called to return an
+ initial policy P and a value v for the state. This value is propagated
+ up the search path. In case the leaf node is a terminal state, the
+ outcome is propagated up the search path. The values of Ns, Nsa, Qsa are
+ updated.
+
+ NOTE: the return values are the negative of the value of the current
+ state. This is done since v is in [-1,1] and if v is the value of a
+ state for the current player, then its value is -v for the other player.
+
+ Returns:
+ v: the negative of the value of the current canonicalBoard
+ """
+
+ s = self.game.stringRepresentation(canonicalBoard)
+
+ if s not in self.Es:
+ self.Es[s] = self.game.getGameEnded(canonicalBoard, 1)
+ if self.Es[s] != 0:
+ # terminal node
+ return -self.Es[s]
+
+ if s not in self.Ps:
+ # leaf node
+ self.Ps[s], v = self.nn_agent.predict(canonicalBoard)
+
+ valids = self.game.getValidMoves(canonicalBoard, 1)
+ self.Ps[s] = self.Ps[s] * valids # masking invalid moves
+ if dirichlet_noise:
+ self.applyDirNoise(s, valids)
+ sum_Ps_s = np.sum(self.Ps[s])
+ if sum_Ps_s > 0:
+ self.Ps[s] /= sum_Ps_s # renormalize
+ else:
+ # if all valid moves were masked make all valid moves equally probable
+
+                # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've got overfitting or something else.
+ # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
+ print("All valid moves were masked, doing a workaround.")
+ self.Ps[s] = self.Ps[s] + valids
+ self.Ps[s] /= np.sum(self.Ps[s])
+
+ self.Vs[s] = valids
+ self.Ns[s] = 0
+ return -v
+
+ valids = self.Vs[s]
+ if dirichlet_noise:
+ self.applyDirNoise(s, valids)
+ sum_Ps_s = np.sum(self.Ps[s])
+ self.Ps[s] /= sum_Ps_s # renormalize
+ cur_best = -float('inf')
+ best_act = -1
+
+ # pick the action with the highest upper confidence bound
+ for a in range(self.game.getActionSize()):
+ if valids[a]:
+ if (s, a) in self.Qsa:
+ u = self.Qsa[
+ (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt(
+ self.Ns[s]) / (1 + self.Nsa[(s, a)])
+ else:
+ u = self.args.cpuct * self.Ps[s][a] * math.sqrt(
+ self.Ns[s] + EPS) # Q = 0 ?
+
+ if u > cur_best:
+ cur_best = u
+ best_act = a
+
+ a = best_act
+ next_s, next_player = self.game.getNextState(canonicalBoard, 1, a)
+ next_s = self.game.getCanonicalForm(next_s, next_player)
+
+ v = self.search(next_s)
+
+ if (s, a) in self.Qsa:
+ self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[
+ (s, a)] + v) / (self.Nsa[(s, a)] + 1)
+ self.Nsa[(s, a)] += 1
+
+ else:
+ self.Qsa[(s, a)] = v
+ self.Nsa[(s, a)] = 1
+
+ self.Ns[s] += 1
+ return -v
+
+ def applyDirNoise(self, s, valids):
+ dir_values = np.random.dirichlet(
+ [self.args.dirichletAlpha] * np.count_nonzero(valids))
+ dir_idx = 0
+ for idx in range(len(self.Ps[s])):
+ if self.Ps[s][idx]:
+ self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + (
+ 0.25 * dir_values[dir_idx])
+ dir_idx += 1
+
+
+# ===== connect4_game.py ======
+import numpy as np
+from collections import namedtuple
+
+DEFAULT_HEIGHT = 6
+DEFAULT_WIDTH = 7
+DEFAULT_WIN_LENGTH = 4
+
+WinState = namedtuple('WinState', 'is_ended winner')
+
+
+class Board():
+ """
+ Connect4 Board.
+ """
+
+ def __init__(self,
+ height=None,
+ width=None,
+ win_length=None,
+ np_pieces=None):
+ "Set up initial board configuration."
+ self.height = height or DEFAULT_HEIGHT
+ self.width = width or DEFAULT_WIDTH
+ self.win_length = win_length or DEFAULT_WIN_LENGTH
+
+ if np_pieces is None:
+ self.np_pieces = np.zeros([self.height, self.width], dtype=np.int)
+ else:
+ self.np_pieces = np_pieces
+ assert self.np_pieces.shape == (self.height, self.width)
+
+ def add_stone(self, column, player):
+ "Create copy of board containing new stone."
+ available_idx, = np.where(self.np_pieces[:, column] == 0)
+ if len(available_idx) == 0:
+ raise ValueError(
+ "Can't play column %s on board %s" % (column, self))
+
+ self.np_pieces[available_idx[-1]][column] = player
+
+ def get_valid_moves(self):
+ "Any zero value in top row in a valid move"
+ return self.np_pieces[0] == 0
+
+ def get_win_state(self):
+ for player in [-1, 1]:
+ player_pieces = self.np_pieces == -player
+ # Check rows & columns for win
+ if (self._is_straight_winner(player_pieces)
+ or self._is_straight_winner(player_pieces.transpose())
+ or self._is_diagonal_winner(player_pieces)):
+ return WinState(True, -player)
+
+        # no valid moves left and no winner: draw.
+ if not self.get_valid_moves().any():
+ return WinState(True, None)
+
+ # Game is not ended yet.
+ return WinState(False, None)
+
+ def with_np_pieces(self, np_pieces):
+ """Create copy of board with specified pieces."""
+ if np_pieces is None:
+ np_pieces = self.np_pieces
+ return Board(self.height, self.width, self.win_length, np_pieces)
+
+ def _is_diagonal_winner(self, player_pieces):
+ """Checks if player_pieces contains a diagonal win."""
+ win_length = self.win_length
+ for i in range(len(player_pieces) - win_length + 1):
+ for j in range(len(player_pieces[0]) - win_length + 1):
+ if all(player_pieces[i + x][j + x] for x in range(win_length)):
+ return True
+ for j in range(win_length - 1, len(player_pieces[0])):
+ if all(player_pieces[i + x][j - x] for x in range(win_length)):
+ return True
+ return False
+
+ def _is_straight_winner(self, player_pieces):
+ """Checks if player_pieces contains a vertical or horizontal win."""
+ run_lengths = [
+ player_pieces[:, i:i + self.win_length].sum(axis=1)
+ for i in range(len(player_pieces) - self.win_length + 2)
+ ]
+ return max([x.max() for x in run_lengths]) >= self.win_length
+
+ def __str__(self):
+ return str(self.np_pieces)
+
+
+class Connect4Game(object):
+ """
+ Connect4 Game class implementing the alpha-zero-general Game interface.
+
+ Use 1 for player1 and -1 for player2.
+ """
+
+ def __init__(self,
+ height=None,
+ width=None,
+ win_length=None,
+ np_pieces=None):
+ self._base_board = Board(height, width, win_length, np_pieces)
+
+ def getInitBoard(self):
+ """
+ Returns:
+ startBoard: a representation of the board (ideally this is the form
+ that will be the input to your neural network)
+ """
+ return self._base_board.np_pieces
+
+ def getBoardSize(self):
+ """
+ Returns:
+ (x,y): a tuple of board dimensions
+ """
+ return (self._base_board.height, self._base_board.width)
+
+ def getActionSize(self):
+ """
+ Returns:
+ actionSize: number of all possible actions
+ """
+ return self._base_board.width
+
+ def getNextState(self, board, player, action):
+ """Returns a copy of the board with updated move, original board is unmodified.
+
+ Input:
+ board: current board
+ player: current player (1 or -1)
+ action: action taken by current player
+
+ Returns:
+ nextBoard: board after applying action
+ nextPlayer: player who plays in the next turn (should be -player)
+
+ """
+ b = self._base_board.with_np_pieces(np_pieces=np.copy(board))
+ b.add_stone(action, player)
+ return b.np_pieces, -player
+
+ def getValidMoves(self, board, player):
+ """Any zero value in top row in a valid move.
+
+ Input:
+ board: current board
+ player: current player
+
+ Returns:
+ validMoves: a binary vector of length self.getActionSize(), 1 for
+ moves that are valid from the current board and player,
+ 0 for invalid moves
+ """
+ return self._base_board.with_np_pieces(
+ np_pieces=board).get_valid_moves()
+
+ def getGameEnded(self, board, player):
+ """
+ Input:
+ board: current board
+ player: current player (1 or -1)
+
+ Returns:
+ r: 0 if game has not ended. 1 if player won, -1 if player lost,
+ small non-zero value for draw.
+
+ """
+ b = self._base_board.with_np_pieces(np_pieces=board)
+ winstate = b.get_win_state()
+ if winstate.is_ended:
+ if winstate.winner is None:
+ # draw has very little value.
+ return 1e-4
+ elif winstate.winner == player:
+ return +1
+ elif winstate.winner == -player:
+ return -1
+ else:
+ raise ValueError('Unexpected winstate found: ', winstate)
+ else:
+ # 0 used to represent unfinished game.
+ return 0
+
+ def getCanonicalForm(self, board, player):
+ """
+ Input:
+ board: current board
+ player: current player (1 or -1)
+
+ Returns:
+ canonicalBoard: returns canonical form of board. The canonical form
+ should be independent of player. For e.g. in chess,
+ the canonical form can be chosen to be from the pov
+ of white. When the player is white, we can return
+ board as is. When the player is black, we can invert
+ the colors and return the board.
+ """
+ return board * player
+
+ def getSymmetries(self, board, pi):
+ """Board is left/right board symmetric
+
+ Input:
+ board: current board
+ pi: policy vector of size self.getActionSize()
+
+ Returns:
+ symmForms: a list of [(board,pi)] where each tuple is a symmetrical
+ form of the board and the corresponding pi vector. This
+ is used when training the neural network from examples.
+ """
+ return [(board, pi),
+ (np.array(board[:, ::-1], copy=True),
+ np.array(pi[::-1], copy=True))]
+
+ def stringRepresentation(self, board):
+ """
+ Input:
+ board: current board
+
+ Returns:
+ boardString: a quick conversion of board to a string format.
+ Required by MCTS for hashing.
+ """
+ return board.tostring()
+
+ @staticmethod
+ def display(board):
+ print(" -----------------------")
+ print(' '.join(map(str, range(len(board[0])))))
+ print(board)
+ print(" -----------------------")
+
+
+# ===== connect4_model ======
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+
+#class Connect4Model(parl.Model): # Kaggle doesn't support parl package
+class Connect4Model(nn.Module):
+ def __init__(self, game, args):
+ # game params
+ self.board_x, self.board_y = game.getBoardSize()
+ self.action_size = game.getActionSize()
+ self.args = args
+
+ super(Connect4Model, self).__init__()
+ self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1)
+ self.conv2 = nn.Conv2d(
+ args.num_channels, args.num_channels, 3, stride=1, padding=1)
+ self.conv3 = nn.Conv2d(
+ args.num_channels, args.num_channels, 3, stride=1)
+ self.conv4 = nn.Conv2d(
+ args.num_channels, args.num_channels, 3, stride=1)
+
+ self.bn1 = nn.BatchNorm2d(args.num_channels)
+ self.bn2 = nn.BatchNorm2d(args.num_channels)
+ self.bn3 = nn.BatchNorm2d(args.num_channels)
+ self.bn4 = nn.BatchNorm2d(args.num_channels)
+
+ self.fc1 = nn.Linear(
+ args.num_channels * (self.board_x - 4) * (self.board_y - 4), 128)
+ self.fc_bn1 = nn.BatchNorm1d(128)
+
+ self.fc2 = nn.Linear(128, 64)
+ self.fc_bn2 = nn.BatchNorm1d(64)
+
+ self.fc3 = nn.Linear(64, self.action_size)
+
+ self.fc4 = nn.Linear(64, 1)
+
+ def forward(self, s):
+ # s: batch_size x board_x x board_y
+ s = s.view(-1, 1, self.board_x,
+ self.board_y) # batch_size x 1 x board_x x board_y
+ s = F.relu(self.bn1(
+ self.conv1(s))) # batch_size x num_channels x board_x x board_y
+ s = F.relu(self.bn2(
+ self.conv2(s))) # batch_size x num_channels x board_x x board_y
+ s = F.relu(self.bn3(self.conv3(
+ s))) # batch_size x num_channels x (board_x-2) x (board_y-2)
+ s = F.relu(self.bn4(self.conv4(
+ s))) # batch_size x num_channels x (board_x-4) x (board_y-4)
+ s = s.view(
+ -1,
+ self.args.num_channels * (self.board_x - 4) * (self.board_y - 4))
+
+ s = F.dropout(
+ F.relu(self.fc_bn1(self.fc1(s))),
+ p=self.args.dropout,
+ training=self.training) # batch_size x 128
+ s = F.dropout(
+ F.relu(self.fc_bn2(self.fc2(s))),
+ p=self.args.dropout,
+ training=self.training) # batch_size x 64
+
+ pi = self.fc3(s) # batch_size x action_size
+ v = self.fc4(s) # batch_size x 1
+
+ return F.log_softmax(pi, dim=1), torch.tanh(v)
+
+
+# ===== simple agent ======
+args = dotdict({
+ 'dropout': 0.3,
+ 'num_channels': 64,
+})
+
+
+class SimpleAgent():
+ def __init__(self, game, cuda=True):
+ self.cuda = cuda and torch.cuda.is_available()
+ self.model = Connect4Model(game, args)
+ if self.cuda:
+ self.model.cuda()
+
+ self.board_x, self.board_y = game.getBoardSize()
+ self.action_size = game.getActionSize()
+
+ def predict(self, board):
+ """
+ Args:
+ board (np.array): input board
+
+ Return:
+ pi (np.array): probability of actions
+ v (np.array): estimated value of input
+ """
+ # preparing input
+ board = torch.FloatTensor(board.astype(np.float64))
+ if self.cuda:
+ board = board.contiguous().cuda()
+ board = board.view(1, self.board_x, self.board_y)
+
+ self.model.eval() # eval mode
+
+ with torch.no_grad():
+ log_pi, v = self.model(board)
+
+ pi = torch.exp(log_pi)
+
+ return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0]
+
+ def load_checkpoint(self, buffer):
+ map_location = None if self.cuda else 'cpu'
+ checkpoint = torch.load(buffer, map_location=map_location)
+ self.model.load_state_dict(checkpoint)
+
+
+# ===== predict function ======
+import base64
+import io
+
+game = Connect4Game()
+
+# AlphaZero players
+agent = SimpleAgent(game)
+buffer = io.BytesIO(decoded)
+agent.load_checkpoint(buffer)
+mcts_args = dotdict({'numMCTSSims': 800, 'cpuct': 1.0})
+mcts = MCTS(game, agent, mcts_args)
+
+
+def alphazero_agent(obs, config):
+ board = np.reshape(obs.board.copy(), game.getBoardSize()).astype(int)
+ board[np.where(board == 2)] = -1
+
+ player = 1
+ if obs.mark == 2:
+ player = -1
+
+ x = game.getCanonicalForm(board, player)
+
+ action = np.argmax(
+ mcts.getActionProb(x, temp=0, timelimit=config.timeout - 0.5))
+ return int(action)
diff --git a/benchmark/torch/AlphaZero/utils.py b/benchmark/torch/AlphaZero/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..5ae500cdae19f002538c563b6cbae725c7b0d9af
--- /dev/null
+++ b/benchmark/torch/AlphaZero/utils.py
@@ -0,0 +1,65 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+
+class dotdict(dict):
+ def __getattr__(self, name):
+ try:
+ return self[name]
+ except KeyError:
+ raise AttributeError(name)
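+
+# Example: dotdict lets configuration entries be read as attributes, e.g.
+#   cfg = dotdict({'lr': 1e-3}); cfg.lr -> 1e-3; cfg.unknown raises AttributeError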
+
+
+def win_loss_draw(score):
+ if score > 0:
+ return 'win'
+ if score < 0:
+ return 'loss'
+ return 'draw'
+
+
+"""
+split one list to multiple lists
+"""
+split_group = lambda the_list, group_size: zip(*(iter(the_list), ) * group_size)
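+# e.g. list(split_group([1, 2, 3, 4, 5, 6], 2)) == [(1, 2), (3, 4), (5, 6)];
+# any trailing elements that do not fill a complete group are dropped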
+
+import numpy as np
+import json
+from connect4_game import Connect4Game
+
+
+def get_test_dataset():
+ game = Connect4Game()
+ test_dataset = []
+ with open("refmoves1k_kaggle") as f:
+ for line in f:
+ data = json.loads(line)
+
+ board = data["board"]
+ board = np.reshape(board, game.getBoardSize()).astype(int)
+ board[np.where(board == 2)] = -1
+
+ # find out how many moves are played to set the correct mark.
+ ply = len([x for x in data["board"] if x > 0])
+ if ply & 1:
+ player = -1
+ else:
+ player = 1
+
+ test_dataset.append({
+ 'board': board,
+ 'player': player,
+ 'move_score': data['move score'],
+ })
+ return test_dataset
diff --git a/benchmark/torch/a2c/train.py b/benchmark/torch/a2c/train.py
index f2985367f8304edb6bccc93f894a7d04f5f305c8..9a498023988bc72a0a0aa43d4850c25ced8d2856 100644
--- a/benchmark/torch/a2c/train.py
+++ b/benchmark/torch/a2c/train.py
@@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind
from parl.utils.window_stat import WindowStat
from parl.utils.time_stat import TimeStat
from parl.utils import machine_info
-from parl.utils import logger, get_gpu_count, tensorboard
+from parl.utils import logger, get_gpu_count, summary
from parl.algorithms import A2C
from atari_model import ActorCritic
@@ -205,19 +205,19 @@ class Learner(object):
}
if metric['mean_episode_rewards'] is not None:
- tensorboard.add_scalar('train/mean_reward',
- metric['mean_episode_rewards'],
- self.sample_total_steps)
- tensorboard.add_scalar('train/total_loss', metric['total_loss'],
- self.sample_total_steps)
- tensorboard.add_scalar('train/pi_loss', metric['pi_loss'],
- self.sample_total_steps)
- tensorboard.add_scalar('train/vf_loss', metric['vf_loss'],
- self.sample_total_steps)
- tensorboard.add_scalar('train/entropy', metric['entropy'],
- self.sample_total_steps)
- tensorboard.add_scalar('train/learn_rate', metric['lr'],
- self.sample_total_steps)
+ summary.add_scalar('train/mean_reward',
+ metric['mean_episode_rewards'],
+ self.sample_total_steps)
+ summary.add_scalar('train/total_loss', metric['total_loss'],
+ self.sample_total_steps)
+ summary.add_scalar('train/pi_loss', metric['pi_loss'],
+ self.sample_total_steps)
+ summary.add_scalar('train/vf_loss', metric['vf_loss'],
+ self.sample_total_steps)
+ summary.add_scalar('train/entropy', metric['entropy'],
+ self.sample_total_steps)
+ summary.add_scalar('train/learn_rate', metric['lr'],
+ self.sample_total_steps)
logger.info(metric)
diff --git a/benchmark/torch/dqn/replay_memory.py b/benchmark/torch/dqn/replay_memory.py
index ea8c6565155ddacae568e901566f9b390ee3a8b8..25ef0e50a7e53fd68d19b6ac3bffb807b18d6d29 100644
--- a/benchmark/torch/dqn/replay_memory.py
+++ b/benchmark/torch/dqn/replay_memory.py
@@ -16,16 +16,16 @@ import numpy as np
import copy
from collections import deque, namedtuple
-Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver'])
+Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver'])
class ReplayMemory(object):
- def __init__(self, max_size, state_shape, context_len):
+ def __init__(self, max_size, obs_shape, context_len):
self.max_size = int(max_size)
- self.state_shape = state_shape
+ self.obs_shape = obs_shape
self.context_len = int(context_len)
- self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8')
+ self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8')
self.action = np.zeros((self.max_size, ), dtype='int32')
self.reward = np.zeros((self.max_size, ), dtype='float32')
self.isOver = np.zeros((self.max_size, ), dtype='bool')
@@ -48,42 +48,41 @@ class ReplayMemory(object):
else:
self._context.append(exp)
- def recent_state(self):
- """ maintain recent state for training"""
+ def recent_obs(self):
+ """ maintain recent obs for training"""
lst = list(self._context)
- states = [np.zeros(self.state_shape, dtype='uint8')] * \
+ obs = [np.zeros(self.obs_shape, dtype='uint8')] * \
(self._context.maxlen - len(lst))
- states.extend([k.state for k in lst])
- return states
+ obs.extend([k.obs for k in lst])
+ return obs
def sample(self, idx):
- """ return state, action, reward, isOver,
- note that some frames in state may be generated from last episode,
- they should be removed from state
+ """ return obs, action, reward, isOver,
+ note that some frames in obs may be generated from last episode,
+ they should be removed from obs
"""
- state = np.zeros(
- (self.context_len + 1, ) + self.state_shape, dtype=np.uint8)
- state_idx = np.arange(idx,
- idx + self.context_len + 1) % self._curr_size
+ obs = np.zeros(
+ (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8)
+ obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size
# confirm that no frame was generated from last episode
has_last_episode = False
for k in range(self.context_len - 2, -1, -1):
- to_check_idx = state_idx[k]
+ to_check_idx = obs_idx[k]
if self.isOver[to_check_idx]:
has_last_episode = True
- state_idx = state_idx[k + 1:]
- state[k + 1:] = self.state[state_idx]
+ obs_idx = obs_idx[k + 1:]
+ obs[k + 1:] = self.obs[obs_idx]
break
if not has_last_episode:
- state = self.state[state_idx]
+ obs = self.obs[obs_idx]
real_idx = (idx + self.context_len - 1) % self._curr_size
action = self.action[real_idx]
reward = self.reward[real_idx]
isOver = self.isOver[real_idx]
- return state, reward, action, isOver
+ return obs, reward, action, isOver
def __len__(self):
return self._curr_size
@@ -92,7 +91,7 @@ class ReplayMemory(object):
return self._curr_size
def _assign(self, pos, exp):
- self.state[pos] = exp.state
+ self.obs[pos] = exp.obs
self.reward[pos] = exp.reward
self.action[pos] = exp.action
self.isOver[pos] = exp.isOver
@@ -107,8 +106,8 @@ class ReplayMemory(object):
return self._process_batch(batch_exp)
def _process_batch(self, batch_exp):
- state = np.asarray([e[0] for e in batch_exp], dtype='uint8')
+ obs = np.asarray([e[0] for e in batch_exp], dtype='uint8')
reward = np.asarray([e[1] for e in batch_exp], dtype='float32')
action = np.asarray([e[2] for e in batch_exp], dtype='int8')
isOver = np.asarray([e[3] for e in batch_exp], dtype='bool')
- return [state, action, reward, isOver]
+ return [obs, action, reward, isOver]
diff --git a/benchmark/torch/dqn/train.py b/benchmark/torch/dqn/train.py
index 9db3b8f776fa669772bb2748cbfed0a7067f5909..ba64b95c93a9b4879621331ad30cce3cbcbcac16 100644
--- a/benchmark/torch/dqn/train.py
+++ b/benchmark/torch/dqn/train.py
@@ -22,11 +22,11 @@ import parl
import numpy as np
from tqdm import tqdm
-from parl.utils import tensorboard, logger
+from parl.utils import summary, logger
from parl.algorithms import DQN, DDQN
from agent import AtariAgent
-from atari_wrapper import FireResetEnv, FrameStack, LimitLength, MapState
+from atari_wrapper import FireResetEnv, FrameStack, LimitLength
from model import AtariModel
from replay_memory import ReplayMemory, Experience
from utils import get_player
@@ -43,57 +43,57 @@ GAMMA = 0.99
def run_train_episode(env, agent, rpm):
total_reward = 0
all_cost = []
- state = env.reset()
+ obs = env.reset()
steps = 0
while True:
steps += 1
- context = rpm.recent_state()
- context.append(state)
+ context = rpm.recent_obs()
+ context.append(obs)
context = np.stack(context, axis=0)
action = agent.sample(context)
- next_state, reward, isOver, _ = env.step(action)
- rpm.append(Experience(state, action, reward, isOver))
+ next_obs, reward, isOver, _ = env.step(action)
+ rpm.append(Experience(obs, action, reward, isOver))
if rpm.size() > MEMORY_WARMUP_SIZE:
if steps % UPDATE_FREQ == 0:
- batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
+ batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
args.batch_size)
- batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
- batch_next_state = batch_all_state[:, 1:, :, :]
- cost = agent.learn(batch_state, batch_action, batch_reward,
- batch_next_state, batch_isOver)
+ batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
+ batch_next_obs = batch_all_obs[:, 1:, :, :]
+ cost = agent.learn(batch_obs, batch_action, batch_reward,
+ batch_next_obs, batch_isOver)
all_cost.append(cost)
total_reward += reward
- state = next_state
+ obs = next_obs
if isOver:
mean_loss = np.mean(all_cost) if all_cost else None
return total_reward, steps, mean_loss
def run_evaluate_episode(env, agent):
- state = env.reset()
+ obs = env.reset()
total_reward = 0
while True:
- pred_Q = agent.predict(state)
+ pred_Q = agent.predict(obs)
action = pred_Q.max(1)[1].item()
- state, reward, isOver, _ = env.step(action)
+ obs, reward, isOver, _ = env.step(action)
total_reward += reward
if isOver:
return total_reward
-def get_fixed_states(rpm, batch_size):
- states = []
+def get_fixed_obs(rpm, batch_size):
+ obs = []
for _ in range(3):
- batch_all_state = rpm.sample_batch(batch_size)[0]
- batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
- states.append(batch_state)
- fixed_states = np.concatenate(states, axis=0)
- return fixed_states
+ batch_all_obs = rpm.sample_batch(batch_size)[0]
+ batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
+ obs.append(batch_obs)
+ fixed_obs = np.concatenate(obs, axis=0)
+ return fixed_obs
-def evaluate_fixed_Q(agent, states):
+def evaluate_fixed_Q(agent, obs):
with torch.no_grad():
- max_pred_Q = agent.alg.model(states).max(1)[0].mean()
+ max_pred_Q = agent.alg.model(obs).max(1)[0].mean()
return max_pred_Q.item()
@@ -131,9 +131,9 @@ def main():
total_reward, steps, _ = run_train_episode(env, agent, rpm)
pbar.update(steps)
- # Get fixed states to check value function.
- fixed_states = get_fixed_states(rpm, args.batch_size)
- fixed_states = torch.tensor(fixed_states, dtype=torch.float, device=device)
+ # Get fixed obs to check value function.
+ fixed_obs = get_fixed_obs(rpm, args.batch_size)
+ fixed_obs = torch.tensor(fixed_obs, dtype=torch.float, device=device)
# train
test_flag = 0
@@ -152,18 +152,17 @@ def main():
for _ in range(3):
eval_rewards.append(run_evaluate_episode(test_env, agent))
- tensorboard.add_scalar('dqn/eval', np.mean(eval_rewards),
- total_steps)
- tensorboard.add_scalar('dqn/score', total_reward, total_steps)
- tensorboard.add_scalar('dqn/loss', loss, total_steps)
- tensorboard.add_scalar('dqn/exploration', agent.exploration,
- total_steps)
- tensorboard.add_scalar('dqn/Q value',
- evaluate_fixed_Q(agent, fixed_states),
- total_steps)
- tensorboard.add_scalar('dqn/grad_norm',
- get_grad_norm(agent.alg.model),
- total_steps)
+ summary.add_scalar('dqn/eval', np.mean(eval_rewards),
+ total_steps)
+ summary.add_scalar('dqn/score', total_reward, total_steps)
+ summary.add_scalar('dqn/loss', loss, total_steps)
+ summary.add_scalar('dqn/exploration', agent.exploration,
+ total_steps)
+ summary.add_scalar('dqn/Q value',
+ evaluate_fixed_Q(agent, fixed_obs),
+ total_steps)
+ summary.add_scalar('dqn/grad_norm',
+ get_grad_norm(agent.alg.model), total_steps)
if __name__ == '__main__':
diff --git a/benchmark/torch/ppo/arguments.py b/benchmark/torch/ppo/arguments.py
new file mode 100644
index 0000000000000000000000000000000000000000..b7d5d33df54b4652a416f0f9bbb49c3d1bd4a522
--- /dev/null
+++ b/benchmark/torch/ppo/arguments.py
@@ -0,0 +1,103 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import argparse
+import torch
+
+
+def get_args():
+ parser = argparse.ArgumentParser(description='RL')
+ parser.add_argument(
+ '--lr', type=float, default=3e-4, help='learning rate (default: 3e-4)')
+ parser.add_argument(
+ '--eps',
+ type=float,
+ default=1e-5,
+ help='RMSprop optimizer epsilon (default: 1e-5)')
+ parser.add_argument(
+ '--gamma',
+ type=float,
+ default=0.99,
+ help='discount factor for rewards (default: 0.99)')
+ parser.add_argument(
+ '--gae-lambda',
+ type=float,
+ default=0.95,
+ help='gae lambda parameter (default: 0.95)')
+ parser.add_argument(
+ '--entropy-coef',
+ type=float,
+ default=0.,
+ help='entropy term coefficient (default: 0.)')
+ parser.add_argument(
+ '--value-loss-coef',
+ type=float,
+ default=0.5,
+ help='value loss coefficient (default: 0.5)')
+ parser.add_argument(
+ '--max-grad-norm',
+ type=float,
+ default=0.5,
+ help='max norm of gradients (default: 0.5)')
+ parser.add_argument(
+ '--seed', type=int, default=1, help='random seed (default: 1)')
+ parser.add_argument(
+ '--num-steps',
+ type=int,
+ default=2048,
+ help='number of maximum forward steps in ppo (default: 2048)')
+ parser.add_argument(
+ '--ppo-epoch',
+ type=int,
+ default=10,
+ help='number of ppo epochs (default: 10)')
+ parser.add_argument(
+ '--num-mini-batch',
+ type=int,
+ default=32,
+ help='number of batches for ppo (default: 32)')
+ parser.add_argument(
+ '--clip-param',
+ type=float,
+ default=0.2,
+ help='ppo clip parameter (default: 0.2)')
+ parser.add_argument(
+ '--log-interval',
+ type=int,
+ default=1,
+ help='log interval, one log per n updates (default: 1)')
+ parser.add_argument(
+ '--eval-interval',
+ type=int,
+ default=10,
+ help='eval interval, one eval per n updates (default: 10)')
+ parser.add_argument(
+ '--num-env-steps',
+ type=int,
+ default=10e5,
+ help='number of environment steps to train (default: 10e5)')
+ parser.add_argument(
+ '--env-name',
+ default='Hopper-v2',
+ help='environment to train on (default: Hopper-v2)')
+ parser.add_argument(
+ '--use-linear-lr-decay',
+ action='store_true',
+ default=False,
+ help='use a linear schedule on the learning rate')
+ args = parser.parse_args()
+
+ args.cuda = torch.cuda.is_available()
+
+ return args
diff --git a/benchmark/torch/ppo/evaluation.py b/benchmark/torch/ppo/evaluation.py
new file mode 100644
index 0000000000000000000000000000000000000000..8aa020ca66a0c3a97d8deea55e37dabc4cf7512b
--- /dev/null
+++ b/benchmark/torch/ppo/evaluation.py
@@ -0,0 +1,56 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+import torch
+
+import utils
+from wrapper import make_env
+
+
+def evaluate(agent, ob_rms, env_name, seed, device):
+    if seed is not None:
+ seed += 1
+ eval_envs = make_env(env_name, seed, None)
+ vec_norm = utils.get_vec_normalize(eval_envs)
+ if vec_norm is not None:
+ vec_norm.eval()
+ vec_norm.ob_rms = ob_rms
+
+ eval_episode_rewards = []
+
+ obs = eval_envs.reset()
+ eval_masks = torch.zeros(1, 1, device=device)
+
+ while len(eval_episode_rewards) < 10:
+ with torch.no_grad():
+ action = agent.predict(obs)
+
+        # Observe reward and next obs
+ obs, _, done, infos = eval_envs.step(action)
+
+ eval_masks = torch.tensor(
+ [[0.0] if done_ else [1.0] for done_ in done],
+ dtype=torch.float32,
+ device=device)
+
+ for info in infos:
+ if 'episode' in info.keys():
+ eval_episode_rewards.append(info['episode']['r'])
+
+ eval_envs.close()
+
+ print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
+ len(eval_episode_rewards), np.mean(eval_episode_rewards)))
+ return np.mean(eval_episode_rewards)
diff --git a/benchmark/torch/ppo/mujoco_agent.py b/benchmark/torch/ppo/mujoco_agent.py
new file mode 100644
index 0000000000000000000000000000000000000000..096f683f958829c0780ecc59d9ed144367c15f38
--- /dev/null
+++ b/benchmark/torch/ppo/mujoco_agent.py
@@ -0,0 +1,78 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import parl
+import torch
+
+
+class MujocoAgent(parl.Agent):
+ def __init__(self, algorithm, device):
+ self.alg = algorithm
+ self.device = device
+
+ def predict(self, obs):
+ obs = torch.from_numpy(obs).float().to(self.device)
+ action = self.alg.predict(obs)
+ return action.cpu().numpy()
+
+ def sample(self, obs):
+ obs = torch.from_numpy(obs).to(self.device)
+ value, action, action_log_probs = self.alg.sample(obs)
+ return value.cpu().numpy(), action.cpu().numpy(), \
+ action_log_probs.cpu().numpy()
+
+ def learn(self, next_value, gamma, gae_lambda, ppo_epoch, num_mini_batch,
+ rollouts):
+ value_loss_epoch = 0
+ action_loss_epoch = 0
+ dist_entropy_epoch = 0
+
+ for e in range(ppo_epoch):
+ data_generator = rollouts.sample_batch(next_value, gamma,
+ gae_lambda, num_mini_batch)
+
+ for sample in data_generator:
+ obs_batch, actions_batch, \
+ value_preds_batch, return_batch, old_action_log_probs_batch, \
+ adv_targ = sample
+
+                # move the sampled numpy batch onto the agent's device
+                obs_batch = torch.from_numpy(obs_batch).to(self.device)
+                actions_batch = torch.from_numpy(actions_batch).to(
+                    self.device)
+                value_preds_batch = torch.from_numpy(value_preds_batch).to(
+                    self.device)
+                return_batch = torch.from_numpy(return_batch).to(self.device)
+                old_action_log_probs_batch = torch.from_numpy(
+                    old_action_log_probs_batch).to(self.device)
+                adv_targ = torch.from_numpy(adv_targ).to(self.device)
+
+ value_loss, action_loss, dist_entropy = self.alg.learn(
+ obs_batch, actions_batch, value_preds_batch, return_batch,
+ old_action_log_probs_batch, adv_targ)
+
+ value_loss_epoch += value_loss
+ action_loss_epoch += action_loss
+ dist_entropy_epoch += dist_entropy
+
+ num_updates = ppo_epoch * num_mini_batch
+
+ value_loss_epoch /= num_updates
+ action_loss_epoch /= num_updates
+ dist_entropy_epoch /= num_updates
+
+ return value_loss_epoch, action_loss_epoch, dist_entropy_epoch
+
+ def value(self, obs):
+ obs = torch.from_numpy(obs).to(self.device)
+ return self.alg.value(obs).cpu().numpy()
diff --git a/benchmark/torch/ppo/mujoco_model.py b/benchmark/torch/ppo/mujoco_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..83b762da2bd5a922d2a20605df641b6aec0ad949
--- /dev/null
+++ b/benchmark/torch/ppo/mujoco_model.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import parl
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+from torch.distributions import Normal
+
+
+class MujocoModel(parl.Model):
+ def __init__(self, obs_dim, act_dim):
+ super(MujocoModel, self).__init__()
+ self.actor = Actor(obs_dim, act_dim)
+ self.critic = Critic(obs_dim)
+
+ def policy(self, obs):
+ return self.actor(obs)
+
+ def value(self, obs):
+ return self.critic(obs)
+
+
+class Actor(parl.Model):
+ def __init__(self, obs_dim, act_dim):
+ super(Actor, self).__init__()
+ self.fc1 = nn.Linear(obs_dim, 64)
+ self.fc2 = nn.Linear(64, 64)
+
+ self.fc_mean = nn.Linear(64, act_dim)
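+        # log standard deviation of the Gaussian policy, learned as a
+        # state-independent free parameter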
+ self.log_std = nn.Parameter(torch.zeros(act_dim))
+
+ def forward(self, obs):
+ x = torch.tanh(self.fc1(obs))
+ x = torch.tanh(self.fc2(x))
+
+ mean = self.fc_mean(x)
+ return mean, self.log_std
+
+
+class Critic(parl.Model):
+ def __init__(self, obs_dim):
+ super(Critic, self).__init__()
+ self.fc1 = nn.Linear(obs_dim, 64)
+ self.fc2 = nn.Linear(64, 64)
+ self.fc3 = nn.Linear(64, 1)
+
+ def forward(self, obs):
+ x = torch.tanh(self.fc1(obs))
+ x = torch.tanh(self.fc2(x))
+ value = self.fc3(x)
+
+ return value
diff --git a/benchmark/torch/ppo/storage.py b/benchmark/torch/ppo/storage.py
new file mode 100644
index 0000000000000000000000000000000000000000..b986b670d545fb88938785fc812a320103023d5d
--- /dev/null
+++ b/benchmark/torch/ppo/storage.py
@@ -0,0 +1,107 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import numpy as np
+
+from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
+
+
+class RolloutStorage(object):
+ def __init__(self, num_steps, obs_dim, act_dim):
+ self.num_steps = num_steps
+ self.obs_dim = obs_dim
+ self.act_dim = act_dim
+
+ self.obs = np.zeros((num_steps + 1, obs_dim), dtype='float32')
+ self.actions = np.zeros((num_steps, act_dim), dtype='float32')
+ self.value_preds = np.zeros((num_steps + 1, ), dtype='float32')
+ self.returns = np.zeros((num_steps + 1, ), dtype='float32')
+ self.action_log_probs = np.zeros((num_steps, ), dtype='float32')
+ self.rewards = np.zeros((num_steps, ), dtype='float32')
+
+ self.masks = np.ones((num_steps + 1, ), dtype='bool')
+ self.bad_masks = np.ones((num_steps + 1, ), dtype='bool')
+
+ self.step = 0
+
+ def append(self, obs, actions, action_log_probs, value_preds, rewards,
+ masks, bad_masks):
+ """
+ print("obs")
+ print(obs)
+ print("masks")
+ print(masks)
+ print("rewards")
+ print(rewards)
+ exit()
+ """
+ self.obs[self.step + 1] = obs
+ self.actions[self.step] = actions
+ self.rewards[self.step] = rewards
+ self.action_log_probs[self.step] = action_log_probs
+ self.value_preds[self.step] = value_preds
+ self.masks[self.step + 1] = masks
+ self.bad_masks[self.step + 1] = bad_masks
+
+ self.step = (self.step + 1) % self.num_steps
+
+ def sample_batch(self,
+ next_value,
+ gamma,
+ gae_lambda,
+ num_mini_batch,
+ mini_batch_size=None):
+ # calculate return and advantage first
+ self.compute_returns(next_value, gamma, gae_lambda)
+ advantages = self.returns[:-1] - self.value_preds[:-1]
+ advantages = (advantages - advantages.mean()) / (
+ advantages.std() + 1e-5)
+
+ # generate sample batch
+ mini_batch_size = self.num_steps // num_mini_batch
+ sampler = BatchSampler(
+ SubsetRandomSampler(range(self.num_steps)),
+ mini_batch_size,
+ drop_last=True)
+ for indices in sampler:
+ obs_batch = self.obs[:-1][indices]
+ actions_batch = self.actions[indices]
+ value_preds_batch = self.value_preds[:-1][indices]
+ returns_batch = self.returns[:-1][indices]
+ old_action_log_probs_batch = self.action_log_probs[indices]
+
+ value_preds_batch = value_preds_batch.reshape(-1, 1)
+ returns_batch = returns_batch.reshape(-1, 1)
+ old_action_log_probs_batch = old_action_log_probs_batch.reshape(
+ -1, 1)
+
+ adv_targ = advantages[indices]
+ adv_targ = adv_targ.reshape(-1, 1)
+
+ yield obs_batch, actions_batch, value_preds_batch, returns_batch, old_action_log_probs_batch, adv_targ
+
+ def after_update(self):
+ self.obs[0] = np.copy(self.obs[-1])
+ self.masks[0] = np.copy(self.masks[-1])
+ self.bad_masks[0] = np.copy(self.bad_masks[-1])
+
+ def compute_returns(self, next_value, gamma, gae_lambda):
+ self.value_preds[-1] = next_value
+ gae = 0
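+        # Generalized Advantage Estimation, computed backwards in time:
+        #   delta_t = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t)
+        #   A_t     = delta_t + gamma * lambda * mask_{t+1} * A_{t+1}
+        # returns[t] = A_t + V(s_t); bad_masks zeroes A_t at time-limit truncations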
+ for step in reversed(range(self.rewards.size)):
+ delta = self.rewards[step] + gamma * self.value_preds[
+ step + 1] * self.masks[step + 1] - self.value_preds[step]
+ gae = delta + gamma * gae_lambda * self.masks[step + 1] * gae
+ gae = gae * self.bad_masks[step + 1]
+ self.returns[step] = gae + self.value_preds[step]
diff --git a/benchmark/torch/ppo/train.py b/benchmark/torch/ppo/train.py
new file mode 100644
index 0000000000000000000000000000000000000000..6bb5dafbf4fbc6b96dc664030910446a7cfd46e1
--- /dev/null
+++ b/benchmark/torch/ppo/train.py
@@ -0,0 +1,128 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# modified from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail
+
+import copy
+import os
+from collections import deque
+
+import gym
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+import utils
+from arguments import get_args
+from wrapper import make_env
+from mujoco_model import MujocoModel
+from parl.algorithms import PPO
+from mujoco_agent import MujocoAgent
+from storage import RolloutStorage
+from evaluation import evaluate
+
+
+def main():
+ args = get_args()
+
+ torch.manual_seed(args.seed)
+ torch.cuda.manual_seed_all(args.seed)
+
+ torch.set_num_threads(1)
+ device = torch.device("cuda:0" if args.cuda else "cpu")
+
+ envs = make_env(args.env_name, args.seed, args.gamma)
+
+ model = MujocoModel(envs.observation_space.shape[0],
+ envs.action_space.shape[0])
+ model.to(device)
+
+ algorithm = PPO(
+ model,
+ args.clip_param,
+ args.value_loss_coef,
+ args.entropy_coef,
+ initial_lr=args.lr,
+ eps=args.eps,
+ max_grad_norm=args.max_grad_norm)
+
+ agent = MujocoAgent(algorithm, device)
+
+ rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
+ envs.action_space.shape[0])
+
+ obs = envs.reset()
+ rollouts.obs[0] = np.copy(obs)
+
+ episode_rewards = deque(maxlen=10)
+
+ num_updates = int(args.num_env_steps) // args.num_steps
+ for j in range(num_updates):
+
+ if args.use_linear_lr_decay:
+ # decrease learning rate linearly
+ utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
+ args.lr)
+
+ for step in range(args.num_steps):
+ # Sample actions
+ with torch.no_grad():
+ value, action, action_log_prob = agent.sample(
+                    rollouts.obs[step])  # use the obs stored in rollouts for this step
+
+            # Observe reward and next obs
+ obs, reward, done, infos = envs.step(action)
+
+ for info in infos:
+ if 'episode' in info.keys():
+ episode_rewards.append(info['episode']['r'])
+
+ # If done then clean the history of observations.
+ masks = torch.FloatTensor(
+ [[0.0] if done_ else [1.0] for done_ in done])
+ bad_masks = torch.FloatTensor(
+ [[0.0] if 'bad_transition' in info.keys() else [1.0]
+ for info in infos])
+ rollouts.append(obs, action, action_log_prob, value, reward, masks,
+ bad_masks)
+
+ with torch.no_grad():
+ next_value = agent.value(rollouts.obs[-1])
+
+ value_loss, action_loss, dist_entropy = agent.learn(
+ next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
+ args.num_mini_batch, rollouts)
+
+ rollouts.after_update()
+
+ if j % args.log_interval == 0 and len(episode_rewards) > 1:
+ total_num_steps = (j + 1) * args.num_steps
+            print(
+                "Updates {}, num timesteps {},\n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
+                .format(j, total_num_steps, len(episode_rewards),
+                        np.mean(episode_rewards), np.median(episode_rewards),
+                        np.min(episode_rewards), np.max(episode_rewards)))
+
+ if (args.eval_interval is not None and len(episode_rewards) > 1
+ and j % args.eval_interval == 0):
+ ob_rms = utils.get_vec_normalize(envs).ob_rms
+ eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
+ args.seed, device)
+
+
+if __name__ == "__main__":
+ main()
diff --git a/benchmark/torch/ppo/utils.py b/benchmark/torch/ppo/utils.py
new file mode 100644
index 0000000000000000000000000000000000000000..2e276a7f0779cfb55b3ef92012f22a61b7937c62
--- /dev/null
+++ b/benchmark/torch/ppo/utils.py
@@ -0,0 +1,43 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import glob
+import os
+
+import torch
+import torch.nn as nn
+
+from wrapper import VecNormalize
+
+
+def get_vec_normalize(venv):
+ if isinstance(venv, VecNormalize):
+ return venv
+ elif hasattr(venv, 'venv'):
+ return get_vec_normalize(venv.venv)
+
+ return None
+
+
+def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
+ """Decreases the learning rate linearly"""
+ lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
+ for param_group in optimizer.param_groups:
+ param_group['lr'] = lr
+
+
+def init(module, weight_init, bias_init, gain=1):
+ weight_init(module.weight.data, gain=gain)
+ bias_init(module.bias.data)
+ return module
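+
+
+# illustrative usage (assumed, not called in this file):
+#   init(nn.Linear(64, 1), nn.init.orthogonal_, lambda b: nn.init.constant_(b, 0))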
diff --git a/benchmark/torch/ppo/wrapper.py b/benchmark/torch/ppo/wrapper.py
new file mode 100644
index 0000000000000000000000000000000000000000..a890db1d0e5ee2cc2131794d9317a76a55e16e83
--- /dev/null
+++ b/benchmark/torch/ppo/wrapper.py
@@ -0,0 +1,180 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# Simplified version of https://github.com/ShangtongZhang/DeepRL/blob/master/deep_rl/component/envs.py
+
+import numpy as np
+import gym
+from gym.core import Wrapper
+import time
+
+
+class TimeLimitMask(gym.Wrapper):
+ def step(self, action):
+ obs, rew, done, info = self.env.step(action)
+ if done and self.env._max_episode_steps == self.env._elapsed_steps:
+ info['bad_transition'] = True
+ return obs, rew, done, info
+
+ def reset(self, **kwargs):
+ return self.env.reset(**kwargs)
+
+
+class MonitorEnv(gym.Wrapper):
+ def __init__(self, env):
+ Wrapper.__init__(self, env=env)
+ self.tstart = time.time()
+ self.rewards = None
+
+ def step(self, action):
+ ob, rew, done, info = self.env.step(action)
+ self.update(ob, rew, done, info)
+ return (ob, rew, done, info)
+
+ def update(self, ob, rew, done, info):
+ self.rewards.append(rew)
+ if done:
+ eprew = sum(self.rewards)
+ eplen = len(self.rewards)
+ epinfo = {
+ "r": round(eprew, 6),
+ "l": eplen,
+ "t": round(time.time() - self.tstart, 6)
+ }
+ assert isinstance(info, dict)
+ info['episode'] = epinfo
+ self.reset()
+
+ def reset(self, **kwargs):
+ self.rewards = []
+ return self.env.reset(**kwargs)
+
+
+class VectorEnv(gym.Wrapper):
+ def step(self, action):
+ ob, rew, done, info = self.env.step(action)
+ ob = np.array(ob)
+ ob = ob[np.newaxis, :]
+ rew = np.array([rew])
+
+ done = np.array([done])
+
+ info = [info]
+ return (ob, rew, done, info)
+
+
+class RunningMeanStd(object):
+ # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
+ def __init__(self, epsilon=1e-4, shape=()):
+ self.mean = np.zeros(shape, 'float64')
+ self.var = np.ones(shape, 'float64')
+ self.count = epsilon
+
+ def update(self, x):
+ batch_mean = np.mean(x, axis=0)
+ batch_var = np.var(x, axis=0)
+ batch_count = x.shape[0]
+ self.update_from_moments(batch_mean, batch_var, batch_count)
+
+ def update_from_moments(self, batch_mean, batch_var, batch_count):
+ self.mean, self.var, self.count = update_mean_var_count_from_moments(
+ self.mean, self.var, self.count, batch_mean, batch_var,
+ batch_count)
+
+
+def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var,
+ batch_count):
+ delta = batch_mean - mean
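+    # parallel-variance merge (Chan et al.; see the Wikipedia link above):
+    #   new_mean = mean + delta * batch_count / tot_count
+    #   M2       = var * count + batch_var * batch_count + delta^2 * count * batch_count / tot_count
+    #   new_var  = M2 / tot_count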
+ tot_count = count + batch_count
+
+ new_mean = mean + delta * batch_count / tot_count
+ m_a = var * count
+ m_b = batch_var * batch_count
+ M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
+ new_var = M2 / tot_count
+ new_count = tot_count
+
+ return new_mean, new_var, new_count
+
+
+class VecNormalize(gym.Wrapper):
+ def __init__(self,
+ env,
+ ob=True,
+ ret=True,
+ clipob=10.,
+ cliprew=10.,
+ gamma=0.99,
+ epsilon=1e-8):
+ Wrapper.__init__(self, env=env)
+ observation_space = env.observation_space.shape[0]
+
+ self.ob_rms = RunningMeanStd(shape=observation_space) if ob else None
+ self.ret_rms = RunningMeanStd(shape=()) if ret else None
+
+ self.clipob = clipob
+ self.cliprew = cliprew
+ self.gamma = gamma
+ self.epsilon = epsilon
+ self.ret = np.zeros(1)
+ self.training = True
+
+ def step(self, action):
+ ob, rew, new, info = self.env.step(action)
+ self.ret = self.ret * self.gamma + rew
+ # normalize observation
+ ob = self._obfilt(ob)
+ # normalize reward
+ if self.ret_rms:
+ self.ret_rms.update(self.ret)
+ rew = np.clip(rew / np.sqrt(self.ret_rms.var + self.epsilon),
+ -self.cliprew, self.cliprew)
+ self.ret[new] = 0.
+ return ob, rew, new, info
+
+ def reset(self):
+ self.ret = np.zeros(1)
+ ob = self.env.reset()
+ return self._obfilt(ob)
+
+ def _obfilt(self, ob, update=True):
+ if self.ob_rms:
+ if self.training and update:
+ self.ob_rms.update(ob)
+ ob = np.clip((ob - self.ob_rms.mean) /
+ np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob,
+ self.clipob)
+ return ob
+ else:
+ return ob
+
+ def train(self):
+ self.training = True
+
+ def eval(self):
+        self.training = False
+
+
+def make_env(env_name, seed, gamma):
+ env = gym.make(env_name)
+ env.seed(seed)
+ env = TimeLimitMask(env)
+ env = MonitorEnv(env)
+ env = VectorEnv(env)
+ if gamma is None:
+ env = VecNormalize(env, ret=False)
+ else:
+ env = VecNormalize(env, gamma=gamma)
+
+ return env
diff --git a/benchmark/torch/td3/train.py b/benchmark/torch/td3/train.py
index c844d8c079a4b10e1e0ade957202cd7d2dcd27fb..48bd1f77103f1e50bd28f55cc12bee09315496e7 100644
--- a/benchmark/torch/td3/train.py
+++ b/benchmark/torch/td3/train.py
@@ -15,7 +15,7 @@
import gym
import argparse
import numpy as np
-from parl.utils import logger, tensorboard, ReplayMemory
+from parl.utils import logger, summary, ReplayMemory
from mujoco_model import MujocoModel
from mujoco_agent import MujocoAgent
@@ -103,8 +103,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
- tensorboard.add_scalar('train/episode_reward', train_reward,
- total_steps)
+ summary.add_scalar('train/episode_reward', train_reward, total_steps)
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
@@ -112,8 +111,8 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward))
- tensorboard.add_scalar('eval/episode_reward', evaluate_reward,
- total_steps)
+ summary.add_scalar('eval/episode_reward', evaluate_reward,
+ total_steps)
if __name__ == '__main__':
diff --git a/docs/EvoKit/minimal_example.rst b/docs/EvoKit/minimal_example.rst
new file mode 100644
index 0000000000000000000000000000000000000000..0eb7c66902fe71ebe097586f8385f43952068860
--- /dev/null
+++ b/docs/EvoKit/minimal_example.rst
@@ -0,0 +1,190 @@
+Minimal example
+---------------------
+
+``Goal of this tutorial: demonstrate how to solve the classic CartPole problem with the EvoKit library.``
+
+*This tutorial assumes the reader has used PaddlePaddle before and knows the basic iteration loop of evolutionary algorithms.*
+
+Introduction to CartPole
+#########################
+CartPole is also known as the inverted pendulum: a pole stands on a cart and falls over under gravity, and we must move the cart left or right to keep the pole upright, as shown in the figure below.
+At every time step the model receives a 4-dimensional vector describing the current state of the cart and the pole, and its output controls whether the cart moves left or right. While the pole stays up the environment gives a reward of 1 per time step; once the pole falls, no more reward is given and the episode ends. A minimal interaction loop is sketched after the figure.
+
+.. image:: ../../examples/QuickStart/performance.gif
+ :width: 300px
+
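+The sketch below uses the plain ``gym`` CartPole-v0 API for illustration only (it is not EvoKit code); the random action stands in for the output of the prediction network built in step 1.
+
+.. code-block:: python
+
+    import gym
+
+    env = gym.make('CartPole-v0')
+    obs = env.reset()                 # 4-dim state of the cart and the pole
+    total_reward, done = 0.0, False
+    while not done:
+        action = env.action_space.sample()          # 0: push left, 1: push right
+        obs, reward, done, info = env.step(action)
+        total_reward += reward                      # +1 per step while the pole is up
+    print('episode reward:', total_reward)
+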
+Step 1: build the prediction network
+########################################
+Following the environment description above, we need a neural network that takes the 4-dimensional state vector as input and outputs a 2-dimensional probability distribution over moving the cart left or right.
+Here we implement the prediction network with Paddle and save it to disk.
+
+.. code-block:: python
+
+ from paddle import fluid
+
+ def net(obs, act_dim):
+ hid1 = fluid.layers.fc(obs, size=20)
+ prob = fluid.layers.fc(hid1, size=act_dim, act='softmax')
+ return prob
+
+ if __name__ == '__main__':
+ obs_dim = 4
+ act_dim = 2
+ obs = fluid.layers.data(name="obs", shape=[obs_dim], dtype='float32')
+ prob = net(obs, act_dim)
+
+ exe = fluid.Executor(fluid.CPUPlace())
+ exe.run(fluid.default_startup_program())
+ fluid.io.save_inference_model(
+ dirname='init_model',
+ feeded_var_names=['obs'],
+ target_vars=[prob],
+ params_filename='params',
+ model_filename='model',
+ executor=exe)
+
+Step 2: construct the ESAgent
+##############################
+
+- Call ``load_config`` to load the configuration file.
+- Call ``load_inference_model`` to load the model parameters.
+- Call ``init_solver`` to initialize the solver.
+
+The configuration file mainly specifies the type of evolutionary algorithm (e.g. Gaussian or CMA) and the optimizer to use (Adam or SGD).
+
+.. code-block:: c++
+
+ ESAgent agent = ESAgent();
+ agent.load_config(config);
+ agent.load_inference_model(model_dir);
+ agent.init_solver();
+
+    // Appendix: example EvoKit configuration
+ solver {
+ type: BASIC_ES
+        optimizer { // offline Adam update
+ type: ADAM
+ base_lr: 0.05
+ adam {
+ beta1: 0.9
+ beta2: 0.999
+ epsilon: 1e-08
+ }
+ }
+        sampling { // online Gaussian sampling
+ type: GAUSSIAN_SAMPLING
+ gaussian_sampling {
+ std: 0.5
+ cached: true
+ seed: 1024
+ cache_size : 100000
+ }
+ }
+ }
+
+
+Step 3: create an agent for sampling
+######################################
+
+Three interfaces matter here:
+
+- Call ``clone`` to create an agent used for sampling.
+- Call ``add_noise`` to add noise to this agent's parameter space; it also returns the information that uniquely identifies the noise, which must be written to the log for the offline update.
+- Call ``predict`` to run inference.
+
+.. code-block:: c++
+
+ auto sampling_agent = agent.clone();
+ auto sampling_info = sampling_agent.add_noise();
+ sampling_agent.predict(feature);
+
+Step 4: update the model parameters with the sampled data
+############################################################
+
+The user provides two pieces of data:
+
+- the sampling_info needed to reproduce the sampling noise offline
+- the evaluation result (reward) obtained with the perturbed parameters
+
+.. code-block:: c++
+
+ agent.update(sampling_infos, rewards);
+
+Main code with comments
+########################
+
+The code below samples with multiple threads in parallel to solve the problem more efficiently.
+
+.. code-block:: c++
+
+ int main(int argc, char* argv[]) {
+        std::vector<CartPole> envs;
+        // create 10 environments for multi-threaded training
+        for (int i = 0; i < ITER; ++i) {
+            envs.push_back(CartPole());
+        }
+
+        // initialize the ESAgent
+        std::string model_dir = "./demo/cartpole/init_model";
+        std::string config_path = "./demo/cartpole/config.prototxt";
+        std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
+        agent->load_config(config_path); // load the configuration
+
+        agent->load_inference_model(model_dir); // load the initial prediction model
+        agent->init_solver(); // initialize the solver; must be called after load_inference_model
+
+        // create 10 agents for parallel sampling
+        std::vector<std::shared_ptr<ESAgent>> sampling_agents;
+        for (int i = 0; i < ITER; ++i) {
+            sampling_agents.push_back(agent->clone());
+        }
+
+        std::vector<SamplingInfo> sampling_infos;
+        std::vector<float> rewards(ITER, 0.0f);
+        sampling_infos.resize(ITER);
+        omp_set_num_threads(10);
+
+        // run 100 training epochs in total
+        for (int epoch = 0; epoch < 100; ++epoch) {
+            #pragma omp parallel for schedule(dynamic, 1)
+            for (int i = 0; i < ITER; ++i) {
+                std::shared_ptr<ESAgent> sampling_agent = sampling_agents[i];
+                SamplingInfo sampling_info;
+                sampling_agent->add_noise(sampling_info);
+                float reward = evaluate(envs[i], sampling_agent);
+                // store the sampled sampling_info and its evaluation reward
+                sampling_infos[i] = sampling_info;
+                rewards[i] = reward;
+            }
+            // update the model parameters; the new parameters are automatically synced to the sampling agents
+            agent->update(sampling_infos, rewards);
+
+            int reward = evaluate(envs[0], agent);
+            LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; // print the reward of each epoch
+ }
+ }
+
+How to run the demo
+####################
+
+- Download the code
+
+  Clone the code from icode; the repository path is ``baidu/nlp/deep-es`` ``TO DO: update the repository path``
+
+- Build the demo
+
+  Build it on the bcloud cloud cluster; the command is ``bb``.
+
+- Run the demo
+
+  After the build finishes, add the shared-library search path:
+
+  ``export LD_LIBRARY_PATH=./output/so/:$LD_LIBRARY_PATH``
+
+  Run the demo: ``./output/bin/cartpole/train``
+
+Troubleshooting
+####################
+
+If you run into any problem, please join Baidu Hi group 1692822 (the official PARL support group); the developers will answer any usage question directly.
diff --git a/docs/EvoKit/online_example.rst b/docs/EvoKit/online_example.rst
new file mode 100644
index 0000000000000000000000000000000000000000..c4963f8cb909a240f318b1e85c77ba310c460160
--- /dev/null
+++ b/docs/EvoKit/online_example.rst
@@ -0,0 +1,124 @@
+Example for Online Products
+############################
+
+``Goal of this tutorial: demonstrate how to iterate the algorithm and update the model parameters after EvoKit has been deployed online.``
+
+In a production setting, user logs usually cannot be fetched online in real time. A common practice is to store user click/dwell-time logs, update the model offline from these data, and then push the new model back online to complete one iteration of the algorithm.
+Staying with the classic CartPole environment, this tutorial shows how to iterate the ES algorithm through online sampling and offline updates.
+
+The complete demo code lives in the demo/online_example folder.
+``TO DO: folder``
+
+Initialize the solver
+---------------------
+Construct the solver, initialize it, and save it to a file. This initialization only needs to be done once, at the very beginning.
+
+.. code-block:: c++
+
+    std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
+ agent->load_config(FLAGS_config_path);
+ agent->load_inference_model(FLAGS_model_dir);
+ agent->init_solver();
+ agent->save_solver(FLAGS_model_dir);
+
+
+Online sampling
+---------------------
+Load the model and the solver, record the sampling_info returned by online sampling together with the evaluation reward, and write them to a log file in binary format.
+
+.. code-block:: c++
+
+    std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
+ agent->load_config(FLAGS_config_path);
+ agent->load_inference_model(FLAGS_model_dir);
+ agent->load_solver(FLAGS_model_dir);
+
+ #pragma omp parallel for schedule(dynamic, 1)
+ for (int i = 0; i < ITER; ++i) {
+        std::shared_ptr<ESAgent> sampling_agent = sampling_agents[i];
+ SamplingInfo sampling_info;
+ sampling_agent->add_noise(sampling_info);
+ float reward = evaluate(envs[i], sampling_agent);
+ sampling_infos[i] = sampling_info;
+ rewards[i] = reward;
+ }
+
+    // save the sampling information and rewards to the log in binary format
+ std::ofstream log_stream(FLAGS_log_path, std::ios::binary);
+ for (int i = 0; i < ITER; ++i) {
+ std::string data;
+ sampling_infos[i].SerializeToString(&data);
+ int size = data.size();
+ log_stream.write((char*) &rewards[i], sizeof(float));
+ log_stream.write((char*) &size, sizeof(int));
+ log_stream.write(data.c_str(), size);
+ }
+ log_stream.close();
+
+
+Offline update
+-----------------------
+After loading the previously recorded log, call ``update`` to update the parameters, then save the updated parameters locally with ``save_inference_model`` and ``save_solver`` and push them online.
+
+.. code-block:: c++
+
+    std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
+ agent->load_config(FLAGS_config_path);
+ agent->load_inference_model(FLAGS_model_dir);
+ agent->load_solver(FLAGS_model_dir);
+
+ // load training data
+    std::vector<SamplingInfo> sampling_infos;
+    std::vector<float> rewards(ITER, 0.0f);
+ sampling_infos.resize(ITER);
+ std::ifstream log_stream(FLAGS_log_path);
+ CHECK(log_stream.good()) << "[EvoKit] cannot open log: " << FLAGS_log_path;
+ char buffer[1000];
+ for (int i = 0; i < ITER; ++i) {
+ int size;
+ log_stream.read((char*) &rewards[i], sizeof(float));
+ log_stream.read((char*) &size, sizeof(int));
+ log_stream.read(buffer, size);
+ buffer[size] = 0;
+ std::string data(buffer);
+ sampling_infos[i].ParseFromString(data);
+ }
+
+    // update the model and save the parameters
+ agent->update(sampling_infos, rewards);
+ agent->save_inference_model(FLAGS_updated_model_dir);
+ agent->save_solver(FLAGS_updated_model_dir);
+
+
+Main script
+-----------------------
+
+Compile the code above into separate executables:
+
+- solver initialization: ``init_solver``
+- online sampling: ``online_sampling``
+- offline update: ``offline_update``
+
+.. code-block:: shell
+
+ #------------------------init solver------------------------
+ ./init_solver \
+ --model_dir="./model_warehouse/model_dir_0" \
+ --config_path="config.prototxt"
+
+
+ for ((epoch=0;epoch<200;++epoch));do
+ #------------------------online sampling------------------------
+ ./online_sampling \
+ --log_path="./sampling_log" \
+ --model_dir="./model_warehouse/model_dir_$epoch" \
+ --config_path="./config.prototxt"
+
+ #------------------------offline update------------------------
+ next_epoch=$((epoch+1))
+ ./offline_update \
+ --log_path='./sampling_log' \
+ --model_dir="./model_warehouse/model_dir_$epoch" \
+ --updated_model_dir="./model_warehouse/model_dir_${next_epoch}" \
+ --config_path="./config.prototxt"
+ done
diff --git a/docs/EvoKit/overview.rst b/docs/EvoKit/overview.rst
new file mode 100644
index 0000000000000000000000000000000000000000..ce6fa07211456e12a0fbc29f6ecc37b501e45f24
--- /dev/null
+++ b/docs/EvoKit/overview.rst
@@ -0,0 +1,21 @@
+Overview
+------------------
+
+``EvoKit`` is a library of evolution algorithms that bundles multiple evolutionary strategies and is compatible with a variety of inference frameworks, with a focus on **fast online deployment and validation**.
+
+.. image:: ../../evo_kit/DeepES.gif
+ :align: center
+ :width: 400px
+
+Features
+#########
+
+**1. Multiple evolutionary algorithms.** Supports Gaussian sampling, CMA, GA and more; additional algorithms are being added continuously.
+
+**2. Mainstream optimizers.** Supports SGD/Momentum/Adam and other widely used optimizers to improve convergence efficiency.
+
+**3. One-stop deployment.** Integrates the online-sampling and offline-update workflow and provides Bcloud/CMake builds to help you go online quickly.
+
+**4. Compatible with the full range of deep-learning frameworks.** EvoKit supports hand-written networks as well as frameworks such as Paddle/Lego/Torch.
+
+**5. Synchronous/asynchronous updates.** Supports asynchronous updates from multiple sampling models and multiple batches of sampling data, matching real business scenarios well.
diff --git a/docs/conf.py b/docs/conf.py
index e4e009f0d8d2edc5ae158b0ab5d680c9c45fcdc2..29f697d1db5fc60304f1da625ed92cf14f2f819b 100644
--- a/docs/conf.py
+++ b/docs/conf.py
@@ -101,3 +101,37 @@ def setup(app):
add_module_names = False
+
+latex_engine = 'xelatex'
+latex_use_xindy = False
+
+latex_elements = {
+ # The paper size ('letterpaper' or 'a4paper').
+ #'papersize': 'letterpaper',
+
+ # The font size ('10pt', '11pt' or '12pt').
+ #'pointsize': '10pt',
+
+ # Additional stuff for the LaTeX preamble.
+ #'preamble': '',
+ 'preamble':
+ r'''
+ \hypersetup{unicode=true}
+ \usepackage{CJKutf8}
+ \DeclareUnicodeCharacter{00A0}{\nobreakspace}
+ \DeclareUnicodeCharacter{2203}{\ensuremath{\exists}}
+ \DeclareUnicodeCharacter{2200}{\ensuremath{\forall}}
+ \DeclareUnicodeCharacter{2286}{\ensuremath{\subseteq}}
+ \DeclareUnicodeCharacter{2713}{x}
+ \DeclareUnicodeCharacter{27FA}{\ensuremath{\Longleftrightarrow}}
+ \DeclareUnicodeCharacter{221A}{\ensuremath{\sqrt{}}}
+ \DeclareUnicodeCharacter{221B}{\ensuremath{\sqrt[3]{}}}
+ \DeclareUnicodeCharacter{2295}{\ensuremath{\oplus}}
+ \DeclareUnicodeCharacter{2297}{\ensuremath{\otimes}}
+ \begin{CJK}{UTF8}{gbsn}
+ \AtEndDocument{\end{CJK}}
+ ''',
+}
diff --git a/docs/index.rst b/docs/index.rst
index e7d6c144112fca11f836b6890c68b2e4c2010832..5009dde813c18dfb97c9066a7dfb9abecf22657a 100644
--- a/docs/index.rst
+++ b/docs/index.rst
@@ -46,7 +46,7 @@ Abstractions
:maxdepth: 1
:caption: Installation
- installation.rst
+ installation.rst
.. toctree::
:maxdepth: 1
@@ -58,9 +58,10 @@ Abstractions
:maxdepth: 1
:caption: Tutorial
- getting_started.rst
- new_alg.rst
- save_param.rst
+ tutorial/getting_started.rst
+ tutorial/new_alg.rst
+ tutorial/save_param.rst
+ tutorial/tensorboard.rst
.. toctree::
:maxdepth: 2
@@ -83,3 +84,11 @@ Abstractions
model.rst
algorithm.rst
agent.rst
+
+.. toctree::
+ :maxdepth: 2
+ :caption: EvoKit
+
+ EvoKit/overview.rst
+ EvoKit/minimal_example.rst
+ EvoKit/online_example.rst
diff --git a/docs/tutorial/add_histogram.jpg b/docs/tutorial/add_histogram.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..1c33b0e3ad81f3ca0878c2623f6c4a6a80de19b0
Binary files /dev/null and b/docs/tutorial/add_histogram.jpg differ
diff --git a/docs/tutorial/add_scalar.jpg b/docs/tutorial/add_scalar.jpg
new file mode 100644
index 0000000000000000000000000000000000000000..27cb4a270150c00baf37332d79ed821c3bc901ba
Binary files /dev/null and b/docs/tutorial/add_scalar.jpg differ
diff --git a/docs/getting_started.rst b/docs/tutorial/getting_started.rst
similarity index 98%
rename from docs/getting_started.rst
rename to docs/tutorial/getting_started.rst
index a70a438ba7952a54d199d0fee345c0ee4e87b398..f406c47407d8b98d7bb26f99e8a54b64e11423c8 100644
--- a/docs/getting_started.rst
+++ b/docs/tutorial/getting_started.rst
@@ -178,9 +178,9 @@ Then we use this agent to interact with the environment, and run around 1000 epi
Summary
-----------
-.. image:: ../examples/QuickStart/performance.gif
+.. image:: ../../examples/QuickStart/performance.gif
:width: 300px
-.. image:: ./images/quickstart.png
+.. image:: ../images/quickstart.png
:width: 300px
In this tutorial, we have shown how to build an agent step-by-step to solve the `Cartpole` problem.
diff --git a/docs/new_alg.rst b/docs/tutorial/new_alg.rst
similarity index 98%
rename from docs/new_alg.rst
rename to docs/tutorial/new_alg.rst
index 973c062b88cf5ad7f59e94161d4d019c72fbf717..1acf09796c3ed10ba6135ec367902e6f1d985d47 100644
--- a/docs/new_alg.rst
+++ b/docs/tutorial/new_alg.rst
@@ -59,7 +59,6 @@ Within class ``DQN(Algorithm)``, we define the following methods:
Args:
model (parl.Model): model defining forward network of Q function
- hyperparas (dict): (deprecated) dict of hyper parameters.
act_dim (int): dimension of the action space
gamma (float): discounted factor for reward computation.
lr (float): learning rate.
diff --git a/docs/save_param.rst b/docs/tutorial/save_param.rst
similarity index 95%
rename from docs/save_param.rst
rename to docs/tutorial/save_param.rst
index 3824eb9d3fe23c47f375877a75c6c88aab06c0b4..82e411ab2010ef3f9b4dcca0fd0c23f319eac7b7 100644
--- a/docs/save_param.rst
+++ b/docs/tutorial/save_param.rst
@@ -22,5 +22,5 @@ Here is a demonstration of usage:
agent.restore('./model.ckpt')
# restore the parameters from ./model.ckpt to another_agent
- another_agent = AtariAgent()
+ another_agent = AtariAgent()
another_agent.restore('./model.ckpt')
diff --git a/docs/tutorial/tensorboard.rst b/docs/tutorial/tensorboard.rst
new file mode 100644
index 0000000000000000000000000000000000000000..8952a5e00b624e1c02b74c451da0d168ee6a4817
--- /dev/null
+++ b/docs/tutorial/tensorboard.rst
@@ -0,0 +1,55 @@
+summary
+===============
+
+Visualize the results with tensorboard.
+
+add_scalar
+-------------
+
+Commonly used arguments:
+
+* summary.add_scalar(tag, scalar_value, global_step=None)
+ * tag *(string)* – Data identifier
+ * scalar_value *(float or string/blobname)* – Value to save
+ * global_step *(int)* – Global step value to record
+
+Example:
+
+.. code-block:: python
+
+ from parl.utils import summary
+
+ x = range(100)
+ for i in x:
+ summary.add_scalar('y=2x', i * 2, i)
+
+Expected result:
+
+ .. image:: add_scalar.jpg
+ :scale: 50 %
+
+add_histogram
+----------------
+
+Commonly used arguments:
+
+* summary.add_histogram(tag, values, global_step=None)
+ * tag *(string)* – Data identifier
+ * values *(torch.Tensor, numpy.array, or string/blobname)* – Values to build histogram
+ * global_step *(int)* – Global step value to record
+
+Example:
+
+.. code-block:: python
+
+ from parl.utils import summary
+ import numpy as np
+
+ for i in range(10):
+ x = np.random.random(1000)
+ summary.add_histogram('distribution centers', x + i, i)
+
+Expected result:
+
+ .. image:: add_histogram.jpg
+ :scale: 50 %
diff --git a/evo_kit/CMakeLists.txt b/evo_kit/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..a9672c11aa0ea54ee6de4f6c6d60e92c18d47e60
--- /dev/null
+++ b/evo_kit/CMakeLists.txt
@@ -0,0 +1,89 @@
+cmake_minimum_required (VERSION 2.6)
+project (EvoKit)
+
+########## options ##########
+option(WITH_PADDLE "Compile EvoKit with PaddleLite framework." OFF)
+option(WITH_TORCH "Compile EvoKit with Torch framework." OFF)
+
+message("WITH_PADDLE: "${WITH_PADDLE})
+message("WITH_TORCH: "${WITH_TORCH})
+
+if (NOT (WITH_PADDLE OR WITH_TORCH))
+ message("ERROR: You should choose at least one framework to compile EvoKit.")
+ return()
+elseif(WITH_PADDLE AND WITH_TORCH)
+ message("ERROR: You cannot choose more than one framework to compile EvoKit.")
+ return()
+endif()
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+find_package(OpenMP)
+if (OPENMP_FOUND)
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+endif()
+
+
+file(GLOB src "core/src/*.cc" "core/proto/evo_kit/*.cc")
+include_directories("core/include")
+include_directories("core/proto")
+include_directories("benchmark")
+
+########## PaddleLite config ##########
+if (WITH_PADDLE)
+ add_definitions(-g -O3 -pthread)
+
+ include_directories("paddle/include")
+ include_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/include"
+ "${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/include")
+ link_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/lib"
+ "${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/lib")
+
+ file(GLOB framework_src "paddle/src/*.cc")
+ set(TARGET EvoKit_paddle)
+########## Torch config ##########
+elseif (WITH_TORCH)
+ # list(APPEND CMAKE_PREFIX_PATH "./libtorch")
+ # find_package(Torch REQUIRED ON) # TODO: not necessary for now
+
+ include_directories("torch/include")
+
+ file(GLOB framework_src "torch/src/*.cc")
+ set(TARGET EvoKit_torch)
+else ()
+ message("ERROR: You should choose at least one framework to compile EvoKit.")
+endif()
+
+
+add_library(${TARGET} STATIC ${src} ${framework_src})
+target_link_libraries(${TARGET} gflags protobuf pthread glog)
+
+
+# ########## PaddleLite libraries ##########
+# if (WITH_PADDLE)
+# target_link_libraries(${TARGET} -lpaddle_full_api_shared)
+# target_link_libraries(${TARGET} -lmklml_intel)
+# target_link_libraries(${TARGET} -ldl)
+# ########## Torch libraries ##########
+# elseif (WITH_TORCH)
+# target_link_libraries(${TARGET} "${TORCH_LIBRARIES}")
+# endif()
+
+file(GLOB include "core/include/evo_kit/*.h")
+file(GLOB proto_include "core/proto/evo_kit/*.h")
+file(GLOB torch_include "torch/include/evo_kit/*.h")
+file(GLOB paddle_include "paddle/include/evo_kit/*.h")
+file(GLOB benchmark_include "benchmark/*.h")
+file(GLOB findcmake "cmake/Torch/*.cmake")
+
+set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}/libevokit")
+install(TARGETS ${TARGET} ARCHIVE DESTINATION "lib")
+install(FILES ${include} ${proto_include} DESTINATION "include/evo_kit")
+install(FILES ${torch_include} DESTINATION "torch/evo_kit")
+install(FILES ${paddle_include} DESTINATION "paddle/evo_kit")
+install(FILES ${benchmark_include} DESTINATION "include")
+install(FILES ${findcmake} DESTINATION "cmake/Torch")
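+
+# Typical out-of-source build (illustrative; pick exactly one framework switch):
+#   mkdir build && cd build
+#   cmake .. -DWITH_TORCH=ON     # or -DWITH_PADDLE=ON
+#   make && make install         # installs into ${PROJECT_SOURCE_DIR}/libevokit (see CMAKE_INSTALL_PREFIX above)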
diff --git a/evo_kit/DeepES.gif b/evo_kit/DeepES.gif
new file mode 100644
index 0000000000000000000000000000000000000000..7240118f3fce55b587690450e0c9cafc2f0694db
Binary files /dev/null and b/evo_kit/DeepES.gif differ
diff --git a/evo_kit/README.md b/evo_kit/README.md
new file mode 100644
index 0000000000000000000000000000000000000000..ecac85379c048f22e444b9286d8e5225a7e7daa8
--- /dev/null
+++ b/evo_kit/README.md
@@ -0,0 +1,41 @@
+# EvoKit
+EvoKit is a library of evolution algorithms that bundles multiple evolutionary strategies and is compatible with a variety of inference frameworks, focusing on fast online deployment and validation.
+
+
+
+
+## Usage example
+```c++
+// Instantiate an agent. The config file specifies the model, the sampling method
+// (Gaussian/CMA sampling, ...) and the update rule (SGD/Adam, ...).
+auto agent = ESAgent(config);
+
+std::vector<SamplingInfo> noisy_info;
+std::vector<float> noisy_rewards;
+for (int i = 0; i < 10; ++i) {
+    auto sampling_agent = agent->clone();        // clone a sampling agent
+    SamplingInfo info;
+    sampling_agent->add_noise(info);             // perturb the parameters and record the random seed in info
+    int reward = evaluate(env, sampling_agent);  // evaluate the perturbed parameters
+    noisy_info.push_back(info);                  // record the seed of this noise
+    noisy_rewards.push_back(reward);             // record the evaluation result
+}
+// Update the parameters from the evaluation results and recorded seeds,
+// then repeat the procedure above until convergence.
+agent->update(noisy_info, noisy_rewards);
+```
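+
+The seeds recorded in `SamplingInfo` are sufficient to reconstruct every perturbation inside `update`, so only (seed, reward) pairs need to be passed from the sampling side to the update side.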
+
+## One-command demo list
+- **PaddleLite**: sh ./scripts/build.sh paddle
+- **Torch**: sh ./scripts/build.sh torch
+- **Hand-written network**:
+
+## Dependencies:
+- Protobuf2
+- OpenMP
+- [glog](https://github.com/google/glog)
+- [gflags](https://github.com/gflags/gflags/blob/master/INSTALL.md)
+
+## Additional dependencies:
+
+### Using PaddleLite
+Download the PaddleLite X86 pre-built library, or build PaddleLite from source, to obtain the inference_lite_lib folder, and place it in the current directory. (See: [Deploying PaddleLite on X86](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html))
+
+### Using Torch
+Download [libtorch](https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip) or build Torch from source to obtain the libtorch folder, and place it in the current directory.
diff --git a/evo_kit/benchmark/cartpole.h b/evo_kit/benchmark/cartpole.h
new file mode 100644
index 0000000000000000000000000000000000000000..f289715aeac29cb76d5148a5ae8b4adc5233243c
--- /dev/null
+++ b/evo_kit/benchmark/cartpole.h
@@ -0,0 +1,98 @@
+// Third party code
+// This code is copied or modified from openai/gym's cartpole.py
+#include <cassert>
+#include <cmath>
+#include <random>
+#include <vector>
+
+const double kPi = 3.1415926535898;
+
+class CartPole {
+public:
+ double gravity = 9.8;
+ double masscart = 1.0;
+ double masspole = 0.1;
+ double total_mass = (masspole + masscart);
+ double length = 0.5; // actually half the pole's length;
+ double polemass_length = (masspole * length);
+ double force_mag = 10.0;
+ double tau = 0.02; // seconds between state updates;
+
+ // Angle at which to fail the episode
+ double theta_threshold_radians = 12 * 2 * kPi / 360;
+ double x_threshold = 2.4;
+ int steps_beyond_done = -1;
+
+ std::vector<float> state = {0, 0, 0, 0};
+ double reward;
+ bool done;
+ int step_ = 0;
+
+ const float* getState() {
+ return state.data();
+ }
+
+ double getReward() {
+ return reward;
+ }
+
+ double isDone() {
+ return done;
+ }
+
+ void reset() {
+ std::random_device rd;
+ std::default_random_engine generator(rd());
+ std::uniform_real_distribution<float> distribution(-0.05, 0.05);
+ for (int i = 0; i < 4; ++i) {
+ state[i] = distribution(generator);
+ }
+
+ steps_beyond_done = -1;
+ step_ = 0;
+ }
+
+ CartPole() {
+ reset();
+ }
+
+ void step(int action) {
+ float x = state[0];
+ float x_dot = state[1];
+ float theta = state[2];
+ float theta_dot = state[3];
+
+ auto force = (action == 1) ? force_mag : -force_mag;
+ auto costheta = std::cos(theta);
+ auto sintheta = std::sin(theta);
+ auto temp = (force + polemass_length * theta_dot * theta_dot * sintheta) /
+ total_mass;
+ auto thetaacc = (gravity * sintheta - costheta * temp) /
+ (length * (4.0 / 3.0 - masspole * costheta * costheta / total_mass));
+ auto xacc = temp - polemass_length * thetaacc * costheta / total_mass;
+
+ x = x + tau * x_dot;
+ x_dot = x_dot + tau * xacc;
+ theta = theta + tau * theta_dot;
+ theta_dot = theta_dot + tau * thetaacc;
+
+ state = {x, x_dot, theta, theta_dot};
+
+ done = x < -x_threshold || x > x_threshold ||
+ theta < -theta_threshold_radians || theta > theta_threshold_radians ||
+ step_ > 200;
+
+ if (!done) {
+ reward = 1.0;
+ } else if (steps_beyond_done == -1) {
+ // Pole just fell!
+ steps_beyond_done = 0;
+ reward = 0;
+ } else {
+ if (steps_beyond_done == 0) {
+ assert(false); // Can't do this
+ }
+ }
+ step_++;
+ }
+};
diff --git a/evo_kit/cmake/Torch/EvoKitConfig.cmake b/evo_kit/cmake/Torch/EvoKitConfig.cmake
new file mode 100644
index 0000000000000000000000000000000000000000..9f1c954430aec05a38d03c26a6b406343c01ad20
--- /dev/null
+++ b/evo_kit/cmake/Torch/EvoKitConfig.cmake
@@ -0,0 +1,45 @@
+# FindEvoKit
+# -------
+#
+# Finds the EvoKit library
+#
+# This will define the following variables:
+#
+# EVOKIT_FOUND -- True if the system has the EvoKit library
+# EVOKIT_INCLUDE_DIRS -- The include directories for EvoKit
+# EVOKIT_LIBRARY -- Libraries to link against
+#
+# and the following imported targets:
+#
+# EvoKit
+
+include(FindPackageHandleStandardArgs)
+
+if (DEFINED ENV{EVOKIT_INSTALL_PREFIX})
+ set(EVOKIT_INSTALL_PREFIX $ENV{EVOKIT_INSTALL_PREFIX})
+else()
+ # Assume we are in /cmake/Torch/EvoKitConfig.cmake
+ get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
+ get_filename_component(EVOKIT_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE)
+endif()
+
+# Include directories.
+if (EXISTS "${EVOKIT_INSTALL_PREFIX}/include")
+ set(EVOKIT_INCLUDE_DIRS
+ ${EVOKIT_INSTALL_PREFIX}/include
+ ${EVOKIT_INSTALL_PREFIX}/torch)
+else()
+ set(EVOKIT_INCLUDE_DIRS
+ ${EVOKIT_INSTALL_PREFIX}/include
+ ${EVOKIT_INSTALL_PREFIX}/torch)
+endif()
+
+find_library(EVOKIT_LIBRARY libEvoKit_torch.a PATHS "${EVOKIT_INSTALL_PREFIX}/lib")
+
+include_directories("${EVOKIT_INSTALL_PREFIX}/torch")
+include_directories("${EVOKIT_INSTALL_PREFIX}/include")
+
+find_package_handle_standard_args(EvoKit DEFAULT_MSG EVOKIT_LIBRARY EVOKIT_INCLUDE_DIRS)
+message(STATUS "EVOKIT_FOUND: ${EVOKIT_FOUND}")
+message(STATUS "EVOKIT_INCLUDE_DIRS: ${EVOKIT_INCLUDE_DIRS}")
+message(STATUS "EVOKIT_LIBRARY: ${EVOKIT_LIBRARY}")
diff --git a/evo_kit/core/include/evo_kit/adam_optimizer.h b/evo_kit/core/include/evo_kit/adam_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..b268b69f61d35e5d6df8eeb56b1869e7bcb828ff
--- /dev/null
+++ b/evo_kit/core/include/evo_kit/adam_optimizer.h
@@ -0,0 +1,53 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EVO_KIT_ADAM_OPTIMIZER_H
+#define EVO_KIT_ADAM_OPTIMIZER_H
+
+#include <cmath>
+#include <cstring>
+#include <unordered_map>
+#include "evo_kit/optimizer.h"
+
+namespace evo_kit {
+
+/*@brief AdamOptimizer.
+ * Implements Adam algorithm.
+ *
+ *@Args:
+ * base_lr: learning rate (default: 1e-3).
+ * beta1: coefficients used for computing running averages of gradient (default: 0.9).
+ * beta2: coefficients used for computing running averages of gradient's square (default: 0.999).
+ * epsilon: term added to the denominator to improve numerical stability (default: 1e-8).
+ */
+class AdamOptimizer: public Optimizer {
+public:
+ AdamOptimizer(float base_lr, float beta1 = 0.9, float beta2 = 0.999,
+ float epsilon = 1e-8): Optimizer(base_lr), \
+ _beta1(beta1), _beta2(beta2), _epsilon(epsilon) {}
+ ~AdamOptimizer();
+
+protected:
+ void compute_step(float* gradient, int size, std::string param_name);
+
+private:
+ float _beta1;
+ float _beta2;
+ float _epsilon;
+ std::unordered_map<std::string, float*> _momentum;
+ std::unordered_map<std::string, float*> _velocity;
+};
+
+}//namespace
+
+#endif
diff --git a/evo_kit/core/include/evo_kit/cached_gaussian_sampling.h b/evo_kit/core/include/evo_kit/cached_gaussian_sampling.h
new file mode 100644
index 0000000000000000000000000000000000000000..c033fb7f23e9d3d91754237cad61e181a823db2d
--- /dev/null
+++ b/evo_kit/core/include/evo_kit/cached_gaussian_sampling.h
@@ -0,0 +1,78 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H
+#define EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H
+
+#include <glog/logging.h>
+#include <cstring>
+#include <random>
+#include <string>
+#include <vector>
+#include "sampling_method.h"
+#include "utils.h"
+
+namespace evo_kit {
+
+class CachedGaussianSampling: public SamplingMethod {
+
+public:
+ CachedGaussianSampling();
+
+ ~CachedGaussianSampling();
+
+ /*Initialize the sampling algorithm given the config with the protobuf format.
+ *EvoKit library uses only one configuration file for all sampling algorithms.
+ A default configuration file can be found at: . // TODO: where?
+ Usually you won't have to modify the configuration items of other algorithms
+ if you are not using them.
+ */
+ bool load_config(const EvoKitConfig& config);
+
+ /*@brief generate Gaussian noise and the related key.
+ *
+ *@Args:
+ * key: a unique key associated with the sampled noise.
+ * noise: a pointer to the memory that stores the noise
+ * size: the number of floats to be sampled.
+ *
+ *@return:
+ * success: generate Gaussian successfully or not.
+ */
+ bool sampling(int* key, float* noise, int64_t size);
+
+ /*@brief reconstruct the Gaussian noise given the key.
+ * This function is often used for updating the neural network parameters in the offline environment.
+ *
+ *@Args:
+ * key: a unique key associated with the sampled noise.
+ * noise: a pointer to the memory that stores the noise
+ * size: the number of floats to be sampled.
+ *
+ *@return:
+ * success: reconstruct Gaussian successfully or not.
+ */
+ bool resampling(int key, float* noise, int64_t size);
+
+private:
+ float _std;
+ int _cache_size;
+ float* _noise_cache = nullptr;
+
+ bool _create_noise_cache();
+};
+
+}
+
+#endif
diff --git a/evo_kit/core/include/evo_kit/gaussian_sampling.h b/evo_kit/core/include/evo_kit/gaussian_sampling.h
new file mode 100644
index 0000000000000000000000000000000000000000..c0fc66f058f2d1b9224d19d5c029cdca1853f638
--- /dev/null
+++ b/evo_kit/core/include/evo_kit/gaussian_sampling.h
@@ -0,0 +1,73 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+//
+#ifndef EVO_KIT_GAUSSIAN_SAMPLING_H
+#define EVO_KIT_GAUSSIAN_SAMPLING_H
+
+#include <glog/logging.h>
+#include <random>
+#include <string>
+#include <vector>
+#include "evo_kit/sampling_method.h"
+#include "evo_kit/utils.h"
+
+namespace evo_kit {
+
+class GaussianSampling: public SamplingMethod {
+
+public:
+ GaussianSampling() {}
+
+ ~GaussianSampling() {}
+
+ /*Initialize the sampling algorithm given the config with the protobuf format.
+ *EvoKit library uses only one configuration file for all sampling algorithms.
+ A default configuration file can be found at: . // TODO: where?
+ Usually you won't have to modify the configuration items of other algorithms
+ if you are not using them.
+ */
+ bool load_config(const EvoKitConfig& config);
+
+ /*@brief generate Gaussian noise and the related key.
+ *
+ *@Args:
+ * key: a unique key associated with the sampled noise.
+ * noise: a pointer to the memory that stores the noise
+ * size: the number of floats to be sampled.
+ *
+ *@return:
+ * success: generate Gaussian successfully or not.
+ */
+ bool sampling(int* key, float* noise, int64_t size);
+
+ /*@brief reconstruct the Gaussian noise given the key.
+ * This function is often used for updating the neural network parameters in the offline environment.
+ *
+ *@Args:
+ * key: a unique key associated with the sampled noise.
+ * noise: a pointer to the memory that stores the noise
+ * size: the number of floats to be sampled.
+ *
+ *@return:
+ * success: reconstruct Gaussian successfully or not.
+ */
+ bool resampling(int key, float* noise, int64_t size);
+
+private:
+ float _std;
+};
+
+}
+
+#endif
diff --git a/evo_kit/core/include/evo_kit/optimizer.h b/evo_kit/core/include/evo_kit/optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..5c41bc5d405b00bef71affa0fa6cb82a13afd1b2
--- /dev/null
+++ b/evo_kit/core/include/evo_kit/optimizer.h
@@ -0,0 +1,79 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EVO_KIT_OPTIMIZER_H
+#define EVO_KIT_OPTIMIZER_H
+
+#include <glog/logging.h>
+#include <string>
+#include <unordered_map>
+
+namespace evo_kit {
+
+/*@brief Optimizer. Base class for optimizers.
+ *
+ *@Args:
+ * base_lr: learning rate (default: 1e-3).
+ *
+ * .. warning: update() works at the parameter level;
+ * you need to call update() once for each parameter.
+ *
+ * Subclasses are required to implement the following functions:
+ * 1. compute_step
+ */
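+// Illustrative usage (names are placeholders): given an optimizer `opt`,
+// a parameter buffer `weights` and its gradient `grad`, both of length `n`:
+//   opt.update(weights, grad, n, "fc1_w");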
+class Optimizer {
+public:
+ Optimizer() : _base_lr(1e-3), _update_times(0) {}
+ Optimizer(float base_lr) : _base_lr(base_lr), _update_times(0) {}
+ virtual ~Optimizer() {
+ _params_size.clear();
+ }
+
+ template <typename T>
+ bool update(T weights, float* gradient, int size, std::string param_name = "") {
+ /*@ Performs a single optimization step (parameter update) at the parameter level.
+ *
+ *@Args:
+ * weights (array): parameter weights.
+ * gradient (array): gradient for updating weights.
+ * size: size of gradient.
+ * param_name: the name corresponding to the weights.
+ */
+ if (_params_size.count(param_name) == 0) {
+ _params_size[param_name] = size;
+ } else if (_params_size[param_name] != size) {
+ LOG(WARNING) << "[Warning] Update times: " << int(_update_times / _params_size.size()) \
+ << ". Size of weights[" << param_name << "] is " << _params_size[param_name] << ", not " << size;
+ return false;
+ }
+
+ ++_update_times;
+ compute_step(gradient, size, param_name);
+
+ for (int i = 0; i < size; ++i) {
+ weights[i] -= _base_lr * gradient[i];
+ }
+
+ return true;
+ } // template function
+
+protected:
+ virtual void compute_step(float* gradient, int size, std::string param_name = "") = 0;
+ float _base_lr;
+ float _update_times;
+ std::unordered_map<std::string, int> _params_size;
+};
+
+
+}//namespace
+#endif
diff --git a/evo_kit/core/include/evo_kit/optimizer_factory.h b/evo_kit/core/include/evo_kit/optimizer_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..6e3e099110d17efefd8dce9d5090b06fc27c0d21
--- /dev/null
+++ b/evo_kit/core/include/evo_kit/optimizer_factory.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EVO_KIT_OPTIMIZER_FACTORY_H
+#define EVO_KIT_OPTIMIZER_FACTORY_H
+
+#include <algorithm>
+#include <memory>
+#include <string>
+#include "evo_kit/adam_optimizer.h"
+#include "evo_kit/evo_kit.pb.h"
+#include "evo_kit/optimizer.h"
+#include "evo_kit/sgd_optimizer.h"
+
+namespace evo_kit {
+/* @brief: create an optimizer according to the configuration.
+ * @args:
+ * config: configuration for the optimizer
+ *
+ */
+std::shared_ptr<Optimizer> create_optimizer(const OptimizerConfig& optimizer_config);
+
+} // namespace
+
+#endif
diff --git a/evo_kit/core/include/evo_kit/sampling_factory.h b/evo_kit/core/include/evo_kit/sampling_factory.h
new file mode 100644
index 0000000000000000000000000000000000000000..e7e859cddcb88784b2d01b9642bcbc1b23e378cb
--- /dev/null
+++ b/evo_kit/core/include/evo_kit/sampling_factory.h
@@ -0,0 +1,36 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EVO_KIT_SAMPLING_FACTORY_H
+#define EVO_KIT_SAMPLING_FACTORY_H
+
+#include <glog/logging.h>
+#include <memory>
+#include <string>
+#include "evo_kit/cached_gaussian_sampling.h"
+#include "evo_kit/evo_kit.pb.h"
+#include "evo_kit/gaussian_sampling.h"
+#include "evo_kit/sampling_method.h"
+
+namespace evo_kit {
+/* @brief: create a sampling_method according to the configuration.
+ * @args:
+ * config: configuration for the EvoKit
+ *
+ */
+std::shared_ptr<SamplingMethod> create_sampling_method(const EvoKitConfig& config);
+
+} // namespace
+
+#endif
diff --git a/evo_kit/core/include/evo_kit/sampling_method.h b/evo_kit/core/include/evo_kit/sampling_method.h
new file mode 100644
index 0000000000000000000000000000000000000000..dc07dfe7cfefff694eef6cf7ca17ee35848eea98
--- /dev/null
+++ b/evo_kit/core/include/evo_kit/sampling_method.h
@@ -0,0 +1,90 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EVO_KIT_SAMPLING_METHOD_H
+#define EVO_KIT_SAMPLING_METHOD_H
+
+#include <cstdlib>
+#include <string>
+#include "evo_kit/evo_kit.pb.h"
+
+namespace evo_kit {
+
+/*Base class for sampling algorithms. All algorithms are required to override the following functions:
+ *
+ * 1. load_config
+ * 2. sampling
+ * 3. resampling
+ *
+ * View a demonstrative algorithm in gaussian_sampling.h
+ * */
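+// Illustrative round trip (names are placeholders): given a configured sampling
+// method `s` and a float buffer `noise` of length `n`:
+//   int key = 0;
+//   s.sampling(&key, noise, n);   // draw noise online and remember its key
+//   s.resampling(key, noise, n);  // reconstruct the same noise offline from the key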
+
+class SamplingMethod {
+
+public:
+
+ SamplingMethod(): _seed(0) {}
+
+ virtual ~SamplingMethod() {}
+
+ /*Initialize the sampling algorithm given the config with the protobuf format.
+ *EvoKit library uses only one configuration file for all sampling algorithms.
+ A default configuration file can be found at: . // TODO: where?
+ Usually you won't have to modify the configuration items of other algorithms
+ if you are not using them.
+ */
+ virtual bool load_config(const EvoKitConfig& config) = 0;
+
+ /*@brief generate Gaussian noise and the related key.
+ *
+ *@Args:
+ * key: a unique key associated with the sampled noise.
+ * noise: a pointer to the memory that stores the noise
+ * size: the number of floats to be sampled.
+ *
+ *@return:
+ * success: generate Gaussian successfully or not.
+ */
+ virtual bool sampling(int* key, float* noise, int64_t size) = 0;
+
+ /*@brief reconstruct the Gaussian noise given the key.
+ * This function is often used for updating the neural network parameters in the offline environment.
+ *
+ *@Args:
+ * key: a unique key associated with the sampled noise.
+ * noise: a pointer to the memory that stores the noise
+ * size: the number of floats to be sampled.
+ *
+ *@return:
+ * success: reconstruct Gaussian successfully or not.
+ */
+ virtual bool resampling(int key, float* noise, int64_t size) = 0;
+
+ bool set_seed(int seed) {
+ _seed = seed;
+ srand(_seed);
+ return true;
+ }
+
+ int get_seed() {
+ return _seed;
+ }
+
+protected:
+ int _seed;
+
+};
+
+}
+#endif
diff --git a/evo_kit/core/include/evo_kit/sgd_optimizer.h b/evo_kit/core/include/evo_kit/sgd_optimizer.h
new file mode 100644
index 0000000000000000000000000000000000000000..cd0d68803775df66d1bc90c748fe9801e17176c9
--- /dev/null
+++ b/evo_kit/core/include/evo_kit/sgd_optimizer.h
@@ -0,0 +1,46 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EVO_KIT_SGD_OPTIMIZER_H
+#define EVO_KIT_SGD_OPTIMIZER_H
+
+#include <cstring>
+#include <unordered_map>
+#include "evo_kit/optimizer.h"
+
+namespace evo_kit {
+
+/*@brief SGDOptimizer.
+ * Implements stochastic gradient descent (optionally with momentum).
+ *
+ *@Args:
+ * base_lr: learning rate (default: 1e-3).
+ * momentum: momentum factor (default: 0.9).
+ */
+class SGDOptimizer: public Optimizer {
+public:
+ SGDOptimizer(float base_lr, float momentum = 0.9): Optimizer(base_lr), _momentum(momentum) {}
+ ~SGDOptimizer();
+
+protected:
+ void compute_step(float* gradient, int size, std::string param_name);
+
+private:
+ float _momentum;
+ std::unordered_map<std::string, float*> _velocity;
+};
+
+} // namespace
+
+#endif
diff --git a/evo_kit/core/include/evo_kit/utils.h b/evo_kit/core/include/evo_kit/utils.h
new file mode 100644
index 0000000000000000000000000000000000000000..fd704fd384de70683445d65d5609f97b9979907a
--- /dev/null
+++ b/evo_kit/core/include/evo_kit/utils.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EVO_KIT_UTILS_H
+#define EVO_KIT_UTILS_H
+
+#include <glog/logging.h>
+#include <google/protobuf/text_format.h>
+#include <fstream>
+#include <string>
+#include <vector>
+#include "evo_kit/evo_kit.pb.h"
+
+namespace evo_kit {
+
+/*Return ranks that are normalized to [-0.5, 0.5], computed from the input rewards.
+ Args:
+ reward: an array of rewards
+*/
+bool compute_centered_ranks(std::vector<float>& reward);
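+// Illustration (hypothetical values): rewards {3.0, 1.0, 2.0} are replaced in place
+// by their centered ranks {0.5, -0.5, 0.0}.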
+
+std::string read_file(const std::string& filename);
+
+/* Load a protobuf-based configuration from the file.
+ * Args:
+ *   config_file: file path.
+ *   proto_config: protobuf message for the configuration.
+ * Return: whether the configuration was loaded successfully.
+ */
+template <typename T>
+bool load_proto_conf(const std::string& config_file, T& proto_config) {
+ bool success = true;
+ std::ifstream fin(config_file);
+
+ if (!fin || fin.fail()) {
+ LOG(ERROR) << "open prototxt config failed: " << config_file;
+ success = false;
+ } else {
+ fin.seekg(0, std::ios::end);
+ size_t file_size = fin.tellg();
+ fin.seekg(0, std::ios::beg);
+
+ char* file_content_buffer = new char[file_size];
+ fin.read(file_content_buffer, file_size);
+
+ std::string proto_str(file_content_buffer, file_size);
+
+ if (!google::protobuf::TextFormat::ParseFromString(proto_str, &proto_config)) {
+ LOG(ERROR) << "Failed to load config: " << config_file;
+ success = false;
+ }
+
+ delete[] file_content_buffer;
+ fin.close();
+ }
+
+ return success;
+}
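+
+// Illustrative usage (the path is a placeholder):
+//   evo_kit::EvoKitConfig config;
+//   bool ok = load_proto_conf("./demo/cartpole_config.prototxt", config);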
+
+template <typename T>
+bool save_proto_conf(const std::string& config_file, T& proto_config) {
+ bool success = true;
+ std::ofstream ofs(config_file, std::ofstream::out);
+
+ if (!ofs || ofs.fail()) {
+ LOG(ERROR) << "open prototxt config failed: " << config_file;
+ success = false;
+ } else {
+ std::string config_str;
+ success = google::protobuf::TextFormat::PrintToString(proto_config, &config_str);
+
+ if (!success) {
+ return success;
+ }
+
+ ofs << config_str;
+ }
+
+ return success;
+}
+
+std::vector<std::string> list_all_model_dirs(std::string path);
+
+}
+
+#endif
diff --git a/evo_kit/core/proto/evo_kit/evo_kit.proto b/evo_kit/core/proto/evo_kit/evo_kit.proto
new file mode 100644
index 0000000000000000000000000000000000000000..fc4f68d9247e63b1d98b35ebd338052ffb7eb9a6
--- /dev/null
+++ b/evo_kit/core/proto/evo_kit/evo_kit.proto
@@ -0,0 +1,57 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+syntax = "proto2";
+
+package evo_kit;
+
+message EvoKitConfig {
+ //sampling configuration
+ optional int32 seed = 1 [default = 18];
+ optional int32 buffer_size = 2 [default = 100000];
+ optional GaussianSamplingConfig gaussian_sampling = 3;
+ // Optimizer Configuration
+ optional OptimizerConfig optimizer = 4;
+ // AsyncESAgent Configuration
+ optional AsyncESConfig async_es = 5;
+}
+
+message GaussianSamplingConfig {
+ optional float std = 1 [default = 1.0];
+ optional bool cached = 2 [default = false];
+ optional int32 cache_size = 3 [default = 100000];
+}
+
+message OptimizerConfig{
+ optional string type = 1 [default = "SGD"];
+ optional float base_lr = 2 [default = 1e-3]; // The base learning rate.
+ optional float momentum = 3 [default = 0.9]; // The momentum value for SGD.
+
+ // ------------Adam Optimizer---------
+ optional float beta1 = 4 [default = 0.9];
+ optional float beta2 = 5 [default = 0.999];
+ optional float epsilon = 6 [default = 1e-8];
+}
+
+message SamplingInfo{
+ repeated int32 key = 1;
+ optional int32 model_iter_id = 2;
+}
+
+message AsyncESConfig{
+ optional string model_warehouse = 1 [default = "./model_warehouse"];
+ repeated string model_md5 = 2;
+ optional int32 max_to_keep = 3 [default = 5];
+ optional int32 model_iter_id = 4 [default = 0];
+}
diff --git a/evo_kit/core/src/adam_optimizer.cc b/evo_kit/core/src/adam_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..44f36e4d1d3e01ae2cceeba16d95d7aaa24a2c09
--- /dev/null
+++ b/evo_kit/core/src/adam_optimizer.cc
@@ -0,0 +1,55 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "evo_kit/adam_optimizer.h"
+
+namespace evo_kit {
+
+AdamOptimizer::~AdamOptimizer() {
+ for (auto iter = _momentum.begin(); iter != _momentum.end(); iter++) {
+ delete[] iter->second;
+ }
+
+ for (auto iter = _velocity.begin(); iter != _velocity.end(); iter++) {
+ delete[] iter->second;
+ }
+
+ _momentum.clear();
+ _velocity.clear();
+}
+
+void AdamOptimizer::compute_step(float* gradient, int size, std::string param_name = "") {
+ if (_momentum.count(param_name) == 0) {
+ _momentum[param_name] = new float [size];
+ memset(_momentum[param_name], 0, size * sizeof(float));
+ }
+
+ if (_velocity.count(param_name) == 0) {
+ _velocity[param_name] = new float [size];
+ memset(_velocity[param_name], 0, size * sizeof(float));
+ }
+
+ int true_update_times = int(_update_times / _velocity.size());
+ float alpha = std::sqrt(1 - std::pow(_beta2, _update_times)) / (1 - std::pow(_beta1,
+ _update_times));
+
+ for (int i = 0; i < size; ++i) {
+ _momentum[param_name][i] = _beta1 * _momentum[param_name][i] + (1 - _beta1) * gradient[i];
+ _velocity[param_name][i] = _beta2 * _velocity[param_name][i] + (1 - _beta2) * gradient[i] *
+ gradient[i];
+ gradient[i] = alpha * _momentum[param_name][i] / (std::sqrt(_velocity[param_name][i]) + _epsilon);
+ }
+}
+
+}//namespace
diff --git a/evo_kit/core/src/cached_gaussian_sampling.cc b/evo_kit/core/src/cached_gaussian_sampling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..844eca20e2935c4b5e7ac39d5fa07df1c2b13913
--- /dev/null
+++ b/evo_kit/core/src/cached_gaussian_sampling.cc
@@ -0,0 +1,121 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "evo_kit/cached_gaussian_sampling.h"
+
+namespace evo_kit {
+
+CachedGaussianSampling::CachedGaussianSampling() {}
+
+CachedGaussianSampling::~CachedGaussianSampling() {
+ delete[] _noise_cache;
+}
+
+bool CachedGaussianSampling::load_config(const EvoKitConfig& config) {
+ bool success = true;
+ _std = config.gaussian_sampling().std();
+ success = set_seed(config.seed());
+ CHECK(success) << "[EvoKit] Fail to set seed while load config.";
+ _cache_size = config.gaussian_sampling().cache_size();
+ _noise_cache = new float [_cache_size];
+ memset(_noise_cache, 0, _cache_size * sizeof(float));
+ success = _create_noise_cache();
+ CHECK(success) << "[EvoKit] Fail to create noise_cache while load config.";
+ return success;
+}
+
+bool CachedGaussianSampling::sampling(int* key, float* noise, int64_t size) {
+ bool success = true;
+
+ if (_noise_cache == nullptr) {
+ LOG(ERROR) << "[EvoKit] Please use load_config() first.";
+ success = false;
+ return success;
+ }
+
+ if (noise == nullptr) {
+ LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
+ success = false;
+ return success;
+ }
+
+ if ((size >= _cache_size) || (size < 0)) {
+ LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size <<
+ "), cache_size: " << _cache_size;
+ success = false;
+ return success;
+ }
+
+ int rand_key = rand();
+ std::default_random_engine generator(rand_key);
+ std::uniform_int_distribution<int> uniform(0, _cache_size - size);
+ int index = uniform(generator);
+ *key = index;
+
+ for (int64_t i = 0; i < size; ++i) {
+ *(noise + i) = *(_noise_cache + index + i);
+ }
+
+ return success;
+}
+
+bool CachedGaussianSampling::resampling(int key, float* noise, int64_t size) {
+ bool success = true;
+
+ if (_noise_cache == nullptr) {
+ LOG(ERROR) << "[EvoKit] Please use load_config() first.";
+ success = false;
+ return success;
+ }
+
+ if (noise == nullptr) {
+ LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
+ success = false;
+ return success;
+ }
+
+ if ((size >= _cache_size) || (size < 0)) {
+ LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size <<
+ "), cache_size: " << _cache_size;
+ success = false;
+ return success;
+ }
+
+ if ((key > _cache_size - size) || (key < 0)) {
+ LOG(ERROR) << "[EvoKit] Resampling key " << key << " is out of bounds [0, "
+ << _cache_size - size <<
+ "], cache_size: " << _cache_size << ", size: " << size;
+ success = false;
+ return success;
+ }
+
+ for (int64_t i = 0; i < size; ++i) {
+ *(noise + i) = *(_noise_cache + key + i);
+ }
+
+ return success;
+}
+
+bool CachedGaussianSampling::_create_noise_cache() {
+ std::default_random_engine generator(_seed);
+ std::normal_distribution<float> norm;
+
+ for (int64_t i = 0; i < _cache_size; ++i) {
+ *(_noise_cache + i) = norm(generator) * _std;
+ }
+
+ return true;
+}
+
+}
diff --git a/evo_kit/core/src/gaussian_sampling.cc b/evo_kit/core/src/gaussian_sampling.cc
new file mode 100644
index 0000000000000000000000000000000000000000..776c2c4da940fafd23e073dd97002876ddfc8673
--- /dev/null
+++ b/evo_kit/core/src/gaussian_sampling.cc
@@ -0,0 +1,65 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "evo_kit/gaussian_sampling.h"
+
+namespace evo_kit {
+
+bool GaussianSampling::load_config(const EvoKitConfig& config) {
+ bool success = true;
+ _std = config.gaussian_sampling().std();
+ success = set_seed(config.seed());
+ return success;
+}
+
+bool GaussianSampling::sampling(int* key, float* noise, int64_t size) {
+ bool success = true;
+
+ if (noise == nullptr) {
+ LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
+ success = false;
+ return success;
+ }
+
+ int rand_key = rand();
+ *key = rand_key;
+ std::default_random_engine generator(rand_key);
+ std::normal_distribution<float> norm;
+
+ for (int64_t i = 0; i < size; ++i) {
+ *(noise + i) = norm(generator) * _std;
+ }
+
+ return success;
+}
+
+bool GaussianSampling::resampling(int key, float* noise, int64_t size) {
+ bool success = true;
+
+ if (noise == nullptr) {
+ LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
+ success = false;
+ } else {
+ std::default_random_engine generator(key);
+ std::normal_distribution<float> norm;
+
+ for (int64_t i = 0; i < size; ++i) {
+ *(noise + i) = norm(generator) * _std;
+ }
+ }
+
+ return success;
+}
+
+}
diff --git a/evo_kit/core/src/optimizer_factory.cc b/evo_kit/core/src/optimizer_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6137d623fc1b023cc8d8edc8c988aced66a482c0
--- /dev/null
+++ b/evo_kit/core/src/optimizer_factory.cc
@@ -0,0 +1,39 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "evo_kit/optimizer_factory.h"
+
+namespace evo_kit {
+
+std::shared_ptr<Optimizer> create_optimizer(const OptimizerConfig& optimizer_config) {
+    std::shared_ptr<Optimizer> optimizer;
+ std::string opt_type = optimizer_config.type();
+ std::transform(opt_type.begin(), opt_type.end(), opt_type.begin(), ::tolower);
+
+ if (opt_type == "sgd") {
+ optimizer = std::make_shared(optimizer_config.base_lr(), \
+ optimizer_config.momentum());
+ } else if (opt_type == "adam") {
+ optimizer = std::make_shared(optimizer_config.base_lr(), \
+ optimizer_config.beta1(), \
+ optimizer_config.beta2(), \
+ optimizer_config.epsilon());
+ } else {
+ LOG(ERROR) << "type of OptimizerConfig must be SGD or Adam."; // NotImplementedError
+ }
+
+ return optimizer;
+}
+
+}//namespace
diff --git a/evo_kit/core/src/sampling_factory.cc b/evo_kit/core/src/sampling_factory.cc
new file mode 100644
index 0000000000000000000000000000000000000000..8a0b8109a61a6ecaa80d82b8a8042c89574ea5a6
--- /dev/null
+++ b/evo_kit/core/src/sampling_factory.cc
@@ -0,0 +1,41 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "evo_kit/sampling_factory.h"
+
+namespace evo_kit {
+
+
+std::shared_ptr<SamplingMethod> create_sampling_method(const EvoKitConfig& config) {
+    std::shared_ptr<SamplingMethod> sampling_method;
+    bool cached = config.gaussian_sampling().cached();
+
+    if (cached) {
+        sampling_method = std::make_shared<CachedGaussianSampling>();
+    } else {
+        sampling_method = std::make_shared<GaussianSampling>();
+    }
+
+ bool success = sampling_method->load_config(config);
+
+ if (success) {
+ return sampling_method;
+ } else {
+ LOG(ERROR) << "[EvoKit] Fail to create sampling_method";
+ return nullptr;
+ }
+
+}
+
+}//namespace
diff --git a/evo_kit/core/src/sgd_optimizer.cc b/evo_kit/core/src/sgd_optimizer.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0b3174bffa3d7b3f3b353b18aab8eb428ba70437
--- /dev/null
+++ b/evo_kit/core/src/sgd_optimizer.cc
@@ -0,0 +1,40 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "evo_kit/sgd_optimizer.h"
+
+namespace evo_kit {
+
+SGDOptimizer::~SGDOptimizer() {
+ for (auto iter = _velocity.begin(); iter != _velocity.end(); iter++) {
+ delete[] iter->second;
+ }
+
+ _velocity.clear();
+}
+
+void SGDOptimizer::compute_step(float* gradient, int size, std::string param_name = "") {
+ if (_velocity.count(param_name) == 0) {
+ _velocity[param_name] = new float [size];
+ memset(_velocity[param_name], 0, size * sizeof(float));
+ }
+
+ for (int i = 0; i < size; ++i) {
+ _velocity[param_name][i] = _momentum * _velocity[param_name][i] + (1 - _momentum) * gradient[i];
+ gradient[i] = _velocity[param_name][i];
+ }
+}
+
+
+}//namespace
diff --git a/evo_kit/core/src/utils.cc b/evo_kit/core/src/utils.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e47b7d097f0f164a83fb96f6ae538e5a5f2370ea
--- /dev/null
+++ b/evo_kit/core/src/utils.cc
@@ -0,0 +1,81 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "evo_kit/utils.h"
+#include <dirent.h>
+#include <sstream>
+
+namespace evo_kit {
+
+bool compute_centered_ranks(std::vector<float>& reward) {
+    std::vector<std::pair<float, int>> reward_index;
+ float gap = 1.0 / (reward.size() - 1);
+ float normlized_rank = -0.5;
+ int id = 0;
+
+ for (auto& rew : reward) {
+ reward_index.push_back(std::make_pair(rew, id));
+ ++id;
+ }
+
+ std::sort(reward_index.begin(), reward_index.end());
+
+ for (int i = 0; i < reward.size(); ++i) {
+ id = reward_index[i].second;
+ reward[id] = normlized_rank;
+ normlized_rank += gap;
+ }
+
+ return true;
+}
+
+std::vector<std::string> list_all_model_dirs(std::string path) {
+    std::vector<std::string> model_dirs;
+ DIR* dpdf;
+ struct dirent* epdf;
+ dpdf = opendir(path.data());
+
+    if (dpdf != NULL) {
+        while ((epdf = readdir(dpdf)) != NULL) {
+            std::string dir(epdf->d_name);
+
+            if (dir.find("model_iter_id") != std::string::npos) {
+                model_dirs.push_back(path + "/" + dir);
+            }
+        }
+
+        // only close the directory handle if it was successfully opened
+        closedir(dpdf);
+    }
+
+    return model_dirs;
+}
+
+std::string read_file(const std::string& filename) {
+ std::ifstream ifile(filename.c_str());
+
+ if (!ifile.is_open()) {
+ LOG(ERROR) << "Open file: [" << filename << "] failed.";
+ return "";
+ }
+
+ std::ostringstream buf;
+ char ch = '\n';
+
+ while (buf && ifile.get(ch)) {
+ buf.put(ch);
+ }
+
+ ifile.close();
+ return buf.str();
+}
+
+}//namespace
diff --git a/evo_kit/demo/cartpole_config.prototxt b/evo_kit/demo/cartpole_config.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..2707cb60171a47675f1f5a0625de487ad04904f5
--- /dev/null
+++ b/evo_kit/demo/cartpole_config.prototxt
@@ -0,0 +1,17 @@
+seed: 1024
+gaussian_sampling {
+ std: 0.5
+ cached: true
+ cache_size: 100000
+}
+optimizer {
+ type: "Adam"
+ base_lr: 0.05
+ momentum: 0.9
+ beta1: 0.9
+ beta2: 0.999
+ epsilon: 1e-08
+}
+async_es {
+ model_iter_id: 0
+}
diff --git a/evo_kit/demo/paddle/cartpole_async_solver.cc b/evo_kit/demo/paddle/cartpole_async_solver.cc
new file mode 100644
index 0000000000000000000000000000000000000000..22d2507de2ea7f6684e8d835f78f88efd8fc5eb2
--- /dev/null
+++ b/evo_kit/demo/paddle/cartpole_async_solver.cc
@@ -0,0 +1,134 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+#include <omp.h>
+#include <algorithm>
+#include "evo_kit/async_es_agent.h"
+#include "cartpole.h"
+#include "paddle_api.h"
+
+using namespace evo_kit;
+using namespace paddle::lite_api;
+
+const int ITER = 10;
+
+// Use PaddlePredictor of CartPole model to predict the action.
+std::vector<float> forward(std::shared_ptr<PaddlePredictor> predictor, const float* obs) {
+    std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+ input_tensor->Resize({1, 4});
+ input_tensor->CopyFromCpu(obs);
+
+ predictor->Run();
+
+    std::vector<float> probs(2, 0.0);
+    std::unique_ptr<const Tensor> output_tensor(
+        std::move(predictor->GetOutput(0)));
+ output_tensor->CopyToCpu(probs.data());
+ return probs;
+}
+
+int arg_max(const std::vector<float>& vec) {
+ return static_cast(std::distance(vec.begin(), std::max_element(vec.begin(), vec.end())));
+}
+
+
+float evaluate(CartPole& env, std::shared_ptr<AsyncESAgent> agent) {
+ float total_reward = 0.0;
+ env.reset();
+ const float* obs = env.getState();
+
+ std::shared_ptr<PaddlePredictor> paddle_predictor;
+ paddle_predictor = agent->get_predictor();
+
+ while (true) {
+ std::vector<float> probs = forward(paddle_predictor, obs);
+ int act = arg_max(probs);
+ env.step(act);
+ float reward = env.getReward();
+ bool done = env.isDone();
+ total_reward += reward;
+
+ if (done) {
+ break;
+ }
+
+ obs = env.getState();
+ }
+
+ return total_reward;
+}
+
+
+int main(int argc, char* argv[]) {
+ std::vector<CartPole> envs;
+
+ for (int i = 0; i < ITER; ++i) {
+ envs.push_back(CartPole());
+ }
+
+    std::shared_ptr<AsyncESAgent> agent =
+        std::make_shared<AsyncESAgent>("./demo/paddle/cartpole_init_model",
+ "./demo/cartpole_config.prototxt");
+
+ // Clone agents to sample (explore).
+ std::vector< std::shared_ptr<AsyncESAgent> > sampling_agents;
+
+ for (int i = 0; i < ITER; ++i) {
+ sampling_agents.push_back(agent->clone());
+ }
+
+    std::vector<SamplingInfo> noisy_info;
+    std::vector<SamplingInfo> last_noisy_info;
+    std::vector<float> noisy_rewards(ITER, 0.0f);
+    std::vector<float> last_noisy_rewards;
+ noisy_info.resize(ITER);
+
+ omp_set_num_threads(10);
+
+ for (int epoch = 0; epoch < 100; ++epoch) {
+ last_noisy_info.clear();
+ last_noisy_rewards.clear();
+
+ if (epoch != 0) {
+ for (int i = 0; i < ITER; ++i) {
+ last_noisy_info.push_back(noisy_info[i]);
+ last_noisy_rewards.push_back(noisy_rewards[i]);
+ }
+ }
+
+ #pragma omp parallel for schedule(dynamic, 1)
+
+ for (int i = 0; i < ITER; ++i) {
+ std::shared_ptr<AsyncESAgent> sampling_agent = sampling_agents[i];
+ SamplingInfo info;
+ bool success = sampling_agent->add_noise(info);
+ float reward = evaluate(envs[i], sampling_agent);
+
+ noisy_info[i] = info;
+ noisy_rewards[i] = reward;
+ }
+
+ for (int i = 0; i < ITER; ++i) {
+ last_noisy_info.push_back(noisy_info[i]);
+ last_noisy_rewards.push_back(noisy_rewards[i]);
+ }
+
+ // NOTE: all parameters of sampling_agents will be updated
+ bool success = agent->update(last_noisy_info, last_noisy_rewards);
+
+ int reward = evaluate(envs[0], agent);
+ LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward;
+ }
+}
diff --git a/evo_kit/demo/paddle/cartpole_init_model.zip b/evo_kit/demo/paddle/cartpole_init_model.zip
new file mode 100644
index 0000000000000000000000000000000000000000..16a7720959786471f8f500e7aa031615d53a1928
Binary files /dev/null and b/evo_kit/demo/paddle/cartpole_init_model.zip differ
diff --git a/evo_kit/demo/paddle/cartpole_solver_parallel.cc b/evo_kit/demo/paddle/cartpole_solver_parallel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..33aa89990f23c744f494b9d9d75002103a0bfbcc
--- /dev/null
+++ b/evo_kit/demo/paddle/cartpole_solver_parallel.cc
@@ -0,0 +1,115 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+#include <omp.h>
+#include <algorithm>
+#include "cartpole.h"
+#include "evo_kit/es_agent.h"
+#include "paddle_api.h"
+
+using namespace evo_kit;
+using namespace paddle::lite_api;
+
+const int ITER = 10;
+
+// Use PaddlePredictor of CartPole model to predict the action.
+std::vector<float> forward(std::shared_ptr<PaddlePredictor> predictor, const float* obs) {
+ std::unique_ptr<Tensor> input_tensor(std::move(predictor->GetInput(0)));
+ input_tensor->Resize({1, 4});
+ input_tensor->CopyFromCpu(obs);
+
+ predictor->Run();
+
+ std::vector<float> probs(2, 0.0);
+ std::unique_ptr<const Tensor> output_tensor(
+ std::move(predictor->GetOutput(0)));
+ output_tensor->CopyToCpu(probs.data());
+ return probs;
+}
+
+int arg_max(const std::vector<float>& vec) {
+ return static_cast<int>(std::distance(vec.begin(), std::max_element(vec.begin(), vec.end())));
+}
+
+
+float evaluate(CartPole& env, std::shared_ptr<ESAgent> agent) {
+ float total_reward = 0.0;
+ env.reset();
+ const float* obs = env.getState();
+
+ std::shared_ptr<PaddlePredictor> paddle_predictor;
+ paddle_predictor = agent->get_predictor();
+
+ while (true) {
+ std::vector<float> probs = forward(paddle_predictor, obs);
+ int act = arg_max(probs);
+ env.step(act);
+ float reward = env.getReward();
+ bool done = env.isDone();
+ total_reward += reward;
+
+ if (done) {
+ break;
+ }
+
+ obs = env.getState();
+ }
+
+ return total_reward;
+}
+
+
+int main(int argc, char* argv[]) {
+ std::vector<CartPole> envs;
+
+ for (int i = 0; i < ITER; ++i) {
+ envs.push_back(CartPole());
+ }
+
+ std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>("./demo/paddle/cartpole_init_model",
+ "./demo/cartpole_config.prototxt");
+
+ // Clone agents to sample (explore).
+ std::vector<std::shared_ptr<ESAgent>> sampling_agents;
+
+ for (int i = 0; i < ITER; ++i) {
+ sampling_agents.push_back(agent->clone());
+ }
+
+ std::vector<SamplingInfo> noisy_keys;
+ std::vector<float> noisy_rewards(ITER, 0.0f);
+ noisy_keys.resize(ITER);
+
+ omp_set_num_threads(10);
+
+ for (int epoch = 0; epoch < 100; ++epoch) {
+ #pragma omp parallel for schedule(dynamic, 1)
+ for (int i = 0; i < ITER; ++i) {
+ std::shared_ptr<ESAgent> sampling_agent = sampling_agents[i];
+ SamplingInfo key;
+ bool success = sampling_agent->add_noise(key);
+ float reward = evaluate(envs[i], sampling_agent);
+
+ noisy_keys[i] = key;
+ noisy_rewards[i] = reward;
+ }
+
+ // NOTE: all parameters of sampling_agents will be updated
+ bool success = agent->update(noisy_keys, noisy_rewards);
+
+ int reward = evaluate(envs[0], agent);
+ LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward;
+ }
+}
diff --git a/evo_kit/demo/paddle/gen_cartpole_init_model.py b/evo_kit/demo/paddle/gen_cartpole_init_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..62228b4f0cf953ffa3c1d11ae7bfd949c3e93925
--- /dev/null
+++ b/evo_kit/demo/paddle/gen_cartpole_init_model.py
@@ -0,0 +1,41 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+from paddle import fluid
+
+
+def net(obs, act_dim):
+ hid1_size = act_dim * 10
+ hid1 = fluid.layers.fc(obs, size=hid1_size)
+ prob = fluid.layers.fc(hid1, size=act_dim, act='softmax')
+ return prob
+
+
+if __name__ == '__main__':
+ obs_dim = 4
+ act_dim = 2
+
+ obs = fluid.layers.data(name="obs", shape=[obs_dim], dtype='float32')
+
+ prob = net(obs, act_dim)
+
+ exe = fluid.Executor(fluid.CPUPlace())
+ exe.run(fluid.default_startup_program())
+ fluid.io.save_inference_model(
+ dirname='cartpole_init_model',
+ feeded_var_names=['obs'],
+ target_vars=[prob],
+ params_filename='params',
+ model_filename='model',
+ executor=exe)
diff --git a/evo_kit/demo/torch/CMakeLists.txt b/evo_kit/demo/torch/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..9ece945581c57a4c0c05fa38d007b00b7266392e
--- /dev/null
+++ b/evo_kit/demo/torch/CMakeLists.txt
@@ -0,0 +1,32 @@
+cmake_minimum_required (VERSION 2.6)
+project (EvoKit_demo)
+set(TARGET parallel_main)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+find_package(OpenMP)
+if (OPENMP_FOUND)
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+endif()
+
+list(APPEND CMAKE_PREFIX_PATH "./libtorch")
+find_package(Torch REQUIRED ON)
+set(demo "${PROJECT_SOURCE_DIR}/cartpole_solver_parallel.cc")
+
+
+########## main ##########
+add_executable(${TARGET} ${demo} ${framework_src})
+target_link_libraries(${TARGET} gflags protobuf pthread glog)
+
+########## Torch libraries ##########
+target_link_libraries(${TARGET} "${TORCH_LIBRARIES}")
+
+
+########## EvoKit libraries ##########
+list(APPEND CMAKE_PREFIX_PATH "./libevokit/cmake/Torch")
+find_package(EvoKit)
+target_link_libraries(${TARGET} "${EVOKIT_LIBRARY}")
diff --git a/evo_kit/demo/torch/cartpole_solver_parallel.cc b/evo_kit/demo/torch/cartpole_solver_parallel.cc
new file mode 100644
index 0000000000000000000000000000000000000000..6c8f4c821c4b92e69b4755a1126296853a731102
--- /dev/null
+++ b/evo_kit/demo/torch/cartpole_solver_parallel.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <glog/logging.h>
+#include <omp.h>
+#include <algorithm>
+#include <memory>
+#include <vector>
+#include "evo_kit/gaussian_sampling.h"
+#include "evo_kit/es_agent.h"
+#include "cartpole.h"
+#include "model.h"
+
+using namespace evo_kit;
+const int ITER = 10;
+
+float evaluate(CartPole& env, std::shared_ptr<ESAgent<Model>> agent) {
+ float total_reward = 0.0;
+ env.reset();
+ const float* obs = env.getState();
+ while (true) {
+ torch::Tensor obs_tensor = torch::tensor({obs[0], obs[1], obs[2], obs[3]});
+ torch::Tensor action = agent->predict(obs_tensor);
+ int act = std::get<1>(action.max(-1)).item<int>();
+ env.step(act);
+ float reward = env.getReward();
+ auto done = env.isDone();
+ total_reward += reward;
+ if (done) break;
+ obs = env.getState();
+ }
+ return total_reward;
+}
+
+int main(int argc, char* argv[]) {
+ //google::InitGoogleLogging(argv[0]);
+ std::vector<CartPole> envs;
+ for (int i = 0; i < ITER; ++i) {
+ envs.push_back(CartPole());
+ }
+
+ auto model = std::make_shared<Model>(4, 2);
+ std::shared_ptr<ESAgent<Model>> agent = std::make_shared<ESAgent<Model>>(model,
+ "./cartpole_config.prototxt");
+
+ // Clone agents to sample (explore).
+ std::vector<std::shared_ptr<ESAgent<Model>>> sampling_agents;
+ for (int i = 0; i < ITER; ++i) {
+ sampling_agents.push_back(agent->clone());
+ }
+
+ std::vector<SamplingInfo> noisy_info;
+ std::vector<float> noisy_rewards(ITER, 0.0f);
+ noisy_info.resize(ITER);
+
+ for (int epoch = 0; epoch < 100; ++epoch) {
+#pragma omp parallel for schedule(dynamic, 1)
+ for (int i = 0; i < ITER; ++i) {
+ auto sampling_agent = sampling_agents[i];
+ SamplingInfo info;
+ bool success = sampling_agent->add_noise(info);
+ float reward = evaluate(envs[i], sampling_agent);
+ noisy_info[i] = info;
+ noisy_rewards[i] = reward;
+ }
+
+ // Will also update parameters of sampling_agents
+ bool success = agent->update(noisy_info, noisy_rewards);
+
+ // Use original agent to evalute (without noise).
+ int reward = evaluate(envs[0], agent);
+ LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward;
+ }
+}
diff --git a/evo_kit/demo/torch/model.h b/evo_kit/demo/torch/model.h
new file mode 100644
index 0000000000000000000000000000000000000000..27373ceffd66bffd9d8a047a2e4fc5fe3a14005a
--- /dev/null
+++ b/evo_kit/demo/torch/model.h
@@ -0,0 +1,61 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _MODEL_H
+#define _MODEL_H
+
+#include <torch/torch.h>
+
+struct Model : public torch::nn::Module{
+
+ Model() = delete;
+
+ Model(const int obs_dim, const int act_dim) {
+
+ _obs_dim = obs_dim;
+ _act_dim = act_dim;
+ int hid1_size = act_dim * 10;
+ fc1 = register_module("fc1", torch::nn::Linear(obs_dim, hid1_size));
+ fc2 = register_module("fc2", torch::nn::Linear(hid1_size, act_dim));
+ }
+
+ torch::Tensor forward(torch::Tensor x) {
+ x = x.reshape({-1, _obs_dim});
+ x = torch::tanh(fc1->forward(x));
+ x = torch::softmax(fc2->forward(x), 1);
+ return x;
+ }
+
+ std::shared_ptr clone() {
+ std::shared_ptr model = std::make_shared(_obs_dim, _act_dim);
+ std::vector<torch::Tensor> parameters1 = parameters();
+ std::vector<torch::Tensor> parameters2 = model->parameters();
+ for (int i = 0; i < parameters1.size(); ++i) {
+ torch::Tensor src = parameters1[i].view({-1});
+ torch::Tensor des = parameters2[i].view({-1});
+ auto src_a = src.accessor<float, 1>();
+ auto des_a = des.accessor<float, 1>();
+ for (int j = 0; j < src.size(0); ++j) {
+ des_a[j] = src_a[j];
+ }
+ }
+ return model;
+ }
+
+ int _act_dim;
+ int _obs_dim;
+ torch::nn::Linear fc1{nullptr}, fc2{nullptr};
+};
+
+#endif
diff --git a/evo_kit/paddle/include/evo_kit/async_es_agent.h b/evo_kit/paddle/include/evo_kit/async_es_agent.h
new file mode 100644
index 0000000000000000000000000000000000000000..a8558820bb86f7d4a6f084aea456e2c9a79ed762
--- /dev/null
+++ b/evo_kit/paddle/include/evo_kit/async_es_agent.h
@@ -0,0 +1,101 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EVO_KIT_ASYNC_ES_AGENT_H
+#define EVO_KIT_ASYNC_ES_AGENT_H
+
+#include <memory>
+#include <unordered_map>
+#include "evo_kit/es_agent.h"
+
+namespace evo_kit {
+/* EvoKit agent with PaddleLite as backend. This agent supports asynchronous update.
+ * Users mainly focus on the following functions:
+ * 1. clone: clone an agent for multi-thread evaluation
+ * 2. add_noise: add noise into parameters.
+ * 3. update: update parameters given data collected during evaluation.
+ */
+class AsyncESAgent: public ESAgent {
+public:
+ AsyncESAgent() {}
+
+ ~AsyncESAgent();
+
+ /**
+ * @args:
+ * model_dir: the directory of the initial model provided by the user.
+ * config_path: the path of the configuration file.
+ * Note that AsyncESAgent will update the configuration file after calling the update function.
+ * Please use the up-to-date configuration.
+ */
+ AsyncESAgent(
+ const std::string& model_dir,
+ const std::string& config_path);
+
+ /**
+ * @brief: Clone an agent for sampling.
+ */
+ std::shared_ptr<AsyncESAgent> clone();
+
+ /**
+ * @brief: update parameters given data collected during evaluation.
+ * @args:
+ * noisy_info: sampling information returned by add_noise function.
+ * noisy_reward: evaluation rewards.
+ */
+ bool update(
+ std::vector<SamplingInfo>& noisy_info,
+ std::vector<float>& noisy_rewards);
+
+private:
+ std::unordered_map<int, std::shared_ptr<PaddlePredictor>> _previous_predictors;
+ std::unordered_map<int, float*> _param_delta;
+ std::string _config_path;
+
+ /**
+ * @brief: parse model_iter_id given a string of model directory.
+ * @return: an integer indicating the model_iter_id
+ */
+ int _parse_model_iter_id(const std::string&);
+
+ /**
+ * @brief: compute the distance between current parameter and previous models.
+ */
+ bool _compute_model_diff();
+
+ /**
+ * @brief: remove expired models to avoid overuse of disk space.
+ * @args:
+ * max_to_keep: the maximum number of models to keep locally.
+ */
+ bool _remove_expired_model(int max_to_keep);
+
+ /**
+ * @brief: save up-to-date parameters to the disk.
+ */
+ bool _save();
+
+ /**
+ * @brief: load all models in the model warehouse.
+ */
+ bool _load();
+
+ /**
+ * @brief: load a model given the model directory.
+ */
+ std::shared_ptr<PaddlePredictor> _load_previous_model(std::string model_dir);
+};
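+
+// A minimal usage sketch, kept as a comment so it does not affect the build.
+// `run_episode` and the two paths below are placeholders, not part of this header:
+//
+//   auto agent = std::make_shared<AsyncESAgent>("./model_dir", "./config.prototxt");
+//   auto sampler = agent->clone();      // per-thread sampling agent
+//   SamplingInfo info;
+//   sampler->add_noise(info);           // perturb the cloned parameters
+//   float reward = run_episode(sampler->get_predictor());
+//   std::vector<SamplingInfo> infos = {info};
+//   std::vector<float> rewards = {reward};
+//   agent->update(infos, rewards);      // only the original (non-cloned) agent updates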
+
+} // namespace
+#endif
diff --git a/evo_kit/paddle/include/evo_kit/es_agent.h b/evo_kit/paddle/include/evo_kit/es_agent.h
new file mode 100644
index 0000000000000000000000000000000000000000..9a256712a3d99be12ff4a9f409298602192ec21e
--- /dev/null
+++ b/evo_kit/paddle/include/evo_kit/es_agent.h
@@ -0,0 +1,103 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef EVO_KIT_DEEPES_PADDLE_ES_AGENT_H_
+#define EVO_KIT_DEEPES_PADDLE_ES_AGENT_H_
+
+#include <memory>
+#include "evo_kit/evo_kit.pb.h"
+#include "evo_kit/optimizer_factory.h"
+#include "evo_kit/sampling_factory.h"
+#include "evo_kit/utils.h"
+#include "paddle_api.h"
+
+namespace evo_kit {
+
+typedef paddle::lite_api::PaddlePredictor PaddlePredictor;
+typedef paddle::lite_api::CxxConfig CxxConfig;
+typedef paddle::lite_api::Tensor Tensor;
+
+int64_t ShapeProduction(const paddle::lite_api::shape_t& shape);
+
+/**
+ * @brief EvoKit agent with PaddleLite as backend.
+ * Users mainly focus on the following functions:
+ * 1. clone: clone an agent for multi-thread evaluation
+ * 2. add_noise: add noise into parameters.
+ * 3. update: update parameters given data collected during evaluation.
+ *
+ */
+class ESAgent {
+public:
+ ESAgent() {}
+
+ ~ESAgent();
+
+ ESAgent(const std::string& model_dir, const std::string& config_path);
+
+ /**
+ * @brief Clone a sampling agent
+ *
+ * Only cloned ESAgent can call `add_noise` function.
+ * Each cloned ESAgent will have a copy of original parameters.
+ * (support sampling in multi-thread way)
+ */
+ std::shared_ptr<ESAgent> clone();
+
+ /**
+ * @brief Update parameters of predictor based on ES algorithm.
+ *
+ * Only the original (non-cloned) ESAgent can call the `update` function.
+ * Parameters of cloned agents will also be updated.
+ */
+ bool update(
+ std::vector<SamplingInfo>& noisy_info,
+ std::vector<float>& noisy_rewards);
+
+ // copied parameters = original parameters + noise
+ bool add_noise(SamplingInfo& sampling_info);
+
+ /**
+ * @brief Get the Paddle predictor.
+ *
+ * If _is_sampling_agent is true, the returned predictor has noise added;
+ * if _is_sampling_agent is false, the returned predictor has no added noise.
+ */
+ std::shared_ptr<PaddlePredictor> get_predictor();
+
+ // get param size of model
+ int64_t param_size() {
+ return _param_size;
+ }
+
+protected:
+ int64_t _calculate_param_size();
+
+ std::shared_ptr<PaddlePredictor> _predictor;
+ std::shared_ptr<PaddlePredictor> _sampling_predictor;
+ std::shared_ptr<SamplingMethod> _sampling_method;
+ std::shared_ptr<Optimizer> _optimizer;
+ std::shared_ptr<EvoKitConfig> _config;
+ std::shared_ptr<CxxConfig> _cxx_config;
+ std::vector<std::string> _param_names;
+ // malloc memory of noise and neg_gradients in advance.
+ float* _noise;
+ float* _neg_gradients;
+ int64_t _param_size;
+ bool _is_sampling_agent;
+};
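+
+// A minimal usage sketch, kept as a comment (see demo/paddle/cartpole_solver_parallel.cc
+// for the full flow). `run_episode` and the paths below are placeholders:
+//
+//   auto agent = std::make_shared<ESAgent>("./model_dir", "./config.prototxt");
+//   auto sampler = agent->clone();                 // cloned agent: add_noise is allowed
+//   SamplingInfo info;
+//   sampler->add_noise(info);                      // predictor = original params + noise
+//   float reward = run_episode(sampler->get_predictor());
+//   std::vector<SamplingInfo> infos = {info};
+//   std::vector<float> rewards = {reward};
+//   agent->update(infos, rewards);                 // original agent applies the ES update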
+
+} // namespace
+
+#endif
diff --git a/evo_kit/paddle/src/async_es_agent.cc b/evo_kit/paddle/src/async_es_agent.cc
new file mode 100644
index 0000000000000000000000000000000000000000..0bff6e42907f6f83f53ea147051d34d3b4851141
--- /dev/null
+++ b/evo_kit/paddle/src/async_es_agent.cc
@@ -0,0 +1,290 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "evo_kit/async_es_agent.h"
+
+namespace evo_kit {
+
+AsyncESAgent::AsyncESAgent(
+ const std::string& model_dir,
+ const std::string& config_path): ESAgent(model_dir, config_path) {
+ _config_path = config_path;
+}
+AsyncESAgent::~AsyncESAgent() {
+ for (const auto kv : _param_delta) {
+ float* delta = kv.second;
+ delete[] delta;
+ }
+}
+
+bool AsyncESAgent::_save() {
+ using namespace paddle::lite_api;
+ bool success = true;
+
+ if (_is_sampling_agent) {
+ LOG(ERROR) <<
+ "[EvoKit] Cloned AsyncESAgent cannot call `save`.Please use original AsyncESAgent.";
+ success = false;
+ return success;
+ }
+
+ int model_iter_id = _config->async_es().model_iter_id() + 1;
+ //current time
+ time_t rawtime;
+ struct tm* timeinfo;
+ char buffer[80];
+
+ time(&rawtime);
+ timeinfo = localtime(&rawtime);
+
+ std::string model_name = "model_iter_id-" + std::to_string(model_iter_id);
+ std::string model_path = _config->async_es().model_warehouse() + "/" + model_name;
+ LOG(INFO) << "[save]model_path: " << model_path;
+ _predictor->SaveOptimizedModel(model_path, LiteModelType::kProtobuf);
+ // save config
+ auto async_es = _config->mutable_async_es();
+ async_es->set_model_iter_id(model_iter_id);
+ success = save_proto_conf(_config_path, *_config);
+
+ if (!success) {
+ LOG(ERROR) << "[]unable to save config for AsyncESAgent";
+ success = false;
+ return success;
+ }
+
+ int max_to_keep = _config->async_es().max_to_keep();
+ success = _remove_expired_model(max_to_keep);
+ return success;
+}
+
+bool AsyncESAgent::_remove_expired_model(int max_to_keep) {
+ bool success = true;
+ std::string model_path = _config->async_es().model_warehouse();
+ std::vector<std::string> model_dirs = list_all_model_dirs(model_path);
+ int model_iter_id = _config->async_es().model_iter_id() + 1;
+
+ for (const auto& dir : model_dirs) {
+ int dir_model_iter_id = _parse_model_iter_id(dir);
+
+ if (model_iter_id - dir_model_iter_id >= max_to_keep) {
+ std::string rm_command = std::string("rm -rf ") + dir;
+ int ret = system(rm_command.c_str());
+
+ if (ret == 0) {
+ LOG(INFO) << "[EvoKit] remove expired Model: " << dir;
+ } else {
+ LOG(ERROR) << "[EvoKit] fail to remove expired Model: " << dir;
+ success = false;
+ return success;
+ }
+ }
+ }
+
+ return success;
+}
+
+bool AsyncESAgent::_compute_model_diff() {
+ bool success = true;
+
+ for (const auto& kv : _previous_predictors) {
+ int model_iter_id = kv.first;
+ std::shared_ptr<PaddlePredictor> old_predictor = kv.second;
+ float* diff = new float[_param_size];
+ memset(diff, 0, _param_size * sizeof(float));
+ int offset = 0;
+
+ for (const std::string& param_name : _param_names) {
+ auto des_tensor = old_predictor->GetTensor(param_name);
+ auto src_tensor = _predictor->GetTensor(param_name);
+ const float* des_data = des_tensor->data<float>();
+ const float* src_data = src_tensor->data<float>();
+ int64_t tensor_size = ShapeProduction(src_tensor->shape());
+
+ for (int i = 0; i < tensor_size; ++i) {
+ diff[i + offset] = des_data[i] - src_data[i];
+ }
+
+ offset += tensor_size;
+ }
+
+ _param_delta[model_iter_id] = diff;
+ }
+
+ return success;
+}
+
+bool AsyncESAgent::_load() {
+ bool success = true;
+ std::string model_path = _config->async_es().model_warehouse();
+ std::vector<std::string> model_dirs = list_all_model_dirs(model_path);
+
+ if (model_dirs.size() == 0) {
+ int model_iter_id = _config->async_es().model_iter_id();
+ success = model_iter_id == 0 ? true : false;
+
+ if (!success) {
+ LOG(WARNING) << "[EvoKit] current_model_iter_id is nonzero, but no model is \
+ found at the dir: " << model_path;
+ }
+
+ return success;
+ }
+
+ for (auto& dir : model_dirs) {
+ int model_iter_id = _parse_model_iter_id(dir);
+
+ if (model_iter_id == -1) {
+ LOG(WARNING) << "[EvoKit] fail to parse model_iter_id: " << dir;
+ success = false;
+ return success;
+ }
+
+ std::shared_ptr<PaddlePredictor> predictor = _load_previous_model(dir);
+
+ if (predictor == nullptr) {
+ success = false;
+ LOG(WARNING) << "[EvoKit] fail to load model: " << dir;
+ return success;
+ }
+
+ _previous_predictors[model_iter_id] = predictor;
+ }
+
+ success = _compute_model_diff();
+ return success;
+}
+
+std::shared_ptr<PaddlePredictor> AsyncESAgent::_load_previous_model(std::string model_dir) {
+ using namespace paddle::lite_api;
+ // 1. Create CxxConfig
+ CxxConfig config;
+ config.set_model_file(model_dir + "/model");
+ config.set_param_file(model_dir + "/params");
+ config.set_valid_places({
+ Place{TARGET(kX86), PRECISION(kFloat)},
+ Place{TARGET(kHost), PRECISION(kFloat)}
+ });
+
+ // 2. Create PaddlePredictor by CxxConfig
+ std::shared_ptr<PaddlePredictor> predictor = CreatePaddlePredictor<CxxConfig>(config);
+ return predictor;
+}
+
+std::shared_ptr<AsyncESAgent> AsyncESAgent::clone() {
+
+ std::shared_ptr<AsyncESAgent> new_agent = std::make_shared<AsyncESAgent>();
+
+ float* noise = new float [_param_size];
+
+ new_agent->_predictor = _predictor;
+ new_agent->_sampling_predictor = paddle::lite_api::CreatePaddlePredictor<CxxConfig>(*_cxx_config);
+ new_agent->_is_sampling_agent = true;
+ new_agent->_sampling_method = _sampling_method;
+ new_agent->_param_names = _param_names;
+ new_agent->_param_size = _param_size;
+ new_agent->_config = _config;
+ new_agent->_noise = noise;
+
+ return new_agent;
+}
+
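+// For asynchronous training, a sample may have been generated by an older model
+// (its model_iter_id differs from the current one). The update below compensates by
+// adding delta = old_params - current_params to the sampled noise, so every
+// perturbation is expressed relative to the current parameters before the
+// gradient estimate is accumulated.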
+bool AsyncESAgent::update(
+ std::vector<SamplingInfo>& noisy_info,
+ std::vector<float>& noisy_rewards) {
+
+ CHECK(!_is_sampling_agent) << "[EvoKit] Cloned ESAgent cannot call update function. \
+ Please use original ESAgent.";
+
+ bool success = _load();
+ CHECK(success) << "[EvoKit] fail to load previous models.";
+
+ int current_model_iter_id = _config->async_es().model_iter_id();
+
+ // validate model_iter_id for each sample before the update
+ for (int i = 0; i < noisy_info.size(); ++i) {
+ int model_iter_id = noisy_info[i].model_iter_id();
+
+ if (model_iter_id != current_model_iter_id
+ && _previous_predictors.count(model_iter_id) == 0) {
+ LOG(WARNING) << "[EvoKit] The sample with model_dir_id: " << model_iter_id \
+ << " cannot match any local model";
+ success = false;
+ return success;
+ }
+ }
+
+ compute_centered_ranks(noisy_rewards);
+ memset(_neg_gradients, 0, _param_size * sizeof(float));
+
+ for (int i = 0; i < noisy_info.size(); ++i) {
+ int key = noisy_info[i].key(0);
+ float reward = noisy_rewards[i];
+ int model_iter_id = noisy_info[i].model_iter_id();
+ bool success = _sampling_method->resampling(key, _noise, _param_size);
+ CHECK(success) << "[EvoKit] resampling error occurs at sample: " << i;
+ float* delta = _param_delta[model_iter_id];
+
+ // compute neg_gradients
+ if (model_iter_id == current_model_iter_id) {
+ for (int64_t j = 0; j < _param_size; ++j) {
+ _neg_gradients[j] += _noise[j] * reward;
+ }
+ } else {
+ for (int64_t j = 0; j < _param_size; ++j) {
+ _neg_gradients[j] += (_noise[j] + delta[j]) * reward;
+ }
+ }
+ }
+
+ for (int64_t j = 0; j < _param_size; ++j) {
+ _neg_gradients[j] /= -1.0 * noisy_info.size();
+ }
+
+ //update
+ int64_t counter = 0;
+
+ for (std::string param_name : _param_names) {
+ std::unique_ptr<Tensor> tensor = _predictor->GetMutableTensor(param_name);
+ float* tensor_data = tensor->mutable_data<float>();
+ int64_t tensor_size = ShapeProduction(tensor->shape());
+ _optimizer->update(tensor_data, _neg_gradients + counter, tensor_size, param_name);
+ counter += tensor_size;
+ }
+
+ success = _save();
+ CHECK(success) << "[EvoKit] fail to save model.";
+ return true;
+}
+
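+// Worked example: for a directory such as ".../model_iter_id-12" the trailing digits
+// are parsed from right to left, so the function returns 12; a path without trailing
+// digits returns -1.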
+int AsyncESAgent::_parse_model_iter_id(const std::string& model_path) {
+ int model_iter_id = -1;
+ int pow = 1;
+
+ for (int i = model_path.size() - 1; i >= 0; --i) {
+ if (model_path[i] >= '0' && model_path[i] <= '9') {
+ if (model_iter_id == -1) {
+ model_iter_id = 0;
+ }
+ } else {
+ break;
+ }
+
+ model_iter_id += pow * (model_path[i] - '0');
+ pow *= 10;
+ }
+
+ return model_iter_id;
+}
+
+}//namespace
diff --git a/evo_kit/paddle/src/es_agent.cc b/evo_kit/paddle/src/es_agent.cc
new file mode 100644
index 0000000000000000000000000000000000000000..d8f3ebd37299224791f1380f284849195383f65b
--- /dev/null
+++ b/evo_kit/paddle/src/es_agent.cc
@@ -0,0 +1,185 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "evo_kit/es_agent.h"
+#include
+
+namespace evo_kit {
+
+int64_t ShapeProduction(const paddle::lite_api::shape_t& shape) {
+ int64_t res = 1;
+
+ for (auto i : shape) {
+ res *= i;
+ }
+
+ return res;
+}
+
+ESAgent::~ESAgent() {
+ delete[] _noise;
+
+ if (!_is_sampling_agent) {
+ delete[] _neg_gradients;
+ }
+}
+
+ESAgent::ESAgent(const std::string& model_dir, const std::string& config_path) {
+ using namespace paddle::lite_api;
+ // 1. Create CxxConfig
+ _cxx_config = std::make_shared();
+ std::string model_path = model_dir + "/model";
+ std::string param_path = model_dir + "/param";
+ std::string model_buffer = read_file(model_path);
+ std::string param_buffer = read_file(param_path);
+ _cxx_config->set_model_buffer(model_buffer.c_str(), model_buffer.size(),
+ param_buffer.c_str(), param_buffer.size());
+ _cxx_config->set_valid_places({
+ Place{TARGET(kX86), PRECISION(kFloat)},
+ Place{TARGET(kHost), PRECISION(kFloat)}
+ });
+
+ _predictor = CreatePaddlePredictor<CxxConfig>(*_cxx_config);
+
+ _is_sampling_agent = false;
+ // The original agent is not used for sampling, so keep _sampling_predictor the same as _predictor for evaluation.
+ _sampling_predictor = _predictor;
+
+ _config = std::make_shared<EvoKitConfig>();
+ load_proto_conf(config_path, *_config);
+
+ _sampling_method = create_sampling_method(*_config);
+
+ _optimizer = create_optimizer(_config->optimizer());
+
+ _param_names = _predictor->GetParamNames();
+ _param_size = _calculate_param_size();
+
+ _noise = new float [_param_size];
+ _neg_gradients = new float [_param_size];
+}
+
+std::shared_ptr<ESAgent> ESAgent::clone() {
+ if (_is_sampling_agent) {
+ LOG(ERROR) << "[EvoKit] only original ESAgent can call `clone` function.";
+ return nullptr;
+ }
+
+ std::shared_ptr<ESAgent> new_agent = std::make_shared<ESAgent>();
+
+ float* noise = new float [_param_size];
+
+ new_agent->_sampling_predictor = paddle::lite_api::CreatePaddlePredictor<CxxConfig>(*_cxx_config);
+ new_agent->_predictor = _predictor;
+ new_agent->_cxx_config = _cxx_config;
+ new_agent->_is_sampling_agent = true;
+ new_agent->_sampling_method = _sampling_method;
+ new_agent->_param_names = _param_names;
+ new_agent->_config = _config;
+ new_agent->_param_size = _param_size;
+ new_agent->_noise = noise;
+
+ return new_agent;
+}
+
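+// The update below implements the rank-based ES gradient estimate: rewards are first
+// mapped to centered ranks by compute_centered_ranks, then for each parameter j the
+// negative gradient is accumulated as
+//     neg_grad[j] = -(1/N) * sum_i rank(reward_i) * noise_i[j]
+// and handed to the optimizer, which applies its own learning rate / momentum.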
+bool ESAgent::update(
+ std::vector<SamplingInfo>& noisy_info,
+ std::vector<float>& noisy_rewards) {
+ if (_is_sampling_agent) {
+ LOG(ERROR) << "[EvoKit] Cloned ESAgent cannot call update function, please use original ESAgent.";
+ return false;
+ }
+
+ compute_centered_ranks(noisy_rewards);
+
+ memset(_neg_gradients, 0, _param_size * sizeof(float));
+
+ for (int i = 0; i < noisy_info.size(); ++i) {
+ int key = noisy_info[i].key(0);
+ float reward = noisy_rewards[i];
+ bool success = _sampling_method->resampling(key, _noise, _param_size);
+ CHECK(success) << "[EvoKit] resampling error occurs at sample: " << i;
+
+ for (int64_t j = 0; j < _param_size; ++j) {
+ _neg_gradients[j] += _noise[j] * reward;
+ }
+ }
+
+ for (int64_t j = 0; j < _param_size; ++j) {
+ _neg_gradients[j] /= -1.0 * noisy_info.size();
+ }
+
+ //update
+ int64_t counter = 0;
+
+ for (std::string param_name : _param_names) {
+ std::unique_ptr<Tensor> tensor = _predictor->GetMutableTensor(param_name);
+ float* tensor_data = tensor->mutable_data<float>();
+ int64_t tensor_size = ShapeProduction(tensor->shape());
+ _optimizer->update(tensor_data, _neg_gradients + counter, tensor_size, param_name);
+ counter += tensor_size;
+ }
+
+ return true;
+}
+
+bool ESAgent::add_noise(SamplingInfo& sampling_info) {
+ bool success = true;
+
+ if (!_is_sampling_agent) {
+ LOG(ERROR) <<
+ "[EvoKit] Original ESAgent cannot call add_noise function, please use cloned ESAgent.";
+ success = false;
+ return success;
+ }
+
+ int key = 0;
+ success = _sampling_method->sampling(&key, _noise, _param_size);
+ CHECK(success) << "[EvoKit] sampling error occurs while add_noise.";
+ int model_iter_id = _config->async_es().model_iter_id();
+ sampling_info.add_key(key);
+ sampling_info.set_model_iter_id(model_iter_id);
+ int64_t counter = 0;
+
+ for (std::string param_name : _param_names) {
+ std::unique_ptr<Tensor> sample_tensor = _sampling_predictor->GetMutableTensor(param_name);
+ std::unique_ptr<const Tensor> tensor = _predictor->GetTensor(param_name);
+ int64_t tensor_size = ShapeProduction(tensor->shape());
+
+ for (int64_t j = 0; j < tensor_size; ++j) {
+ sample_tensor->mutable_data<float>()[j] = tensor->data<float>()[j] + _noise[counter + j];
+ }
+
+ counter += tensor_size;
+ }
+
+ return success;
+}
+
+std::shared_ptr<PaddlePredictor> ESAgent::get_predictor() {
+ return _sampling_predictor;
+}
+
+int64_t ESAgent::_calculate_param_size() {
+ int64_t param_size = 0;
+
+ for (std::string param_name : _param_names) {
+ std::unique_ptr<const Tensor> tensor = _predictor->GetTensor(param_name);
+ param_size += ShapeProduction(tensor->shape());
+ }
+
+ return param_size;
+}
+
+}//namespace
diff --git a/evo_kit/scripts/build_torch_demo.sh b/evo_kit/scripts/build_torch_demo.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b2f4df4444012c49bade049a7b30c9ebf637cafb
--- /dev/null
+++ b/evo_kit/scripts/build_torch_demo.sh
@@ -0,0 +1,39 @@
+#!/bin/bash
+
+cd demo/torch
+
+#---------------libtorch-------------#
+if [ ! -d "./libtorch" ];then
+ echo "Cannot find the torch library: ./libtorch"
+ echo "Downloading Torch library"
+ wget -q https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip
+ unzip -q libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip
+ rm -rf libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip
+ echo "Torch library Downloaded"
+fi
+
+
+#---------------libevokit-------------#
+cp -r ../../libevokit ./
+if [ ! -d "./libevokit" ];then
+ echo "Cannot find the EvoKit library: ./libevokit"
+ echo "Please put the EvoKit libraray to current folder according the instruction in README" # TODO: readme
+ exit 1
+fi
+
+# proto
+cp ../cartpole_config.prototxt ./
+
+#----------------build---------------#
+rm -rf build
+mkdir build
+cd build
+cmake ../
+make -j10
+cd -
+
+#-----------------run----------------#
+./build/parallel_main
+
+
+cd ../..
diff --git a/evo_kit/scripts/lib_install.sh b/evo_kit/scripts/lib_install.sh
new file mode 100644
index 0000000000000000000000000000000000000000..eb4cc5df7a901618c91b7be8a898d419d607278b
--- /dev/null
+++ b/evo_kit/scripts/lib_install.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+
+if [ $# != 1 ]; then
+ echo "You must choose one framework (paddle/torch) to compile EvoKit."
+ exit 0
+fi
+
+if [ $1 = "paddle" ]; then
+ #---------------paddlelite-------------#
+ if [ ! -d "./inference_lite_lib" ];then
+ echo "Cannot find the PaddleLite library: ./inference_lite_lib"
+ echo "Please put the PaddleLite libraray to current folder according the instruction in README"
+ exit 1
+ fi
+
+ # Initialization model
+ if [ ! -d ./demo/paddle/cartpole_init_model ]; then
+ unzip ./demo/paddle/cartpole_init_model.zip -d ./demo/paddle/
+ fi
+
+ FLAGS=" -DWITH_PADDLE=ON"
+elif [ $1 = "torch" ]; then
+ FLAGS=" -DWITH_TORCH=ON"
+else
+ echo "Invalid arguments. [paddle/torch]"
+ exit 0
+fi
+
+
+#----------------protobuf-------------#
+cd core/proto/
+protoc evo_kit/evo_kit.proto --cpp_out .
+cd -
+
+#----------------build---------------#
+echo ${FLAGS}
+rm -rf build
+mkdir build
+cd build
+cmake ../ ${FLAGS}
+make -j10
+make install
+cd -
diff --git a/evo_kit/test/CMakeLists.txt b/evo_kit/test/CMakeLists.txt
new file mode 100644
index 0000000000000000000000000000000000000000..979e5c59afd5e74b2907054a8398fc7d27fbc6e6
--- /dev/null
+++ b/evo_kit/test/CMakeLists.txt
@@ -0,0 +1,33 @@
+cmake_minimum_required (VERSION 2.6)
+project (EvoKit_demo)
+set(TARGET unit_test_main)
+
+set(CMAKE_CXX_STANDARD 11)
+set(CMAKE_CXX_STANDARD_REQUIRED ON)
+set(CMAKE_CXX_EXTENSIONS OFF)
+
+find_package(GTest REQUIRED)
+find_package(OpenMP)
+if (OPENMP_FOUND)
+ set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
+ set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
+ set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
+endif()
+
+# Torch lib
+list(APPEND CMAKE_PREFIX_PATH "../libtorch")
+find_package(Torch REQUIRED ON)
+
+# include and source
+include_directories("${PROJECT_SOURCE_DIR}/include")
+file(GLOB test_src "${PROJECT_SOURCE_DIR}/src/*.cc")
+
+# make
+add_executable(${TARGET} "unit_test.cc" ${core_src} ${agent_src} ${test_src})
+target_link_libraries(${TARGET} gflags protobuf pthread glog gtest "${TORCH_LIBRARIES}")
+
+
+########## EvoKit libraries ##########
+list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/libevokit/cmake/Torch")
+find_package(EvoKit)
+target_link_libraries(${TARGET} "${EVOKIT_LIBRARY}")
diff --git a/evo_kit/test/include/torch_demo_model.h b/evo_kit/test/include/torch_demo_model.h
new file mode 100644
index 0000000000000000000000000000000000000000..cf9d3400ea4358fe109ff6da3f9bec395920336f
--- /dev/null
+++ b/evo_kit/test/include/torch_demo_model.h
@@ -0,0 +1,65 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef _TORCH_DEMO_MODEL_H
+#define _TORCH_DEMO_MODEL_H
+
+#include <torch/torch.h>
+
+struct Model : public torch::nn::Module{
+
+ Model() = delete;
+
+ Model(const int obs_dim, const int act_dim, const int h1_size, const int h2_size) {
+ _obs_dim = obs_dim;
+ _act_dim = act_dim;
+ _h1_size = h1_size;
+ _h2_size = h2_size;
+ fc1 = register_module("fc1", torch::nn::Linear(obs_dim, h1_size));
+ fc2 = register_module("fc2", torch::nn::Linear(h1_size, h2_size));
+ fc3 = register_module("fc3", torch::nn::Linear(h2_size, act_dim));
+ }
+
+ torch::Tensor forward(torch::Tensor x) {
+ x = x.reshape({-1, _obs_dim});
+ x = torch::tanh(fc1->forward(x));
+ x = torch::tanh(fc2->forward(x));
+ x = torch::tanh(fc3->forward(x));
+ return x;
+ }
+
+ std::shared_ptr clone() {
+ std::shared_ptr model = std::make_shared(_obs_dim, _act_dim, _h1_size, _h2_size);
+ std::vector<torch::Tensor> parameters1 = parameters();
+ std::vector<torch::Tensor> parameters2 = model->parameters();
+ for (int i = 0; i < parameters1.size(); ++i) {
+ torch::Tensor src = parameters1[i].view({-1});
+ torch::Tensor des = parameters2[i].view({-1});
+ auto src_a = src.accessor<float, 1>();
+ auto des_a = des.accessor<float, 1>();
+ for (int j = 0; j < src.size(0); ++j) {
+ des_a[j] = src_a[j];
+ }
+ }
+ return model;
+ }
+
+ int _act_dim;
+ int _obs_dim;
+ int _h1_size;
+ int _h2_size;
+ torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr};
+};
+
+#endif
diff --git a/evo_kit/test/prototxt/torch_sin_cached_config.prototxt b/evo_kit/test/prototxt/torch_sin_cached_config.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..6fe80b1e07396b0909cb087f1a9b0c20724a0fc4
--- /dev/null
+++ b/evo_kit/test/prototxt/torch_sin_cached_config.prototxt
@@ -0,0 +1,16 @@
+seed : 1024
+
+gaussian_sampling {
+ std: 0.005
+ cached: true
+ cache_size : 100000
+}
+
+optimizer {
+ type: "Adam",
+ base_lr: 0.005,
+ momentum: 0.9,
+ beta1: 0.9,
+ beta2: 0.999,
+ epsilon: 1e-8,
+}
diff --git a/evo_kit/test/prototxt/torch_sin_config.prototxt b/evo_kit/test/prototxt/torch_sin_config.prototxt
new file mode 100644
index 0000000000000000000000000000000000000000..3704d64e6b6c7f7976422e33c2f5892b7ca4efc5
--- /dev/null
+++ b/evo_kit/test/prototxt/torch_sin_config.prototxt
@@ -0,0 +1,15 @@
+seed : 1024
+
+gaussian_sampling {
+ std: 0.005
+ cached: false
+}
+
+optimizer {
+ type: "Adam",
+ base_lr: 0.005,
+ momentum: 0.9,
+ beta1: 0.9,
+ beta2: 0.999,
+ epsilon: 1e-8,
+}
diff --git a/evo_kit/test/run_test.sh b/evo_kit/test/run_test.sh
new file mode 100644
index 0000000000000000000000000000000000000000..b39cbc9db8c32c4827aa03a101b45a8011dde7ae
--- /dev/null
+++ b/evo_kit/test/run_test.sh
@@ -0,0 +1,41 @@
+#!/bin/bash
+export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH
+
+#---------------libtorch-------------#
+if [ ! -d "./libtorch" ];then
+echo "Cannot find the torch library: ../libtorch"
+ echo "Downloading Torch library"
+ wget -q https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip
+ unzip -q libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip
+ rm -rf libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip
+ echo "Torch library Downloaded"
+fi
+
+#----------------protobuf-------------#
+cd core/proto/
+protoc evo_kit/evo_kit.proto --cpp_out .
+cd -
+
+#----------------build---------------#
+sh scripts/lib_install.sh torch
+
+#----------------build test---------------#
+cd test
+
+cp -r ../libevokit ./
+if [ ! -d "./libevokit" ];then
+ echo "Cannot find the EvoKit library: ./libevokit"
+ echo "Please put the EvoKit libraray to current folder according the instruction in README" # TODO: readme
+ exit 1
+fi
+
+rm -rf build
+mkdir build
+cd build
+cmake ../
+make -j10
+
+#-----------------run----------------#
+./unit_test_main
+
+cd ..
diff --git a/evo_kit/test/src/optimizers_test.cc b/evo_kit/test/src/optimizers_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..1c561e3085bdf5f9102ba29115e7e8fabbf8ed75
--- /dev/null
+++ b/evo_kit/test/src/optimizers_test.cc
@@ -0,0 +1,59 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include
+#include "evo_kit/optimizer_factory.h"
+#include
+
+namespace evo_kit {
+
+TEST(SGDOptimizersTest, Method_update) {
+ std::shared_ptr<EvoKitConfig> config = std::make_shared<EvoKitConfig>();
+ auto optimizer_config = config->mutable_optimizer();
+ optimizer_config->set_base_lr(1.0);
+ optimizer_config->set_type("sgd");
+ std::shared_ptr<Optimizer> optimizer = create_optimizer(config->optimizer());
+ float sgd_wei[10] = { 0.0 , 0.0 , 0.04216444, 0.0511456 , 0.04231584, 0.01089015, 0.06569759, 0.00127421,-0.00092832, 0.01128081};
+ float sgd_grad[10] = {-0.11992419,-0.0 , 0.07681337,-0.06616384, 0.00249889, 0.01158612,-0.3067452 , 0.36048946,-0.15820622,-0.20014143};
+ float sgd_new[10] = { 0.01199242, 0.0 , 0.0344831 , 0.05776198, 0.04206595, 0.00973154, 0.09637211,-0.03477474, 0.014892306, 0.03129495};
+
+ EXPECT_TRUE(optimizer->update(sgd_wei, sgd_grad, 10, "fc1"));
+ for (int i = 0; i < 10; ++i) {
+ EXPECT_FLOAT_EQ(sgd_new[i], sgd_wei[i]) << " i: " << i ;
+ }
+ EXPECT_TRUE(optimizer->update(sgd_wei, sgd_grad, 10, "fc1"));
+ EXPECT_FALSE(optimizer->update(sgd_wei, sgd_grad, 9, "fc1"));
+}
+
+TEST(AdamOptimizersTest, Method_update) {
+ std::shared_ptr<EvoKitConfig> config = std::make_shared<EvoKitConfig>();
+ auto optimizer_config = config->mutable_optimizer();
+ optimizer_config->set_base_lr(1.0);
+ optimizer_config->set_type("adam");
+ std::shared_ptr<Optimizer> optimizer = create_optimizer(config->optimizer());
+ float adam_wei[10] = { 0.0 , 0.0 , 0.04216444, 0.0511456 , 0.04231584, 0.01089015, 0.06569759, 0.00127421,-0.00092832, 0.01128081};
+ float adam_grad[10] = {-0.11992419,-0.0 , 0.07681337,-0.06616384, 0.00249889, 0.01158612,-0.3067452 , 0.36048946,-0.15820622,-0.20014143};
+ float adam_new[10] = { 0.99999736, 0. ,-0.95783144, 1.05114082,-0.95755763,-0.98908256, 1.06569656,-0.99872491, 0.99906968, 1.01127923};
+
+ EXPECT_TRUE(optimizer->update(adam_wei, adam_grad, 10, "fc1"));
+ for (int i = 0; i < 10; ++i) {
+ EXPECT_FLOAT_EQ(adam_new[i], adam_wei[i]) << " i: " << i ;
+ }
+ EXPECT_TRUE(optimizer->update(adam_wei, adam_grad, 10, "fc1"));
+ EXPECT_FALSE(optimizer->update(adam_wei, adam_grad, 9, "fc1"));
+}
+
+} // namespace
+
diff --git a/evo_kit/test/src/sampling_test.cc b/evo_kit/test/src/sampling_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..e707a63354836f3e70b42d819bab8b0fc3f79e70
--- /dev/null
+++ b/evo_kit/test/src/sampling_test.cc
@@ -0,0 +1,116 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include
+#include "evo_kit/sampling_method.h"
+#include "evo_kit/gaussian_sampling.h"
+#include "evo_kit/cached_gaussian_sampling.h"
+#include
+
+namespace evo_kit {
+
+class SamplingTest : public ::testing::Test {
+ protected:
+ void init_sampling_method(bool cached) {
+ config = std::make_shared<EvoKitConfig>();
+ config->set_seed(1024);
+ auto sampling_config = config->mutable_gaussian_sampling();
+ sampling_config->set_std(1.0);
+ sampling_config->set_cached(cached);
+ sampling_config->set_cache_size(cache_size);
+ if (cached) {
+ sampler = std::make_shared<CachedGaussianSampling>();
+ } else {
+ sampler = std::make_shared<GaussianSampling>();
+ }
+ }
+
+ std::shared_ptr<SamplingMethod> sampler;
+ std::shared_ptr<EvoKitConfig> config;
+ float array[3] = {1.0, 2.0, 3.0};
+ int cache_size = 100; // default cache_size 100
+ int key = 0;
+};
+
+
+TEST_F(SamplingTest, GaussianSampling_load_config) {
+ init_sampling_method(false);
+ EXPECT_TRUE(sampler->load_config(*config));
+}
+
+TEST_F(SamplingTest, GaussianSampling_sampling) {
+ init_sampling_method(false);
+ sampler->load_config(*config);
+
+ EXPECT_FALSE(sampler->sampling(&key, nullptr, 0));
+ EXPECT_TRUE(sampler->sampling(&key, array, 3));
+}
+
+TEST_F(SamplingTest, GaussianSampling_resampling) {
+ init_sampling_method(false);
+ sampler->load_config(*config);
+
+ EXPECT_FALSE(sampler->resampling(0, nullptr, 0));
+ EXPECT_TRUE(sampler->resampling(0, array, 3));
+}
+
+
+TEST_F(SamplingTest, CachedGaussianSampling_load_config) {
+ init_sampling_method(true);
+ EXPECT_TRUE(sampler->load_config(*config));
+}
+
+TEST_F(SamplingTest, CachedGaussianSampling_sampling) {
+ init_sampling_method(true);
+ EXPECT_FALSE(sampler->sampling(&key, array, 0));
+
+ sampler->load_config(*config);
+
+ EXPECT_FALSE(sampler->sampling(&key, nullptr, 0));
+ EXPECT_FALSE(sampler->sampling(&key, array, -1));
+ EXPECT_FALSE(sampler->sampling(&key, array, cache_size));
+
+ EXPECT_TRUE(sampler->sampling(&key, array, 0));
+ EXPECT_TRUE(sampler->sampling(&key, array, 3));
+}
+
+TEST_F(SamplingTest, CachedGaussianSampling_resampling) {
+ init_sampling_method(true);
+ EXPECT_FALSE(sampler->resampling(0, array, 0));
+
+ sampler->load_config(*config);
+
+ EXPECT_FALSE(sampler->resampling(0, nullptr, 0));
+ EXPECT_FALSE(sampler->resampling(0, array, -1));
+ EXPECT_FALSE(sampler->resampling(0, array, cache_size));
+
+ EXPECT_TRUE(sampler->resampling(0, array, 0));
+ EXPECT_TRUE(sampler->resampling(0, array, 1));
+ EXPECT_TRUE(sampler->resampling(0, array, 2));
+
+ EXPECT_FALSE(sampler->resampling(-1, array, 3));
+ EXPECT_TRUE(sampler->resampling(0, array, 3));
+ EXPECT_TRUE(sampler->resampling(1, array, 3));
+ EXPECT_TRUE(sampler->resampling(2, array, 3));
+ EXPECT_TRUE(sampler->resampling(cache_size-3, array, 3));
+ EXPECT_FALSE(sampler->resampling(cache_size-2, array, 3));
+ EXPECT_FALSE(sampler->resampling(cache_size-1, array, 3));
+ EXPECT_FALSE(sampler->resampling(cache_size, array, 3));
+ EXPECT_FALSE(sampler->resampling(cache_size-3, array, cache_size-1));
+}
+
+
+} // namespace
+
diff --git a/evo_kit/test/src/torch_agent_test.cc b/evo_kit/test/src/torch_agent_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..080b85391d720a6b500517a6f27976f76d2258b6
--- /dev/null
+++ b/evo_kit/test/src/torch_agent_test.cc
@@ -0,0 +1,157 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include <glog/logging.h>
+#include <omp.h>
+#include <algorithm>
+
+#include "evo_kit/gaussian_sampling.h"
+#include "evo_kit/es_agent.h"
+#include "torch_demo_model.h"
+
+#include <cmath>
+#include <random>
+#include <vector>
+#include <memory>
+
+namespace evo_kit {
+
+
+// The fixture for testing class Foo.
+class TorchDemoTest : public ::testing::Test {
+protected:
+ float evaluate(std::vector<float>& x_list, std::vector<float>& y_list, int size, std::shared_ptr<ESAgent<Model>> agent) {
+ float total_loss = 0.0;
+ for (int i = 0; i < size; ++i) {
+ torch::Tensor x_input = torch::tensor(x_list[i], torch::dtype(torch::kFloat32));
+ torch::Tensor predict_y = agent->predict(x_input);
+ auto pred_y = predict_y.accessor<float, 2>();
+ float loss = pow((pred_y[0][0] - y_list[i]), 2);
+ total_loss += loss;
+ }
+ return -total_loss / float(size);
+ }
+
+ float train_loss() {
+ return -1.0 * evaluate(x_list, y_list, train_data_size, agent);
+ }
+
+ float test_loss() {
+ return -1.0 * evaluate(test_x_list, test_y_list, test_data_size, agent);
+ }
+
+ float train_test_gap() {
+ float train_lo = train_loss();
+ float test_lo = test_loss();
+ if ( train_lo > test_lo) {
+ return train_lo - test_lo;
+ } else {
+ return test_lo - train_lo;
+ }
+ }
+
+ void init_agent(const int in_dim, const int out_dim, const int h1_size, const int h2_size) {
+ std::shared_ptr<Model> model = std::make_shared<Model>(in_dim, out_dim, h1_size, h2_size);
+ agent = std::make_shared<ESAgent<Model>>(model, "../prototxt/torch_sin_config.prototxt");
+ }
+
+ void train_agent(std::string config_path) {
+ std::default_random_engine generator(0); // fix seed
+ std::uniform_real_distribution<float> uniform(-3.0, 9.0);
+ std::normal_distribution<float> norm;
+ for (int i = 0; i < train_data_size; ++i) {
+ float x_i = uniform(generator); // generate data between [-3, 9]
+ float y_i = sin(x_i) + norm(generator) * 0.05; // label noise std 0.05
+ x_list.push_back(x_i);
+ y_list.push_back(y_i);
+ }
+ for (int i= 0; i < test_data_size; ++i) {
+ float x_i = uniform(generator);
+ float y_i = sin(x_i);
+ test_x_list.push_back(x_i);
+ test_y_list.push_back(y_i);
+ }
+
+ std::shared_ptr<Model> model = std::make_shared<Model>(1, 1, 10, 5);
+ agent = std::make_shared<ESAgent<Model>>(model, config_path);
+
+ // Clone agents to sample (explore).
+ std::vector<std::shared_ptr<ESAgent<Model>>> sampling_agents;
+ for (int i = 0; i < iter; ++i) {
+ sampling_agents.push_back(agent->clone());
+ }
+
+ std::vector<SamplingInfo> noisy_keys;
+ std::vector<float> noisy_rewards(iter, 0.0f);
+ noisy_keys.resize(iter);
+
+ LOG(INFO) << "start training...";
+ for (int epoch = 0; epoch < 1001; ++epoch) {
+#pragma omp parallel for schedule(dynamic, 1)
+ for (int i = 0; i < iter; ++i) {
+ auto sampling_agent = sampling_agents[i];
+ SamplingInfo key;
+ bool success = sampling_agent->add_noise(key);
+ float reward = evaluate(x_list, y_list, train_data_size, sampling_agent);
+ noisy_keys[i] = key;
+ noisy_rewards[i] = reward;
+ }
+ bool success = agent->update(noisy_keys, noisy_rewards);
+
+ if (epoch % 100 == 0) {
+ float reward = evaluate(test_x_list, test_y_list, test_data_size, agent);
+ float train_reward = evaluate(x_list, y_list, train_data_size, agent);
+ LOG(INFO) << "Epoch:" << epoch << " Loss: " << -reward << ", Train loss" << -train_reward;
+ }
+ }
+ }
+
+ // Class members declared here can be used by all tests in the test suite
+ int train_data_size = 300;
+ int test_data_size = 100;
+ int iter = 10;
+ std::vector<float> x_list;
+ std::vector<float> y_list;
+ std::vector<float> test_x_list;
+ std::vector<float> test_y_list;
+ std::shared_ptr<ESAgent<Model>> agent;
+};
+
+TEST_F(TorchDemoTest, TrainingEffectUseNormalSampling) {
+ train_agent("../prototxt/torch_sin_config.prototxt");
+ EXPECT_LT(train_loss(), 0.05);
+ EXPECT_LT(test_loss(), 0.05);
+ EXPECT_LT(train_test_gap(), 0.03);
+}
+
+TEST_F(TorchDemoTest, TrainingEffectTestUseTableSampling) {
+ train_agent("../prototxt/torch_sin_cached_config.prototxt");
+ EXPECT_LT(train_loss(), 0.05);
+ EXPECT_LT(test_loss(), 0.05);
+ EXPECT_LT(train_test_gap(), 0.03);
+}
+
+TEST_F(TorchDemoTest,ParamSizeTest) {
+ init_agent(1, 1, 10, 5);
+ EXPECT_EQ(agent->param_size(), 81);
+ init_agent(2, 3, 10, 5);
+ EXPECT_EQ(agent->param_size(), 103);
+ init_agent(1, 1, 1, 1);
+ EXPECT_EQ(agent->param_size(), 6);
+ init_agent(100, 2, 256, 64);
+ EXPECT_EQ(agent->param_size(), 42434);
+}
+
+} // namespace
diff --git a/evo_kit/test/src/utils_test.cc b/evo_kit/test/src/utils_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..a0c8d2c963a698475831a641c3eefc8abcc3693a
--- /dev/null
+++ b/evo_kit/test/src/utils_test.cc
@@ -0,0 +1,30 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+#include <vector>
+#include "evo_kit/utils.h"
+
+namespace evo_kit {
+
+// Tests that the Utils::compute_centered_rank() method.
+TEST(UtilsTest, Method_compute_centered_ranks) {
+ float a[5] = {9.0, 8.0, 7.0, 6.0, 5.0};
+ std::vector reward_vec(a, a+5);
+ EXPECT_EQ(compute_centered_ranks(reward_vec), true);
+}
+
+
+} // namespace
+
diff --git a/evo_kit/test/unit_test.cc b/evo_kit/test/unit_test.cc
new file mode 100644
index 0000000000000000000000000000000000000000..3bbc21f4cdfb8e7709173a258f66560a7f7e27a1
--- /dev/null
+++ b/evo_kit/test/unit_test.cc
@@ -0,0 +1,20 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "gtest/gtest.h"
+
+int main(int argc, char **argv) {
+ ::testing::InitGoogleTest(&argc, argv);
+ return RUN_ALL_TESTS();
+}
diff --git a/evo_kit/torch/include/evo_kit/es_agent.h b/evo_kit/torch/include/evo_kit/es_agent.h
new file mode 100644
index 0000000000000000000000000000000000000000..856034f75fc2c025cbb3aed74c5eac4edc888178
--- /dev/null
+++ b/evo_kit/torch/include/evo_kit/es_agent.h
@@ -0,0 +1,196 @@
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#ifndef TORCH_ESAGENT_H
+#define TORCH_ESAGENT_H
+
+#include <memory>
+#include <torch/torch.h>
+#include "evo_kit/optimizer_factory.h"
+#include "evo_kit/sampling_factory.h"
+#include "evo_kit/utils.h"
+#include "evo_kit/evo_kit.pb.h"
+
+namespace evo_kit{
+
+/**
+ * @brief EvoKit agent for Torch.
+ *
+ * Our implementation is flexible enough to support any model that subclasses torch::nn::Module.
+ * That is, we can instantiate an agent by: es_agent = ESAgent(model);
+ * After that, users can clone an agent for multi-thread processing, add parametric noise for exploration,
+ * and update the parameters according to the evaluation results of the noisy parameters.
+ */
+template <class T>