Commit dfce491c authored by: T TomorrowIsAnOtherDay

Merge branch 'develop' into CN_docs

from __future__ import absolute_import
from __future__ import print_function
from __future__ import unicode_literals
import argparse
import io, re
......
......@@ -18,3 +18,7 @@
FROM parl/parl-test:cuda9.0-cudnn7-v2
COPY ./requirements.txt /root/
RUN apt-get install -y libgflags-dev libgoogle-glog-dev libomp-dev unzip
RUN apt-get install -y libgtest-dev && cd /usr/src/gtest && mkdir build \
&& cd build && cmake .. && make && cp libgtest*.a /usr/local/lib
......@@ -69,7 +69,7 @@ function run_test_with_gpu() {
Running unit tests with GPU...
========================================
EOF
ctest --output-on-failure -j10
ctest --output-on-failure -j20 --verbose
cd ${REPO_ROOT}
rm -rf ${REPO_ROOT}/build
}
......@@ -90,7 +90,7 @@ function run_test_with_cpu() {
=====================================================
EOF
if [ $# -eq 1 ];then
ctest --output-on-failure -j10
ctest --output-on-failure -j20 --verbose
else
ctest --output-on-failure
fi
......@@ -145,7 +145,8 @@ function main() {
;;
test)
# test code compatibility in environments with various python versions
declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37")
#declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37")
declare -a envs=("py27" "py36")
for env in "${envs[@]}";do
cd /work
source ~/.bashrc
......@@ -158,7 +159,7 @@ function main() {
echo ========================================
pip install .
if [ \( $env == "py27" -o $env == "py36" -o $env == "py37" \) ]
then
then
pip install -r .teamcity/requirements.txt
run_test_with_cpu $env
run_test_with_cpu $env "DIS_TESTING_SERIALLY"
......@@ -169,6 +170,10 @@ function main() {
pip install -r .teamcity/requirements_torch.txt
run_test_with_cpu $env "DIS_TESTING_TORCH"
fi
# clean env
export LC_ALL=C.UTF-8
export LANG=C.UTF-8
xparl stop
done
run_test_with_gpu
......
......@@ -3,4 +3,3 @@ paddlepaddle-gpu==1.6.1.post97
gym
details
parameterized
timeout_decorator
......@@ -2,4 +2,3 @@
gym
details
parameterized
timeout_decorator
......@@ -37,7 +37,8 @@ if __name__ == '__main__':
exclude_examples = [
'NeurIPS2019-Learn-to-Move-Challenge',
'NeurIPS2018-AI-for-Prosthetics-Challenge', 'EagerMode'
'NeurIPS2018-AI-for-Prosthetics-Challenge', 'LiftSim_baseline',
'EagerMode'
]
for example in os.listdir('../examples/'):
if example not in exclude_examples:
......
#!/usr/bin/env bash
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# NOTE: You need to install mingw-cmake.
function init() {
RED='\033[0;31m'
BLUE='\033[0;34m'
BOLD='\033[1m'
NONE='\033[0m'
REPO_ROOT=`pwd`
}
function abort(){
echo "Your change doesn't follow PaddlePaddle's code style." 1>&2
echo "Please use pre-commit to check what is wrong." 1>&2
exit 1
}
function run_test_with_cpu() {
export CUDA_VISIBLE_DEVICES="-1"
mkdir -p ${REPO_ROOT}/build
cd ${REPO_ROOT}/build
if [ $# -eq 1 ];then
cmake -G "MinGW Makefiles" ..
else
cmake -G "MinGW Makefiles" .. -$2=ON
fi
cat <<EOF
=====================================================
Running unit tests with CPU in the environment: $1
=====================================================
EOF
if [ $# -eq 1 ];then
ctest --output-on-failure -j10
else
ctest --output-on-failure
fi
cd ${REPO_ROOT}
rm -rf ${REPO_ROOT}/build
}
function main() {
set -e
local CMD=$1
init
env="unused_variable"
# run unit tests on Windows (used on a local machine)
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple .
pip uninstall -y torch torchvision
pip install -i https://pypi.tuna.tsinghua.edu.cn/simple paddlepaddle==1.6.1 gym details parameterized
run_test_with_cpu $env
run_test_with_cpu $env "DIS_TESTING_SERIALLY"
pip uninstall -y paddlepaddle
pip install torch==1.4.0+cpu torchvision==0.5.0+cpu -f https://download.pytorch.org/whl/torch_stable.html
run_test_with_cpu $env "DIS_TESTING_TORCH"
}
main $@
......@@ -33,6 +33,7 @@ function(py_test TARGET_NAME)
add_test(NAME ${TARGET_NAME}
COMMAND python -u ${py_test_SRCS} ${py_test_ARGS}
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
set_tests_properties(${TARGET_NAME} PROPERTIES TIMEOUT 300)
endfunction()
function(import_test TARGET_NAME)
......
......@@ -3,7 +3,7 @@
</p>
[English](./README.md) | Simplified Chinese
[**Documentation**](https://parl.readthedocs.io) | [**Chinese documentation**](docs/zh_CN/Overview.md)
[**Documentation**](https://parl.readthedocs.io/en/stable/index.html)
> PARL is a high-performance and flexible reinforcement learning framework.
# Features
......@@ -48,7 +48,7 @@ class Agent(object):
parl.connect('localhost:8037')
agent = Agent()
agent.say_hello()
ans = agent.sum(1,5) # runs remotely and does not consume any local computation resources
```
Two steps to schedule external computation resources:
1. Decorate a class with `parl.remote_class`; the class is then converted into one that can run on other CPUs or machines.
......@@ -61,8 +61,8 @@ ans = agent.sum(1,5) # run remotely and not comsume any local computation resour
# Installation:
### Dependencies
- Python 2.7 or 3.5+.
- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**Optional**; if you only use the parallelization APIs, you do not need to install paddle)
- Python 2.7 or 3.5+. (On **Windows**, only Python 3.6+ is currently supported.)
- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**Optional**; if you only use the parallelization APIs, you do not need to install paddle)
```
......@@ -83,6 +83,6 @@ pip install parl
- [Champion solution: NIPS 2018 Reinforcement Learning Prosthetics Challenge](examples/NeurIPS2018-AI-for-Prosthetics-Challenge/)
- [Champion solution: NIPS 2019 Learn-to-Move Reinforcement Learning Challenge](examples/NeurIPS2019-Learn-to-Move-Challenge/)
<img src="examples/NeurIPS2019-Learn-to-Move-Challenge/image/performance.gif" width = "300" height ="200" alt="NeurlIPS2018"/> <img src=".github/Half-Cheetah.gif" width = "300" height ="200" alt="Half-Cheetah"/> <img src=".github/Breakout.gif" width = "200" height ="200" alt="Breakout"/>
<img src="examples/NeurIPS2019-Learn-to-Move-Challenge/image/performance.gif" width = "300" height ="200" alt="NeurlIPS2018"/> <img src=".github/Half-Cheetah.gif" width = "300" height ="200" alt="Half-Cheetah"/> <img src=".github/Breakout.gif" width = "200" height ="200" alt="Breakout"/>
<br>
<img src=".github/Aircraft.gif" width = "808" height ="300" alt="NeurlIPS2018"/>
......@@ -3,7 +3,7 @@
</p>
English | [简体中文](./README.cn.md)
[**Documentation**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md)
[**Documentation**](https://parl.readthedocs.io/en/stable/index.html)
> PARL is a flexible and high-efficient reinforcement learning framework.
......@@ -64,7 +64,7 @@ For users, they can write code in a simple way, just like writing multi-thread c
# Install:
### Dependencies
- Python 2.7 or 3.5+.
- Python 2.7 or 3.5+ (On **Windows**, PARL only supports environments with Python 3.6+).
- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**Optional**, if you only want to use APIs related to parallelization alone)
......
# Third party code
#
# The following code is copied or modified from:
# https://github.com/suragnair/alpha-zero-general
from tqdm import tqdm
from parl.utils import logger
class Arena():
"""
An Arena class where any two agents can be pitted against each other.
"""
def __init__(self, player1, player2, game, display=None):
"""
Input:
player1, player2: two functions that take a board as input and return an action
game: Game object
display: a function that takes board as input and prints it (e.g.
display in othello/OthelloGame). Is necessary for verbose
mode.
see othello/OthelloPlayers.py for an example. See pit.py for pitting
human players/other baselines with each other.
"""
self.player1 = player1
self.player2 = player2
self.game = game
self.display = display
def playGame(self, verbose=False):
"""
Executes one episode of a game.
Returns:
either
winner: player who won the game (1 if player1, -1 if player2)
or
draw result returned from the game that is neither 1, -1, nor 0.
"""
players = [self.player2, None, self.player1]
curPlayer = 1
board = self.game.getInitBoard()
it = 0
while self.game.getGameEnded(board, curPlayer) == 0:
it += 1
if verbose:
assert self.display
print("Turn ", str(it), "Player ", str(curPlayer))
self.display(board)
action = players[curPlayer + 1](self.game.getCanonicalForm(
board, curPlayer))
valids = self.game.getValidMoves(
self.game.getCanonicalForm(board, curPlayer), 1)
if valids[action] == 0:
logger.error('Action {} is not valid!'.format(action))
logger.debug('valids = {}'.format(valids))
assert valids[action] > 0
board, curPlayer = self.game.getNextState(board, curPlayer, action)
if verbose:
assert self.display
print("Game over: Turn ", str(it), "Result ",
str(self.game.getGameEnded(board, 1)))
self.display(board)
return curPlayer * self.game.getGameEnded(board, curPlayer)
def playGames(self, num, verbose=False):
"""
Plays num games in which player1 starts num/2 games and player2 starts
num/2 games.
Returns:
oneWon: games won by player1
twoWon: games won by player2
draws: games won by nobody
"""
num = int(num / 2)
oneWon = 0
twoWon = 0
draws = 0
for _ in tqdm(range(num), desc="Arena.playGames (1)"):
gameResult = self.playGame(verbose=verbose)
if gameResult == 1:
oneWon += 1
elif gameResult == -1:
twoWon += 1
else:
draws += 1
self.player1, self.player2 = self.player2, self.player1
for _ in tqdm(range(num), desc="Arena.playGames (2)"):
gameResult = self.playGame(verbose=verbose)
if gameResult == -1:
oneWon += 1
elif gameResult == 1:
twoWon += 1
else:
draws += 1
return oneWon, twoWon, draws
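A minimal usage sketch of the Arena class above, assuming it is run inside this example's directory; the `random_player` below is a hypothetical stand-in that only illustrates the expected player signature:

```python
import numpy as np

from Arena import Arena
from connect4_game import Connect4Game

game = Connect4Game()

def random_player(canonical_board):
    # A player is any callable mapping a (canonical) board to a valid column index.
    valids = game.getValidMoves(canonical_board, 1)
    return int(np.random.choice(np.flatnonzero(valids)))

arena = Arena(random_player, random_player, game, display=Connect4Game.display)
one_won, two_won, draws = arena.playGames(2, verbose=False)
print(one_won, two_won, draws)
```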
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import sys
import threading
import queue
import pickle
from pickle import Pickler, Unpickler
from random import shuffle
from parl.utils import tensorboard
import numpy as np
from tqdm import tqdm
import parl
from parl.utils import logger
from actor import Actor
from utils import split_group, get_test_dataset
from alphazero_agent import create_agent
class Coach():
"""
This class executes the self-play, learning and evaluation steps.
"""
def __init__(self, game, args):
self.game = game
self.args = args
# neural network of current generation
self.current_agent = create_agent(self.game)
# neural network of previous generation
self.previous_agent = create_agent(self.game)
# history of examples from args.numItersForTrainExamplesHistory latest iterations
self.trainExamplesHistory = []
self.remote_actors_signal_queues = []
self.remote_actors_return_queue = queue.Queue()
self.test_dataset = get_test_dataset()
def _run_remote_tasks(self, signal_queue):
# The remote actor will actually run on the local machine or on other machines of the xparl cluster
remote_actor = Actor(self.game, self.args)
while True:
# receive running task signal
# signal: specify task type and task input data (optional)
signal = signal_queue.get()
if signal["task"] == "self-play":
episode_num_each_actor = self.args.numEps // self.args.actors_num
result = remote_actor.self_play(
self.current_agent.get_weights(), episode_num_each_actor)
self.remote_actors_return_queue.put({"self-play": result})
elif signal["task"] == "pitting":
games_num_each_actor = self.args.arenaCompare // self.args.actors_num
result = remote_actor.pitting(
self.previous_agent.get_weights(),
self.current_agent.get_weights(), games_num_each_actor)
self.remote_actors_return_queue.put({"pitting": result})
elif signal["task"] == "evaluate_test_dataset":
test_dataset = signal["test_dataset"]
result = remote_actor.evaluate_test_dataset(
self.current_agent.get_weights(), test_dataset)
self.remote_actors_return_queue.put({
"evaluate_test_dataset":
result
})
else:
raise NotImplementedError
def _create_remote_actors(self):
# connect to xparl cluster to submit jobs
parl.connect(self.args.master_address)
for i in range(self.args.actors_num):
signal_queue = queue.Queue()
self.remote_actors_signal_queues.append(signal_queue)
remote_thread = threading.Thread(
target=self._run_remote_tasks, args=(signal_queue, ))
remote_thread.setDaemon(True)
remote_thread.start()
def learn(self):
"""Each iteration:
1. Performs numEps episodes of self-play.
2. Retrains neural network with examples in trainExamplesHistory
(which has a maximum length of numItersForTrainExamplesHistory).
3. Evaluates the new neural network with the test dataset.
4. Pits the new neural network against the old one and accepts it
only if it wins >= updateThreshold fraction of games.
"""
# create remote actors to run tasks (self-play/pitting/evaluate_test_dataset) in parallel.
self._create_remote_actors()
for iteration in range(1, self.args.numIters + 1):
logger.info('Starting Iter #{} ...'.format(iteration))
####################
logger.info('Step1: self-play in parallel...')
iterationTrainExamples = []
# update weights of remote actors to the latest weights, and ask them to run self-play task
for signal_queue in self.remote_actors_signal_queues:
signal_queue.put({"task": "self-play"})
# wait for all remote actors (a total of self.args.actors_num) to return the self-play results
for _ in range(self.args.actors_num):
result = self.remote_actors_return_queue.get()
iterationTrainExamples.extend(result["self-play"])
# save the iteration examples to the history
self.trainExamplesHistory.append(iterationTrainExamples)
if len(self.trainExamplesHistory
) > self.args.numItersForTrainExamplesHistory:
logger.warning("Removing the oldest entry in trainExamples.")
self.trainExamplesHistory.pop(0)
self.saveTrainExamples(iteration) # backup history to a file
####################
logger.info('Step2: train neural network...')
# shuffle examples before training
trainExamples = []
for e in self.trainExamplesHistory:
trainExamples.extend(e)
shuffle(trainExamples)
# training new network, keeping a copy of the old one
self.current_agent.save(
os.path.join(self.args.checkpoint, 'temp.pth.tar'))
self.previous_agent.restore(
os.path.join(self.args.checkpoint, 'temp.pth.tar'))
self.current_agent.learn(trainExamples)
####################
logger.info('Step3: evaluate test dataset in parallel...')
cnt = 0
# update weights of remote actors to the latest weights, and ask them to evaluate assigned test dataset
for i, data in enumerate(
split_group(
self.test_dataset,
len(self.test_dataset) // self.args.actors_num)):
self.remote_actors_signal_queues[i].put({
"task":
"evaluate_test_dataset",
"test_dataset":
data
})
cnt += len(data)
perfect_moves_cnt, good_moves_cnt = 0, 0
# wait for all remote actors (a total of self.args.actors_num) to return the evaluating results
for _ in range(self.args.actors_num):
(perfect_moves,
good_moves) = self.remote_actors_return_queue.get(
)["evaluate_test_dataset"]
perfect_moves_cnt += perfect_moves
good_moves_cnt += good_moves
logger.info('perfect moves rate: {}, good moves rate: {}'.format(
perfect_moves_cnt / cnt, good_moves_cnt / cnt))
tensorboard.add_scalar('perfect_moves_rate',
perfect_moves_cnt / cnt, iteration)
tensorboard.add_scalar('good_moves_rate', good_moves_cnt / cnt,
iteration)
####################
logger.info(
'Step4: pitting against previous generation in parallel...')
# transfer weights of previous generation and current generation to the remote actors, and ask them to pit.
for signal_queue in self.remote_actors_signal_queues:
signal_queue.put({"task": "pitting"})
previous_wins, current_wins, draws = 0, 0, 0
for _ in range(self.args.actors_num):
(pwins_, cwins_,
draws_) = self.remote_actors_return_queue.get()["pitting"]
previous_wins += pwins_
current_wins += cwins_
draws += draws_
logger.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' %
(current_wins, previous_wins, draws))
if previous_wins + current_wins == 0 or float(current_wins) / (
previous_wins + current_wins) < self.args.updateThreshold:
logger.info('REJECTING NEW MODEL')
self.current_agent.restore(
os.path.join(self.args.checkpoint, 'temp.pth.tar'))
else:
logger.info('ACCEPTING NEW MODEL')
self.current_agent.save(
os.path.join(self.args.checkpoint, 'best.pth.tar'))
self.current_agent.save(
os.path.join(self.args.checkpoint,
self.getCheckpointFile(iteration)))
def getCheckpointFile(self, iteration):
return 'checkpoint_' + str(iteration) + '.pth.tar'
def saveTrainExamples(self, iteration):
folder = self.args.checkpoint
if not os.path.exists(folder):
os.makedirs(folder)
filename = os.path.join(
folder,
self.getCheckpointFile(iteration) + ".examples")
with open(filename, "wb+") as f:
Pickler(f).dump(self.trainExamplesHistory)
f.closed
def loadModel(self):
self.current_agent.restore(
os.path.join(self.args.load_folder_file[0],
self.args.load_folder_file[1]))
def loadTrainExamples(self):
modelFile = os.path.join(self.args.load_folder_file[0],
self.args.load_folder_file[1])
examplesFile = modelFile + ".examples"
if not os.path.isfile(examplesFile):
logger.warning(
"File {} with trainExamples not found!".format(examplesFile))
r = input("Continue? [y|n]")
if r != "y":
sys.exit()
else:
logger.info("File with trainExamples found. Loading it...")
with open(examplesFile, "rb") as f:
self.trainExamplesHistory = Unpickler(f).load()
logger.info('Loading done!')
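The task dispatching in Coach reduces to one signal queue per remote actor plus a single shared return queue. A stripped-down, parl-free sketch of that pattern (the worker below is an illustrative stand-in, not the actual Actor):

```python
import queue
import threading

return_queue = queue.Queue()

def fake_actor(signal_queue):
    # Stand-in for the remote Actor: wait for a task signal, push a result back.
    while True:
        signal = signal_queue.get()
        if signal["task"] == "self-play":
            return_queue.put({"self-play": ["fake-train-example"]})

signal_queues = []
for _ in range(3):  # three stand-in actors
    sq = queue.Queue()
    signal_queues.append(sq)
    threading.Thread(target=fake_actor, args=(sq,), daemon=True).start()

for sq in signal_queues:   # broadcast the task to every actor ...
    sq.put({"task": "self-play"})
for _ in signal_queues:    # ... then gather exactly one result per actor
    print(return_queue.get())
```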
# Third party code
#
# The following code is copied or modified from:
# https://github.com/suragnair/alpha-zero-general
import math
import time
import numpy as np
EPS = 1e-8
class MCTS():
"""
This class handles the MCTS tree.
"""
def __init__(self, game, nn_agent, args, dirichlet_noise=False):
self.game = game
self.nn_agent = nn_agent
self.args = args
self.dirichlet_noise = dirichlet_noise
self.Qsa = {} # stores Q values for s,a (as defined in the paper)
self.Nsa = {} # stores #times edge s,a was visited
self.Ns = {} # stores #times board s was visited
self.Ps = {} # stores initial policy (returned by neural net)
self.Es = {} # stores game.getGameEnded ended for board s
self.Vs = {} # stores game.getValidMoves for board s
def getActionProb(self, canonicalBoard, temp=1):
"""
This function performs numMCTSSims simulations of MCTS starting from
canonicalBoard.
Returns:
probs: a policy vector where the probability of the ith action is
proportional to Nsa[(s,a)]**(1./temp)
"""
for i in range(self.args.numMCTSSims):
dir_noise = (i == 0 and self.dirichlet_noise)
self.search(canonicalBoard, dirichlet_noise=dir_noise)
s = self.game.stringRepresentation(canonicalBoard)
counts = [
self.Nsa[(s, a)] if (s, a) in self.Nsa else 0
for a in range(self.game.getActionSize())
]
if temp == 0:
bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten()
bestA = np.random.choice(bestAs)
probs = [0] * len(counts)
probs[bestA] = 1
return probs
counts = [x**(1. / temp) for x in counts]
counts_sum = float(sum(counts))
probs = [x / counts_sum for x in counts]
return probs
def search(self, canonicalBoard, dirichlet_noise=False):
"""
This function performs one iteration of MCTS. It is recursively called
till a leaf node is found. The action chosen at each node is one that
has the maximum upper confidence bound as in the paper.
Once a leaf node is found, the neural network is called to return an
initial policy P and a value v for the state. This value is propagated
up the search path. In case the leaf node is a terminal state, the
outcome is propagated up the search path. The values of Ns, Nsa, Qsa are
updated.
NOTE: the return values are the negative of the value of the current
state. This is done since v is in [-1,1] and if v is the value of a
state for the current player, then its value is -v for the other player.
Returns:
v: the negative of the value of the current canonicalBoard
"""
s = self.game.stringRepresentation(canonicalBoard)
if s not in self.Es:
self.Es[s] = self.game.getGameEnded(canonicalBoard, 1)
if self.Es[s] != 0:
# terminal node
return -self.Es[s]
if s not in self.Ps:
# leaf node
self.Ps[s], v = self.nn_agent.predict(canonicalBoard)
valids = self.game.getValidMoves(canonicalBoard, 1)
self.Ps[s] = self.Ps[s] * valids # masking invalid moves
if dirichlet_noise:
self.applyDirNoise(s, valids)
sum_Ps_s = np.sum(self.Ps[s])
if sum_Ps_s > 0:
self.Ps[s] /= sum_Ps_s # renormalize
else:
# if all valid moves were masked make all valid moves equally probable
# NB! All valid moves may be masked if either your NNet architecture is insufficient or you've got overfitting or something else.
# If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
print("All valid moves were masked, doing a workaround.")
self.Ps[s] = self.Ps[s] + valids
self.Ps[s] /= np.sum(self.Ps[s])
self.Vs[s] = valids
self.Ns[s] = 0
return -v
valids = self.Vs[s]
if dirichlet_noise:
self.applyDirNoise(s, valids)
sum_Ps_s = np.sum(self.Ps[s])
self.Ps[s] /= sum_Ps_s # renormalize
cur_best = -float('inf')
best_act = -1
# pick the action with the highest upper confidence bound
for a in range(self.game.getActionSize()):
if valids[a]:
if (s, a) in self.Qsa:
u = self.Qsa[
(s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt(
self.Ns[s]) / (1 + self.Nsa[(s, a)])
else:
u = self.args.cpuct * self.Ps[s][a] * math.sqrt(
self.Ns[s] + EPS) # Q = 0 ?
if u > cur_best:
cur_best = u
best_act = a
a = best_act
next_s, next_player = self.game.getNextState(canonicalBoard, 1, a)
next_s = self.game.getCanonicalForm(next_s, next_player)
v = self.search(next_s)
if (s, a) in self.Qsa:
self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[
(s, a)] + v) / (self.Nsa[(s, a)] + 1)
self.Nsa[(s, a)] += 1
else:
self.Qsa[(s, a)] = v
self.Nsa[(s, a)] = 1
self.Ns[s] += 1
return -v
def applyDirNoise(self, s, valids):
dir_values = np.random.dirichlet(
[self.args.dirichletAlpha] * np.count_nonzero(valids))
dir_idx = 0
for idx in range(len(self.Ps[s])):
if self.Ps[s][idx]:
self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + (
0.25 * dir_values[dir_idx])
dir_idx += 1
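A minimal sketch of driving this MCTS directly, assuming parl and torch are installed and the script is run inside this example's directory (the small `numMCTSSims` value and the `dotdict` import from this example's `utils.py` are assumptions for illustration):

```python
import numpy as np

from alphazero_agent import create_agent
from connect4_game import Connect4Game
from MCTS import MCTS
from utils import dotdict  # assumption: dotdict lives in this example's utils.py

game = Connect4Game()
agent = create_agent(game, cuda=False)
mcts_args = dotdict({'numMCTSSims': 50, 'cpuct': 4, 'dirichletAlpha': 1.0})
mcts = MCTS(game, agent, mcts_args)

board = game.getInitBoard()
pi = mcts.getActionProb(game.getCanonicalForm(board, 1), temp=1)  # visit-count policy
action = int(np.argmax(pi))
print(pi, action)
```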
## AlphaZero baseline for Connect4 game (distributed version)
- In this example, we provide a fine-tuned AlphaZero baseline to solve the Connect4 game, based on the code of the [alpha-zero-general](https://github.com/suragnair/alpha-zero-general) repo.
- We take advantage of the parallelization capability of [PARL](https://github.com/PaddlePaddle/PARL) to run self-play and evaluation tasks in parallel.
- We also provide scripts to pack your well-trained model into a submission file, which can be submitted to the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition directly.
### Dependencies
- python3
- [parl==1.3](https://github.com/PaddlePaddle/PARL)
- torch
- tqdm
### Training
1. Download the [1k connect4 validation set](https://www.kaggle.com/petercnudde/1k-connect4-validation-set) to the current directory. (filename: `refmoves1k_kaggle`)
2. Start xparl cluster
```bash
# You can change the following `cpu_num` and `args.actor_nums` in main.py
# based on the number of CPUs of your machine.
xparl start --port 8010 --cpu_num 25
```
```bash
# [OPTIONAL] You can also run the following script on other machines to add more CPU resources
# to the xparl cluster, so that you can increase the parallelism (args.actor_nums).
xparl connect --address MASTER_IP:8010 --cpu_num [CPU_NUM]
```
3. Run training script
```bash
python main.py
```
4. Visualize (good moves rate and perfect moves rate)
```
tensorboard --logdir .
```
### Submitting
To submit the well-trained model to Kaggle, you can use the provided script to generate `submission.py`, for example:
```bash
python gen_submission.py saved_model/best.pth.tar
```
### Performance
- The following figures show the `good moves rate` and `perfect moves rate` indicators in tensorboard; please refer to the [link](https://www.kaggle.com/petercnudde/scoring-connect-x-agents) for their specific meaning.
<img src=".pic/good_moves.png" width = "300" alt="good moves rate"/> <img src=".pic/perfect_moves.png" width = "300" alt="perfect moves rate"/>
> It takes about 1 day to run 25 iterations on a machine with 25 CPUs.
- It reaches a score of about 1368 (rank 5 as of 2020/06/04) in the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition.
### Reference
- [suragnair/alpha-zero-general](https://github.com/suragnair/alpha-zero-general)
- [Scoring connect-x agents](https://www.kaggle.com/petercnudde/scoring-connect-x-agents)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import parl
import os
from alphazero_agent import create_agent
from MCTS import MCTS
from Arena import Arena
from utils import win_loss_draw
@parl.remote_class
class Actor(object):
def __init__(self, game, args):
os.environ['OMP_NUM_THREADS'] = "1"
self.game = game
self.args = args
# neural network of previous generation
self.previous_agent = create_agent(self.game, cuda=False)
# neural network of current generation
self.current_agent = create_agent(self.game, cuda=False)
# MCTS of previous generation
self.previous_mcts = MCTS(
self.game, self.previous_agent, self.args, dirichlet_noise=True)
# MCTS of current generation
self.current_mcts = MCTS(
self.game, self.current_agent, self.args, dirichlet_noise=True)
def self_play(self, current_weights, game_num):
"""Collecting training data by self-play.
Args:
current_weights (numpy.array): latest weights of neural network
game_num (int): number of self-play games to run
Returns:
train_examples (list): examples of the form (canonicalBoard, currPlayer, pi,v)
"""
# update weights of current neural network with latest weights
self.current_agent.set_weights(current_weights)
train_examples = []
for _ in range(game_num):
# reset node state of MCTS
self.current_mcts = MCTS(
self.game, self.current_agent, self.args, dirichlet_noise=True)
train_examples.extend(self._executeEpisode())
return train_examples
def pitting(self, previous_weights, current_weights, games_num):
"""Fighting between previous generation agent and current generation agent
Args:
previous_weights (numpy.array): weights of previous generation neural network
current_weights (numpy.array): weights of current generation neural network
games_num (int): number of games to play
Returns:
tuple of (game number of previous agent won, game number of current agent won, game number of draw)
"""
# update weights of previous and current neural network
self.previous_agent.set_weights(previous_weights)
self.current_agent.set_weights(current_weights)
# reset node state of MCTS
self.previous_mcts = MCTS(self.game, self.previous_agent, self.args)
self.current_mcts = MCTS(self.game, self.current_agent, self.args)
arena = Arena(
lambda x: np.argmax(self.previous_mcts.getActionProb(x, temp=0)),
lambda x: np.argmax(self.current_mcts.getActionProb(x, temp=0)),
self.game)
previous_wins, current_wins, draws = arena.playGames(games_num)
return (previous_wins, current_wins, draws)
def evaluate_test_dataset(self, current_weights, test_dataset):
"""Evaluate performance of latest neural nerwork
Args:
current_weights (numpy.array): latest weights of neural network
test_dataset (list): test dataset to evaluate on
Returns:
tuple of (number of perfect moves, number of good moves)
"""
# update weights of current neural network with latest weights
self.current_agent.set_weights(current_weights)
perfect_move_count, good_move_count = 0, 0
for data in test_dataset:
self.current_mcts = MCTS(self.game, self.current_agent, self.args)
x = self.game.getCanonicalForm(data['board'], data['player'])
agent_move = int(
np.argmax(self.current_mcts.getActionProb(x, temp=0)))
moves = data["move_score"]
perfect_score = max(moves)
perfect_moves = [i for i in range(7) if moves[i] == perfect_score]
if agent_move in perfect_moves:
perfect_move_count += 1
if win_loss_draw(
moves[agent_move]) == win_loss_draw(perfect_score):
good_move_count += 1
return (perfect_move_count, good_move_count)
def _executeEpisode(self):
"""
This function executes one episode of self-play, starting with player 1.
As the game goes on, each turn is added as a training example to
trainExamples. The game is played till the game ends. After the game
ends, the outcome of the game is used to assign values to each example
in trainExamples.
It uses a temp=1 if episodeStep < tempThresholdStep, and thereafter
uses temp=0.
Returns:
trainExamples: a list of examples of the form (canonicalBoard, currPlayer, pi,v)
pi is the MCTS informed policy vector, v is +1 if
the player eventually won the game, else -1.
"""
trainExamples = []
board = self.game.getInitBoard()
self.curPlayer = 1
episodeStep = 0
while True:
episodeStep += 1
canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer)
temp = int(episodeStep < self.args.tempThresholdStep)
pi = self.current_mcts.getActionProb(canonicalBoard, temp=temp)
sym = self.game.getSymmetries(canonicalBoard, pi)
for b, p in sym: # board, pi
trainExamples.append([b, self.curPlayer, p, None])
action = np.random.choice(len(pi), p=pi)
board, self.curPlayer = self.game.getNextState(
board, self.curPlayer, action)
r = self.game.getGameEnded(board, self.curPlayer)
if r != 0:
return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer)))
for x in trainExamples]
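The sign flip at the end of `_executeEpisode` relabels the final result from the point of view of the player who generated each stored example; a quick check of that expression:

```python
# r is getGameEnded(board, curPlayer) at termination; an example recorded for
# example_player keeps r if example_player == curPlayer, otherwise the sign flips.
r, curPlayer = -1, 1  # e.g. player 1 is to move at the end and has lost
for example_player in (1, -1):
    print(example_player, r * ((-1) ** (example_player != curPlayer)))
# 1 -1   -> player 1's examples are labelled as losses
# -1 1   -> player -1's examples are labelled as wins
```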
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import os
import numpy as np
import parl
import torch
import torch.optim as optim
from tqdm import tqdm
from utils import *
from connect4_model import Connect4Model
args = dotdict({
'lr': 0.001,
'dropout': 0.3,
'epochs': 5,
'batch_size': 64,
'num_channels': 64,
})
class AlphaZero(parl.Algorithm):
def __init__(self, model):
self.model = model
def learn(self, boards, target_pis, target_vs, optimizer):
self.model.train() # train mode
# compute model output
out_log_pi, out_v = self.model(boards)
pi_loss = -torch.sum(target_pis * out_log_pi) / target_pis.size()[0]
v_loss = torch.sum(
(target_vs - out_v.view(-1))**2) / target_vs.size()[0]
total_loss = pi_loss + v_loss
# compute gradient and do SGD step
optimizer.zero_grad()
total_loss.backward()
optimizer.step()
return total_loss, pi_loss, v_loss
def predict(self, board):
self.model.eval() # eval mode
with torch.no_grad():
log_pi, v = self.model(board)
pi = torch.exp(log_pi)
return pi, v
class AlphaZeroAgent(parl.Agent):
def __init__(self, algorithm, game, cuda):
super(AlphaZeroAgent, self).__init__(algorithm)
self.cuda = cuda
self.board_x, self.board_y = game.getBoardSize()
self.action_size = game.getActionSize()
def learn(self, examples):
"""
Args:
examples: list of examples, each example is of form (board, pi, v)
"""
optimizer = optim.Adam(self.algorithm.model.parameters(), lr=args.lr)
for epoch in range(args.epochs):
print('EPOCH ::: ' + str(epoch + 1))
batch_count = int(len(examples) / args.batch_size)
pbar = tqdm(range(batch_count), desc='Training Net')
for _ in pbar:
sample_ids = np.random.randint(
len(examples), size=args.batch_size)
boards, pis, vs = list(zip(*[examples[i] for i in sample_ids]))
boards = torch.FloatTensor(np.array(boards).astype(np.float64))
target_pis = torch.FloatTensor(np.array(pis))
target_vs = torch.FloatTensor(np.array(vs).astype(np.float64))
if self.cuda:
boards, target_pis, target_vs = boards.contiguous().cuda(
), target_pis.contiguous().cuda(), target_vs.contiguous(
).cuda()
total_loss, pi_loss, v_loss = self.algorithm.learn(
boards, target_pis, target_vs, optimizer)
# record loss with tqdm
pbar.set_postfix(Loss_pi=pi_loss.item(), Loss_v=v_loss.item())
def predict(self, board):
"""
Args:
board (np.array): input board
Return:
pi (np.array): probability of actions
v (np.array): estimated value of input
"""
# preparing input
board = torch.FloatTensor(board.astype(np.float64))
if self.cuda:
board = board.contiguous().cuda()
board = board.view(1, self.board_x, self.board_y)
pi, v = self.algorithm.predict(board)
return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0]
def create_agent(game, cuda=True):
cuda = cuda and torch.cuda.is_available()
model = Connect4Model(game, args)
if cuda:
model.cuda()
algorithm = AlphaZero(model)
alphazero_agent = AlphaZeroAgent(algorithm, game, cuda)
return alphazero_agent
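A minimal sketch of creating the agent defined above and querying it on an empty board, assuming parl and torch are installed and the script is run inside this example's directory:

```python
from alphazero_agent import create_agent
from connect4_game import Connect4Game

game = Connect4Game()
agent = create_agent(game, cuda=False)   # falls back to CPU when CUDA is unavailable
pi, v = agent.predict(game.getInitBoard())
print(pi.shape, v)  # policy over the 7 columns and a value estimate in [-1, 1]
```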
# Third party code
#
# The following code is copied or modified from:
# https://github.com/suragnair/alpha-zero-general
import numpy as np
from collections import namedtuple
DEFAULT_HEIGHT = 6
DEFAULT_WIDTH = 7
DEFAULT_WIN_LENGTH = 4
WinState = namedtuple('WinState', 'is_ended winner')
class Board():
"""
Connect4 Board.
"""
def __init__(self,
height=None,
width=None,
win_length=None,
np_pieces=None):
"Set up initial board configuration."
self.height = height or DEFAULT_HEIGHT
self.width = width or DEFAULT_WIDTH
self.win_length = win_length or DEFAULT_WIN_LENGTH
if np_pieces is None:
self.np_pieces = np.zeros([self.height, self.width], dtype=np.int)
else:
self.np_pieces = np_pieces
assert self.np_pieces.shape == (self.height, self.width)
def add_stone(self, column, player):
"Create copy of board containing new stone."
available_idx, = np.where(self.np_pieces[:, column] == 0)
if len(available_idx) == 0:
raise ValueError(
"Can't play column %s on board %s" % (column, self))
self.np_pieces[available_idx[-1]][column] = player
def get_valid_moves(self):
"Any zero value in top row in a valid move"
return self.np_pieces[0] == 0
def get_win_state(self):
for player in [-1, 1]:
player_pieces = self.np_pieces == -player
# Check rows & columns for win
if (self._is_straight_winner(player_pieces)
or self._is_straight_winner(player_pieces.transpose())
or self._is_diagonal_winner(player_pieces)):
return WinState(True, -player)
# No valid moves left: the game is a draw.
if not self.get_valid_moves().any():
return WinState(True, None)
# Game is not ended yet.
return WinState(False, None)
def with_np_pieces(self, np_pieces):
"""Create copy of board with specified pieces."""
if np_pieces is None:
np_pieces = self.np_pieces
return Board(self.height, self.width, self.win_length, np_pieces)
def _is_diagonal_winner(self, player_pieces):
"""Checks if player_pieces contains a diagonal win."""
win_length = self.win_length
for i in range(len(player_pieces) - win_length + 1):
for j in range(len(player_pieces[0]) - win_length + 1):
if all(player_pieces[i + x][j + x] for x in range(win_length)):
return True
for j in range(win_length - 1, len(player_pieces[0])):
if all(player_pieces[i + x][j - x] for x in range(win_length)):
return True
return False
def _is_straight_winner(self, player_pieces):
"""Checks if player_pieces contains a vertical or horizontal win."""
run_lengths = [
player_pieces[:, i:i + self.win_length].sum(axis=1)
for i in range(len(player_pieces) - self.win_length + 2)
]
return max([x.max() for x in run_lengths]) >= self.win_length
def __str__(self):
return str(self.np_pieces)
class Connect4Game(object):
"""
Connect4 Game class implementing the alpha-zero-general Game interface.
Use 1 for player1 and -1 for player2.
"""
def __init__(self,
height=None,
width=None,
win_length=None,
np_pieces=None):
self._base_board = Board(height, width, win_length, np_pieces)
def getInitBoard(self):
"""
Returns:
startBoard: a representation of the board (ideally this is the form
that will be the input to your neural network)
"""
return self._base_board.np_pieces
def getBoardSize(self):
"""
Returns:
(x,y): a tuple of board dimensions
"""
return (self._base_board.height, self._base_board.width)
def getActionSize(self):
"""
Returns:
actionSize: number of all possible actions
"""
return self._base_board.width
def getNextState(self, board, player, action):
"""Returns a copy of the board with updated move, original board is unmodified.
Input:
board: current board
player: current player (1 or -1)
action: action taken by current player
Returns:
nextBoard: board after applying action
nextPlayer: player who plays in the next turn (should be -player)
"""
b = self._base_board.with_np_pieces(np_pieces=np.copy(board))
b.add_stone(action, player)
return b.np_pieces, -player
def getValidMoves(self, board, player):
"""Any zero value in top row in a valid move.
Input:
board: current board
player: current player
Returns:
validMoves: a binary vector of length self.getActionSize(), 1 for
moves that are valid from the current board and player,
0 for invalid moves
"""
return self._base_board.with_np_pieces(
np_pieces=board).get_valid_moves()
def getGameEnded(self, board, player):
"""
Input:
board: current board
player: current player (1 or -1)
Returns:
r: 0 if game has not ended. 1 if player won, -1 if player lost,
small non-zero value for draw.
"""
b = self._base_board.with_np_pieces(np_pieces=board)
winstate = b.get_win_state()
if winstate.is_ended:
if winstate.winner is None:
# draw has very little value.
return 1e-4
elif winstate.winner == player:
return +1
elif winstate.winner == -player:
return -1
else:
raise ValueError('Unexpected winstate found: ', winstate)
else:
# 0 used to represent unfinished game.
return 0
def getCanonicalForm(self, board, player):
"""
Input:
board: current board
player: current player (1 or -1)
Returns:
canonicalBoard: returns canonical form of board. The canonical form
should be independent of player. For e.g. in chess,
the canonical form can be chosen to be from the pov
of white. When the player is white, we can return
board as is. When the player is black, we can invert
the colors and return the board.
"""
return board * player
def getSymmetries(self, board, pi):
"""Board is left/right board symmetric
Input:
board: current board
pi: policy vector of size self.getActionSize()
Returns:
symmForms: a list of [(board,pi)] where each tuple is a symmetrical
form of the board and the corresponding pi vector. This
is used when training the neural network from examples.
"""
return [(board, pi),
(np.array(board[:, ::-1], copy=True),
np.array(pi[::-1], copy=True))]
def stringRepresentation(self, board):
"""
Input:
board: current board
Returns:
boardString: a quick conversion of board to a string format.
Required by MCTS for hashing.
"""
return board.tostring()
@staticmethod
def display(board):
print(" -----------------------")
print(' '.join(map(str, range(len(board[0])))))
print(board)
print(" -----------------------")
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Connect4Model(parl.Model):
def __init__(self, game, args):
# game params
self.board_x, self.board_y = game.getBoardSize()
self.action_size = game.getActionSize()
self.args = args
super(Connect4Model, self).__init__()
self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1)
self.conv2 = nn.Conv2d(
args.num_channels, args.num_channels, 3, stride=1, padding=1)
self.conv3 = nn.Conv2d(
args.num_channels, args.num_channels, 3, stride=1)
self.conv4 = nn.Conv2d(
args.num_channels, args.num_channels, 3, stride=1)
self.bn1 = nn.BatchNorm2d(args.num_channels)
self.bn2 = nn.BatchNorm2d(args.num_channels)
self.bn3 = nn.BatchNorm2d(args.num_channels)
self.bn4 = nn.BatchNorm2d(args.num_channels)
self.fc1 = nn.Linear(
args.num_channels * (self.board_x - 4) * (self.board_y - 4), 128)
self.fc_bn1 = nn.BatchNorm1d(128)
self.fc2 = nn.Linear(128, 64)
self.fc_bn2 = nn.BatchNorm1d(64)
self.fc3 = nn.Linear(64, self.action_size)
self.fc4 = nn.Linear(64, 1)
def forward(self, s):
"""
Args:
s(torch.Tensor): batch_size x board_x x board_y
"""
# batch_size x 1 x board_x x board_y
s = s.view(-1, 1, self.board_x, self.board_y)
# batch_size x num_channels x board_x x board_y
s = F.relu(self.bn1(self.conv1(s)))
# batch_size x num_channels x board_x x board_y
s = F.relu(self.bn2(self.conv2(s)))
# batch_size x num_channels x (board_x-2) x (board_y-2)
s = F.relu(self.bn3(self.conv3(s)))
# batch_size x num_channels x (board_x-4) x (board_y-4)
s = F.relu(self.bn4(self.conv4(s)))
s = s.view(
-1,
self.args.num_channels * (self.board_x - 4) * (self.board_y - 4))
s = F.dropout(
F.relu(self.fc_bn1(self.fc1(s))),
p=self.args.dropout,
training=self.training) # batch_size x 128
s = F.dropout(
F.relu(self.fc_bn2(self.fc2(s))),
p=self.args.dropout,
training=self.training) # batch_size x 64
pi = self.fc3(s) # batch_size x action_size
v = self.fc4(s) # batch_size x 1
return F.log_softmax(pi, dim=1), torch.tanh(v)
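A shape sketch for the model above, assuming parl and torch are installed and the script is run inside this example's directory (the `dotdict` hyper-parameters mirror the ones used by the agent; the `utils.py` import is an assumption):

```python
import torch

from connect4_game import Connect4Game
from connect4_model import Connect4Model
from utils import dotdict  # assumption: dotdict lives in this example's utils.py

game = Connect4Game()
model = Connect4Model(game, dotdict({'num_channels': 64, 'dropout': 0.3}))
log_pi, v = model(torch.randn(4, 6, 7))  # a random batch of four 6x7 boards
print(log_pi.shape, v.shape)             # torch.Size([4, 7]) torch.Size([4, 1])
```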
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import sys
import base64
import inspect
import os
assert len(sys.argv) == 2, "please specify model path."
model_path = sys.argv[1]
with open(model_path, 'rb') as f:
raw_bytes = f.read()
encoded_weights = base64.encodebytes(raw_bytes)
# encode weights of model to byte string
submission_file = """
import base64
decoded = base64.b64decode({})
""".format(encoded_weights)
# insert code snippet of loading weights
with open('submission_template.py', 'r') as f:
submission_file += ''.join(f.readlines())
# generate final submission file
with open('submission.py', 'w') as f:
f.write(submission_file)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from Coach import Coach
from connect4_game import Connect4Game
from utils import *
from parl.utils import logger
args = dotdict({
# master address of xparl cluster
'master_address': 'localhost:8010',
# number of remote actors (execute tasks [self-play/pitting/evaluate_test_dataset] in parallel).
'actors_num': 25,
# total number of iterations
'numIters': 200,
# Number of complete self-play games to simulate during a new iteration.
'numEps': 500,
# Number of games to play during arena (pitting) play to determine if new neural network will be accepted.
'arenaCompare': 50,
# Number of MCTS simulations to run per move.
'numMCTSSims': 800,
# temp=1 (Temperature, τ (tau)) if episodeStep < tempThresholdStep, and thereafter uses temp=0.
'tempThresholdStep': 15,
# During the arena playoff, the new neural net is accepted if it wins at least this fraction of games.
'updateThreshold': 0.6,
# CPUCT parameter
'cpuct': 4,
# alpha parameter of dirichlet noise which is added to the policy (pi)
'dirichletAlpha': 1.0,
# history of examples from numItersForTrainExamplesHistory latest iterations (training data)
'numItersForTrainExamplesHistory': 20,
# folder to save model and training examples
'checkpoint': './saved_model/',
# whether to load saved model and training examples
'load_model': False,
'load_folder_file': ('./saved_model', 'checkpoint_1.pth.tar'),
})
# Plays arenaCompare games in which player1 starts arenaCompare/2 games and player2 starts arenaCompare/2 games.
assert args.arenaCompare % 2 == 0
# make sure the tasks can be split evenly among different remote actors
assert args.numEps % args.actors_num == 0
assert (args.arenaCompare // 2) % args.actors_num == 0
assert 1000 % args.actors_num == 0 # there are 1000 board states in test_dataset
def main():
game = Connect4Game()
c = Coach(game, args)
if args.load_model:
logger.info('Loading checkpoint {}...'.format(args.load_folder_file))
c.loadModel()
logger.info("Loading 'trainExamples' from file {}...".format(
args.load_folder_file))
c.loadTrainExamples()
c.learn()
if __name__ == "__main__":
main()
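The assertions near the top of this script pin down how evenly the default configuration splits work across the remote actors; a quick arithmetic check, using the default values from the `args` dict above:

```python
# Default values taken from the args dict above.
numEps, arenaCompare, actors_num = 500, 50, 25
print(numEps // actors_num)        # 20 self-play episodes per actor
print(arenaCompare // actors_num)  # 2 pitting games per actor
print(1000 // actors_num)          # 40 test-dataset boards per actor (1000 boards total)
```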
# Third party code
#
# The following code is copied or modified from:
# https://github.com/suragnair/alpha-zero-general
import os
os.environ['OMP_NUM_THREADS'] = "1"
# ===== utils.py =====
class dotdict(dict):
def __getattr__(self, name):
return self[name]
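# dotdict simply exposes dict keys as attributes; a quick illustration
# (the values below are illustrative only and the check is side-effect free).
_cfg_example = dotdict({'numMCTSSims': 800, 'cpuct': 1.0})
assert _cfg_example.numMCTSSims == _cfg_example['numMCTSSims'] == 800
del _cfg_example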
# ===== MCTS.py ======
import math
import time
import numpy as np
EPS = 1e-8
class MCTS():
"""
This class handles the MCTS tree.
"""
def __init__(self, game, nn_agent, args, dirichlet_noise=False):
self.game = game
self.nn_agent = nn_agent
self.args = args
self.dirichlet_noise = dirichlet_noise
self.Qsa = {} # stores Q values for s,a (as defined in the paper)
self.Nsa = {} # stores #times edge s,a was visited
self.Ns = {} # stores #times board s was visited
self.Ps = {} # stores initial policy (returned by neural net)
self.Es = {} # stores game.getGameEnded ended for board s
self.Vs = {} # stores game.getValidMoves for board s
def getActionProb(self, canonicalBoard, temp=1, timelimit=4.9):
"""
This function runs MCTS simulations starting from canonicalBoard until
the given time limit (in seconds) is reached.
Returns:
probs: a policy vector where the probability of the ith action is
proportional to Nsa[(s,a)]**(1./temp)
"""
dir_noise = self.dirichlet_noise
start_time = time.time()
while time.time() - start_time < timelimit:
self.search(canonicalBoard, dirichlet_noise=dir_noise)
s = self.game.stringRepresentation(canonicalBoard)
counts = [
self.Nsa[(s, a)] if (s, a) in self.Nsa else 0
for a in range(self.game.getActionSize())
]
if temp == 0:
bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten()
bestA = np.random.choice(bestAs)
probs = [0] * len(counts)
probs[bestA] = 1
return probs
counts = [x**(1. / temp) for x in counts]
counts_sum = float(sum(counts))
probs = [x / counts_sum for x in counts]
return probs
def search(self, canonicalBoard, dirichlet_noise=False):
"""
This function performs one iteration of MCTS. It is recursively called
till a leaf node is found. The action chosen at each node is one that
has the maximum upper confidence bound as in the paper.
Once a leaf node is found, the neural network is called to return an
initial policy P and a value v for the state. This value is propagated
up the search path. In case the leaf node is a terminal state, the
outcome is propagated up the search path. The values of Ns, Nsa, Qsa are
updated.
NOTE: the return values are the negative of the value of the current
state. This is done since v is in [-1,1] and if v is the value of a
state for the current player, then its value is -v for the other player.
Returns:
v: the negative of the value of the current canonicalBoard
"""
s = self.game.stringRepresentation(canonicalBoard)
if s not in self.Es:
self.Es[s] = self.game.getGameEnded(canonicalBoard, 1)
if self.Es[s] != 0:
# terminal node
return -self.Es[s]
if s not in self.Ps:
# leaf node
self.Ps[s], v = self.nn_agent.predict(canonicalBoard)
valids = self.game.getValidMoves(canonicalBoard, 1)
self.Ps[s] = self.Ps[s] * valids # masking invalid moves
if dirichlet_noise:
self.applyDirNoise(s, valids)
sum_Ps_s = np.sum(self.Ps[s])
if sum_Ps_s > 0:
self.Ps[s] /= sum_Ps_s # renormalize
else:
# if all valid moves were masked make all valid moves equally probable
# NB! All valid moves may be masked if either your NNet architecture is insufficient or you've got overfitting or something else.
# If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process.
print("All valid moves were masked, doing a workaround.")
self.Ps[s] = self.Ps[s] + valids
self.Ps[s] /= np.sum(self.Ps[s])
self.Vs[s] = valids
self.Ns[s] = 0
return -v
valids = self.Vs[s]
if dirichlet_noise:
self.applyDirNoise(s, valids)
sum_Ps_s = np.sum(self.Ps[s])
self.Ps[s] /= sum_Ps_s # renormalize
cur_best = -float('inf')
best_act = -1
# pick the action with the highest upper confidence bound
for a in range(self.game.getActionSize()):
if valids[a]:
if (s, a) in self.Qsa:
u = self.Qsa[
(s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt(
self.Ns[s]) / (1 + self.Nsa[(s, a)])
else:
u = self.args.cpuct * self.Ps[s][a] * math.sqrt(
self.Ns[s] + EPS) # Q = 0 ?
if u > cur_best:
cur_best = u
best_act = a
a = best_act
next_s, next_player = self.game.getNextState(canonicalBoard, 1, a)
next_s = self.game.getCanonicalForm(next_s, next_player)
v = self.search(next_s)
if (s, a) in self.Qsa:
self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[
(s, a)] + v) / (self.Nsa[(s, a)] + 1)
self.Nsa[(s, a)] += 1
else:
self.Qsa[(s, a)] = v
self.Nsa[(s, a)] = 1
self.Ns[s] += 1
return -v
def applyDirNoise(self, s, valids):
dir_values = np.random.dirichlet(
[self.args.dirichletAlpha] * np.count_nonzero(valids))
dir_idx = 0
for idx in range(len(self.Ps[s])):
if self.Ps[s][idx]:
self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + (
0.25 * dir_values[dir_idx])
dir_idx += 1
# ===== connect4_game.py ======
import numpy as np
from collections import namedtuple
DEFAULT_HEIGHT = 6
DEFAULT_WIDTH = 7
DEFAULT_WIN_LENGTH = 4
WinState = namedtuple('WinState', 'is_ended winner')
class Board():
"""
Connect4 Board.
"""
def __init__(self,
height=None,
width=None,
win_length=None,
np_pieces=None):
"Set up initial board configuration."
self.height = height or DEFAULT_HEIGHT
self.width = width or DEFAULT_WIDTH
self.win_length = win_length or DEFAULT_WIN_LENGTH
if np_pieces is None:
self.np_pieces = np.zeros([self.height, self.width], dtype=np.int)
else:
self.np_pieces = np_pieces
assert self.np_pieces.shape == (self.height, self.width)
def add_stone(self, column, player):
"Create copy of board containing new stone."
available_idx, = np.where(self.np_pieces[:, column] == 0)
if len(available_idx) == 0:
raise ValueError(
"Can't play column %s on board %s" % (column, self))
self.np_pieces[available_idx[-1]][column] = player
def get_valid_moves(self):
"Any zero value in top row in a valid move"
return self.np_pieces[0] == 0
def get_win_state(self):
for player in [-1, 1]:
player_pieces = self.np_pieces == -player
# Check rows & columns for win
if (self._is_straight_winner(player_pieces)
or self._is_straight_winner(player_pieces.transpose())
or self._is_diagonal_winner(player_pieces)):
return WinState(True, -player)
# No valid moves left: the game is a draw.
if not self.get_valid_moves().any():
return WinState(True, None)
# Game is not ended yet.
return WinState(False, None)
def with_np_pieces(self, np_pieces):
"""Create copy of board with specified pieces."""
if np_pieces is None:
np_pieces = self.np_pieces
return Board(self.height, self.width, self.win_length, np_pieces)
def _is_diagonal_winner(self, player_pieces):
"""Checks if player_pieces contains a diagonal win."""
win_length = self.win_length
for i in range(len(player_pieces) - win_length + 1):
for j in range(len(player_pieces[0]) - win_length + 1):
if all(player_pieces[i + x][j + x] for x in range(win_length)):
return True
for j in range(win_length - 1, len(player_pieces[0])):
if all(player_pieces[i + x][j - x] for x in range(win_length)):
return True
return False
def _is_straight_winner(self, player_pieces):
"""Checks if player_pieces contains a vertical or horizontal win."""
run_lengths = [
player_pieces[:, i:i + self.win_length].sum(axis=1)
for i in range(len(player_pieces) - self.win_length + 2)
]
return max([x.max() for x in run_lengths]) >= self.win_length
def __str__(self):
return str(self.np_pieces)
class Connect4Game(object):
"""
Connect4 Game class implementing the alpha-zero-general Game interface.
Use 1 for player1 and -1 for player2.
"""
def __init__(self,
height=None,
width=None,
win_length=None,
np_pieces=None):
self._base_board = Board(height, width, win_length, np_pieces)
def getInitBoard(self):
"""
Returns:
startBoard: a representation of the board (ideally this is the form
that will be the input to your neural network)
"""
return self._base_board.np_pieces
def getBoardSize(self):
"""
Returns:
(x,y): a tuple of board dimensions
"""
return (self._base_board.height, self._base_board.width)
def getActionSize(self):
"""
Returns:
actionSize: number of all possible actions
"""
return self._base_board.width
def getNextState(self, board, player, action):
"""Returns a copy of the board with updated move, original board is unmodified.
Input:
board: current board
player: current player (1 or -1)
action: action taken by current player
Returns:
nextBoard: board after applying action
nextPlayer: player who plays in the next turn (should be -player)
"""
b = self._base_board.with_np_pieces(np_pieces=np.copy(board))
b.add_stone(action, player)
return b.np_pieces, -player
def getValidMoves(self, board, player):
"""Any zero value in top row in a valid move.
Input:
board: current board
player: current player
Returns:
validMoves: a binary vector of length self.getActionSize(), 1 for
moves that are valid from the current board and player,
0 for invalid moves
"""
return self._base_board.with_np_pieces(
np_pieces=board).get_valid_moves()
def getGameEnded(self, board, player):
"""
Input:
board: current board
player: current player (1 or -1)
Returns:
r: 0 if game has not ended. 1 if player won, -1 if player lost,
small non-zero value for draw.
"""
b = self._base_board.with_np_pieces(np_pieces=board)
winstate = b.get_win_state()
if winstate.is_ended:
if winstate.winner is None:
# draw has very little value.
return 1e-4
elif winstate.winner == player:
return +1
elif winstate.winner == -player:
return -1
else:
raise ValueError('Unexpected winstate found: ', winstate)
else:
# 0 used to represent unfinished game.
return 0
def getCanonicalForm(self, board, player):
"""
Input:
board: current board
player: current player (1 or -1)
Returns:
canonicalBoard: returns canonical form of board. The canonical form
should be independent of player. For e.g. in chess,
the canonical form can be chosen to be from the pov
of white. When the player is white, we can return
board as is. When the player is black, we can invert
the colors and return the board.
"""
return board * player
def getSymmetries(self, board, pi):
"""Board is left/right board symmetric
Input:
board: current board
pi: policy vector of size self.getActionSize()
Returns:
symmForms: a list of [(board,pi)] where each tuple is a symmetrical
form of the board and the corresponding pi vector. This
is used when training the neural network from examples.
"""
return [(board, pi),
(np.array(board[:, ::-1], copy=True),
np.array(pi[::-1], copy=True))]
def stringRepresentation(self, board):
"""
Input:
board: current board
Returns:
boardString: a quick conversion of board to a string format.
Required by MCTS for hashing.
"""
return board.tostring()
@staticmethod
def display(board):
print(" -----------------------")
print(' '.join(map(str, range(len(board[0])))))
print(board)
print(" -----------------------")
# ===== connect4_model ======
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
#class Connect4Model(parl.Model): # Kaggle doesn't support parl package
class Connect4Model(nn.Module):
def __init__(self, game, args):
# game params
self.board_x, self.board_y = game.getBoardSize()
self.action_size = game.getActionSize()
self.args = args
super(Connect4Model, self).__init__()
self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1)
self.conv2 = nn.Conv2d(
args.num_channels, args.num_channels, 3, stride=1, padding=1)
self.conv3 = nn.Conv2d(
args.num_channels, args.num_channels, 3, stride=1)
self.conv4 = nn.Conv2d(
args.num_channels, args.num_channels, 3, stride=1)
self.bn1 = nn.BatchNorm2d(args.num_channels)
self.bn2 = nn.BatchNorm2d(args.num_channels)
self.bn3 = nn.BatchNorm2d(args.num_channels)
self.bn4 = nn.BatchNorm2d(args.num_channels)
self.fc1 = nn.Linear(
args.num_channels * (self.board_x - 4) * (self.board_y - 4), 128)
self.fc_bn1 = nn.BatchNorm1d(128)
self.fc2 = nn.Linear(128, 64)
self.fc_bn2 = nn.BatchNorm1d(64)
self.fc3 = nn.Linear(64, self.action_size)
self.fc4 = nn.Linear(64, 1)
def forward(self, s):
# s: batch_size x board_x x board_y
s = s.view(-1, 1, self.board_x,
self.board_y) # batch_size x 1 x board_x x board_y
s = F.relu(self.bn1(
self.conv1(s))) # batch_size x num_channels x board_x x board_y
s = F.relu(self.bn2(
self.conv2(s))) # batch_size x num_channels x board_x x board_y
s = F.relu(self.bn3(self.conv3(
s))) # batch_size x num_channels x (board_x-2) x (board_y-2)
s = F.relu(self.bn4(self.conv4(
s))) # batch_size x num_channels x (board_x-4) x (board_y-4)
s = s.view(
-1,
self.args.num_channels * (self.board_x - 4) * (self.board_y - 4))
s = F.dropout(
F.relu(self.fc_bn1(self.fc1(s))),
p=self.args.dropout,
training=self.training) # batch_size x 128
s = F.dropout(
F.relu(self.fc_bn2(self.fc2(s))),
p=self.args.dropout,
training=self.training) # batch_size x 64
pi = self.fc3(s) # batch_size x action_size
v = self.fc4(s) # batch_size x 1
return F.log_softmax(pi, dim=1), torch.tanh(v)
# ===== simple agent ======
args = dotdict({
'dropout': 0.3,
'num_channels': 64,
})
class SimpleAgent():
def __init__(self, game, cuda=True):
self.cuda = cuda and torch.cuda.is_available()
self.model = Connect4Model(game, args)
if self.cuda:
self.model.cuda()
self.board_x, self.board_y = game.getBoardSize()
self.action_size = game.getActionSize()
def predict(self, board):
"""
Args:
board (np.array): input board
Return:
pi (np.array): probability of actions
v (np.array): estimated value of input
"""
# preparing input
board = torch.FloatTensor(board.astype(np.float64))
if self.cuda:
board = board.contiguous().cuda()
board = board.view(1, self.board_x, self.board_y)
self.model.eval() # eval mode
with torch.no_grad():
log_pi, v = self.model(board)
pi = torch.exp(log_pi)
return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0]
def load_checkpoint(self, buffer):
map_location = None if self.cuda else 'cpu'
checkpoint = torch.load(buffer, map_location=map_location)
self.model.load_state_dict(checkpoint)
# ===== predict function ======
import base64
import io
game = Connect4Game()
# AlphaZero players
agent = SimpleAgent(game)
buffer = io.BytesIO(decoded)
agent.load_checkpoint(buffer)
mcts_args = dotdict({'numMCTSSims': 800, 'cpuct': 1.0})
mcts = MCTS(game, agent, mcts_args)
def alphazero_agent(obs, config):
board = np.reshape(obs.board.copy(), game.getBoardSize()).astype(int)
board[np.where(board == 2)] = -1
player = 1
if obs.mark == 2:
player = -1
x = game.getCanonicalForm(board, player)
action = np.argmax(
mcts.getActionProb(x, temp=0, timelimit=config.timeout - 0.5))
return int(action)
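# Hedged sketch of a local test (not part of the submission itself): it assumes the
# `kaggle_environments` package is installed and pits the agent against the
# built-in "random" agent for a single game.
def _local_test():
    from kaggle_environments import make
    env = make("connectx", debug=True)
    steps = env.run([alphazero_agent, "random"])
    print("final reward for alphazero_agent:", steps[-1][0].reward)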
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
class dotdict(dict):
def __getattr__(self, name):
try:
return self[name]
except KeyError:
raise AttributeError(name)
def win_loss_draw(score):
if score > 0:
return 'win'
if score < 0:
return 'loss'
return 'draw'
"""
split one list to multiple lists
"""
split_group = lambda the_list, group_size: zip(*(iter(the_list), ) * group_size)
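# Quick illustration of the helpers above (arbitrary values): dotdict gives
# attribute access, win_loss_draw maps a signed score to a label, and
# split_group chunks a flat list into fixed-size tuples.
_cfg = dotdict({'lr': 1e-3})
assert _cfg.lr == 1e-3
assert (win_loss_draw(2), win_loss_draw(-1), win_loss_draw(0)) == ('win', 'loss', 'draw')
assert list(split_group([1, 2, 3, 4, 5, 6], 2)) == [(1, 2), (3, 4), (5, 6)]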
import numpy as np
import json
from connect4_game import Connect4Game
def get_test_dataset():
game = Connect4Game()
test_dataset = []
with open("refmoves1k_kaggle") as f:
for line in f:
data = json.loads(line)
board = data["board"]
board = np.reshape(board, game.getBoardSize()).astype(int)
board[np.where(board == 2)] = -1
# find out how many moves are played to set the correct mark.
ply = len([x for x in data["board"] if x > 0])
if ply & 1:
player = -1
else:
player = 1
test_dataset.append({
'board': board,
'player': player,
'move_score': data['move score'],
})
return test_dataset
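# Hedged sketch of how the dataset could be used: `choose_move(board, player)` is a
# hypothetical callable returning a column index; each entry of 'move_score' is the
# game-theoretic score of dropping a piece in that column (positive = winning move).
def score_policy(choose_move, test_dataset):
    def outcome(score):
        return 'win' if score > 0 else ('loss' if score < 0 else 'draw')
    good = 0
    for sample in test_dataset:
        scores = sample['move_score']
        best_outcome = outcome(max(scores))        # outcome achieved by a perfect move
        action = choose_move(sample['board'], sample['player'])
        if outcome(scores[action]) == best_outcome:
            good += 1
    return good / len(test_dataset)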
......@@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind
from parl.utils.window_stat import WindowStat
from parl.utils.time_stat import TimeStat
from parl.utils import machine_info
from parl.utils import logger, get_gpu_count, tensorboard
from parl.utils import logger, get_gpu_count, summary
from parl.algorithms import A2C
from atari_model import ActorCritic
......@@ -205,19 +205,19 @@ class Learner(object):
}
if metric['mean_episode_rewards'] is not None:
tensorboard.add_scalar('train/mean_reward',
metric['mean_episode_rewards'],
self.sample_total_steps)
tensorboard.add_scalar('train/total_loss', metric['total_loss'],
self.sample_total_steps)
tensorboard.add_scalar('train/pi_loss', metric['pi_loss'],
self.sample_total_steps)
tensorboard.add_scalar('train/vf_loss', metric['vf_loss'],
self.sample_total_steps)
tensorboard.add_scalar('train/entropy', metric['entropy'],
self.sample_total_steps)
tensorboard.add_scalar('train/learn_rate', metric['lr'],
self.sample_total_steps)
summary.add_scalar('train/mean_reward',
metric['mean_episode_rewards'],
self.sample_total_steps)
summary.add_scalar('train/total_loss', metric['total_loss'],
self.sample_total_steps)
summary.add_scalar('train/pi_loss', metric['pi_loss'],
self.sample_total_steps)
summary.add_scalar('train/vf_loss', metric['vf_loss'],
self.sample_total_steps)
summary.add_scalar('train/entropy', metric['entropy'],
self.sample_total_steps)
summary.add_scalar('train/learn_rate', metric['lr'],
self.sample_total_steps)
logger.info(metric)
......
......@@ -16,16 +16,16 @@ import numpy as np
import copy
from collections import deque, namedtuple
Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver'])
Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver'])
class ReplayMemory(object):
def __init__(self, max_size, state_shape, context_len):
def __init__(self, max_size, obs_shape, context_len):
self.max_size = int(max_size)
self.state_shape = state_shape
self.obs_shape = obs_shape
self.context_len = int(context_len)
self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8')
self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8')
self.action = np.zeros((self.max_size, ), dtype='int32')
self.reward = np.zeros((self.max_size, ), dtype='float32')
self.isOver = np.zeros((self.max_size, ), dtype='bool')
......@@ -48,42 +48,41 @@ class ReplayMemory(object):
else:
self._context.append(exp)
def recent_state(self):
""" maintain recent state for training"""
def recent_obs(self):
""" maintain recent obs for training"""
lst = list(self._context)
states = [np.zeros(self.state_shape, dtype='uint8')] * \
obs = [np.zeros(self.obs_shape, dtype='uint8')] * \
(self._context.maxlen - len(lst))
states.extend([k.state for k in lst])
return states
obs.extend([k.obs for k in lst])
return obs
def sample(self, idx):
""" return state, action, reward, isOver,
note that some frames in state may be generated from last episode,
they should be removed from state
""" return obs, action, reward, isOver,
note that some frames in obs may be generated from last episode,
they should be removed from obs
"""
state = np.zeros(
(self.context_len + 1, ) + self.state_shape, dtype=np.uint8)
state_idx = np.arange(idx,
idx + self.context_len + 1) % self._curr_size
obs = np.zeros(
(self.context_len + 1, ) + self.obs_shape, dtype=np.uint8)
obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size
# confirm that no frame was generated from last episode
has_last_episode = False
for k in range(self.context_len - 2, -1, -1):
to_check_idx = state_idx[k]
to_check_idx = obs_idx[k]
if self.isOver[to_check_idx]:
has_last_episode = True
state_idx = state_idx[k + 1:]
state[k + 1:] = self.state[state_idx]
obs_idx = obs_idx[k + 1:]
obs[k + 1:] = self.obs[obs_idx]
break
if not has_last_episode:
state = self.state[state_idx]
obs = self.obs[obs_idx]
real_idx = (idx + self.context_len - 1) % self._curr_size
action = self.action[real_idx]
reward = self.reward[real_idx]
isOver = self.isOver[real_idx]
return state, reward, action, isOver
return obs, reward, action, isOver
def __len__(self):
return self._curr_size
......@@ -92,7 +91,7 @@ class ReplayMemory(object):
return self._curr_size
def _assign(self, pos, exp):
self.state[pos] = exp.state
self.obs[pos] = exp.obs
self.reward[pos] = exp.reward
self.action[pos] = exp.action
self.isOver[pos] = exp.isOver
......@@ -107,8 +106,8 @@ class ReplayMemory(object):
return self._process_batch(batch_exp)
def _process_batch(self, batch_exp):
state = np.asarray([e[0] for e in batch_exp], dtype='uint8')
obs = np.asarray([e[0] for e in batch_exp], dtype='uint8')
reward = np.asarray([e[1] for e in batch_exp], dtype='float32')
action = np.asarray([e[2] for e in batch_exp], dtype='int8')
isOver = np.asarray([e[3] for e in batch_exp], dtype='bool')
return [state, action, reward, isOver]
return [obs, action, reward, isOver]
......@@ -22,11 +22,11 @@ import parl
import numpy as np
from tqdm import tqdm
from parl.utils import tensorboard, logger
from parl.utils import summary, logger
from parl.algorithms import DQN, DDQN
from agent import AtariAgent
from atari_wrapper import FireResetEnv, FrameStack, LimitLength, MapState
from atari_wrapper import FireResetEnv, FrameStack, LimitLength
from model import AtariModel
from replay_memory import ReplayMemory, Experience
from utils import get_player
......@@ -43,57 +43,57 @@ GAMMA = 0.99
def run_train_episode(env, agent, rpm):
total_reward = 0
all_cost = []
state = env.reset()
obs = env.reset()
steps = 0
while True:
steps += 1
context = rpm.recent_state()
context.append(state)
context = rpm.recent_obs()
context.append(obs)
context = np.stack(context, axis=0)
action = agent.sample(context)
next_state, reward, isOver, _ = env.step(action)
rpm.append(Experience(state, action, reward, isOver))
next_obs, reward, isOver, _ = env.step(action)
rpm.append(Experience(obs, action, reward, isOver))
if rpm.size() > MEMORY_WARMUP_SIZE:
if steps % UPDATE_FREQ == 0:
batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch(
args.batch_size)
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
batch_next_state = batch_all_state[:, 1:, :, :]
cost = agent.learn(batch_state, batch_action, batch_reward,
batch_next_state, batch_isOver)
batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
batch_next_obs = batch_all_obs[:, 1:, :, :]
cost = agent.learn(batch_obs, batch_action, batch_reward,
batch_next_obs, batch_isOver)
all_cost.append(cost)
total_reward += reward
state = next_state
obs = next_obs
if isOver:
mean_loss = np.mean(all_cost) if all_cost else None
return total_reward, steps, mean_loss
def run_evaluate_episode(env, agent):
state = env.reset()
obs = env.reset()
total_reward = 0
while True:
pred_Q = agent.predict(state)
pred_Q = agent.predict(obs)
action = pred_Q.max(1)[1].item()
state, reward, isOver, _ = env.step(action)
obs, reward, isOver, _ = env.step(action)
total_reward += reward
if isOver:
return total_reward
def get_fixed_states(rpm, batch_size):
states = []
def get_fixed_obs(rpm, batch_size):
obs = []
for _ in range(3):
batch_all_state = rpm.sample_batch(batch_size)[0]
batch_state = batch_all_state[:, :CONTEXT_LEN, :, :]
states.append(batch_state)
fixed_states = np.concatenate(states, axis=0)
return fixed_states
batch_all_obs = rpm.sample_batch(batch_size)[0]
batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :]
obs.append(batch_obs)
fixed_obs = np.concatenate(obs, axis=0)
return fixed_obs
def evaluate_fixed_Q(agent, states):
def evaluate_fixed_Q(agent, obs):
with torch.no_grad():
max_pred_Q = agent.alg.model(states).max(1)[0].mean()
max_pred_Q = agent.alg.model(obs).max(1)[0].mean()
return max_pred_Q.item()
......@@ -131,9 +131,9 @@ def main():
total_reward, steps, _ = run_train_episode(env, agent, rpm)
pbar.update(steps)
# Get fixed states to check value function.
fixed_states = get_fixed_states(rpm, args.batch_size)
fixed_states = torch.tensor(fixed_states, dtype=torch.float, device=device)
# Get fixed obs to check value function.
fixed_obs = get_fixed_obs(rpm, args.batch_size)
fixed_obs = torch.tensor(fixed_obs, dtype=torch.float, device=device)
# train
test_flag = 0
......@@ -152,18 +152,17 @@ def main():
for _ in range(3):
eval_rewards.append(run_evaluate_episode(test_env, agent))
tensorboard.add_scalar('dqn/eval', np.mean(eval_rewards),
total_steps)
tensorboard.add_scalar('dqn/score', total_reward, total_steps)
tensorboard.add_scalar('dqn/loss', loss, total_steps)
tensorboard.add_scalar('dqn/exploration', agent.exploration,
total_steps)
tensorboard.add_scalar('dqn/Q value',
evaluate_fixed_Q(agent, fixed_states),
total_steps)
tensorboard.add_scalar('dqn/grad_norm',
get_grad_norm(agent.alg.model),
total_steps)
summary.add_scalar('dqn/eval', np.mean(eval_rewards),
total_steps)
summary.add_scalar('dqn/score', total_reward, total_steps)
summary.add_scalar('dqn/loss', loss, total_steps)
summary.add_scalar('dqn/exploration', agent.exploration,
total_steps)
summary.add_scalar('dqn/Q value',
evaluate_fixed_Q(agent, fixed_obs),
total_steps)
summary.add_scalar('dqn/grad_norm',
get_grad_norm(agent.alg.model), total_steps)
if __name__ == '__main__':
......
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import torch
def get_args():
parser = argparse.ArgumentParser(description='RL')
parser.add_argument(
'--lr', type=float, default=3e-4, help='learning rate (default: 3e-4)')
parser.add_argument(
'--eps',
type=float,
default=1e-5,
help='RMSprop optimizer epsilon (default: 1e-5)')
parser.add_argument(
'--gamma',
type=float,
default=0.99,
help='discount factor for rewards (default: 0.99)')
parser.add_argument(
'--gae-lambda',
type=float,
default=0.95,
help='gae lambda parameter (default: 0.95)')
parser.add_argument(
'--entropy-coef',
type=float,
default=0.,
help='entropy term coefficient (default: 0.)')
parser.add_argument(
'--value-loss-coef',
type=float,
default=0.5,
help='value loss coefficient (default: 0.5)')
parser.add_argument(
'--max-grad-norm',
type=float,
default=0.5,
help='max norm of gradients (default: 0.5)')
parser.add_argument(
'--seed', type=int, default=1, help='random seed (default: 1)')
parser.add_argument(
'--num-steps',
type=int,
default=2048,
help='number of maximum forward steps in ppo (default: 2048)')
parser.add_argument(
'--ppo-epoch',
type=int,
default=10,
help='number of ppo epochs (default: 10)')
parser.add_argument(
'--num-mini-batch',
type=int,
default=32,
help='number of batches for ppo (default: 32)')
parser.add_argument(
'--clip-param',
type=float,
default=0.2,
help='ppo clip parameter (default: 0.2)')
parser.add_argument(
'--log-interval',
type=int,
default=1,
help='log interval, one log per n updates (default: 1)')
parser.add_argument(
'--eval-interval',
type=int,
default=10,
help='eval interval, one eval per n updates (default: 10)')
parser.add_argument(
'--num-env-steps',
type=int,
default=10e5,
help='number of environment steps to train (default: 10e5)')
parser.add_argument(
'--env-name',
default='Hopper-v2',
help='environment to train on (default: Hopper-v2)')
parser.add_argument(
'--use-linear-lr-decay',
action='store_true',
default=False,
help='use a linear schedule on the learning rate')
args = parser.parse_args()
args.cuda = torch.cuda.is_available()
return args
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import torch
import utils
from wrapper import make_env
def evaluate(agent, ob_rms, env_name, seed, device):
if seed != None:
seed += 1
eval_envs = make_env(env_name, seed, None)
vec_norm = utils.get_vec_normalize(eval_envs)
if vec_norm is not None:
vec_norm.eval()
vec_norm.ob_rms = ob_rms
eval_episode_rewards = []
obs = eval_envs.reset()
eval_masks = torch.zeros(1, 1, device=device)
while len(eval_episode_rewards) < 10:
with torch.no_grad():
action = agent.predict(obs)
# Observe reward and next obs
obs, _, done, infos = eval_envs.step(action)
eval_masks = torch.tensor(
[[0.0] if done_ else [1.0] for done_ in done],
dtype=torch.float32,
device=device)
for info in infos:
if 'episode' in info.keys():
eval_episode_rewards.append(info['episode']['r'])
eval_envs.close()
print(" Evaluation using {} episodes: mean reward {:.5f}\n".format(
len(eval_episode_rewards), np.mean(eval_episode_rewards)))
return np.mean(eval_episode_rewards)
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import torch
class MujocoAgent(parl.Agent):
def __init__(self, algorithm, device):
self.alg = algorithm
self.device = device
def predict(self, obs):
obs = torch.from_numpy(obs).float().to(self.device)
action = self.alg.predict(obs)
return action.cpu().numpy()
def sample(self, obs):
obs = torch.from_numpy(obs).to(self.device)
value, action, action_log_probs = self.alg.sample(obs)
return value.cpu().numpy(), action.cpu().numpy(), \
action_log_probs.cpu().numpy()
def learn(self, next_value, gamma, gae_lambda, ppo_epoch, num_mini_batch,
rollouts):
value_loss_epoch = 0
action_loss_epoch = 0
dist_entropy_epoch = 0
for e in range(ppo_epoch):
data_generator = rollouts.sample_batch(next_value, gamma,
gae_lambda, num_mini_batch)
for sample in data_generator:
obs_batch, actions_batch, \
value_preds_batch, return_batch, old_action_log_probs_batch, \
adv_targ = sample
obs_batch = torch.from_numpy(obs_batch).to('cuda')
actions_batch = torch.from_numpy(actions_batch).to('cuda').to(
'cuda')
value_preds_batch = torch.from_numpy(value_preds_batch).to(
'cuda')
return_batch = torch.from_numpy(return_batch).to('cuda')
old_action_log_probs_batch = torch.from_numpy(
old_action_log_probs_batch).to('cuda')
adv_targ = torch.from_numpy(adv_targ).to('cuda')
value_loss, action_loss, dist_entropy = self.alg.learn(
obs_batch, actions_batch, value_preds_batch, return_batch,
old_action_log_probs_batch, adv_targ)
value_loss_epoch += value_loss
action_loss_epoch += action_loss
dist_entropy_epoch += dist_entropy
num_updates = ppo_epoch * num_mini_batch
value_loss_epoch /= num_updates
action_loss_epoch /= num_updates
dist_entropy_epoch /= num_updates
return value_loss_epoch, action_loss_epoch, dist_entropy_epoch
def value(self, obs):
obs = torch.from_numpy(obs).to(self.device)
return self.alg.value(obs).cpu().numpy()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import parl
import torch
import torch.nn as nn
import torch.nn.functional as F
from torch.distributions import Normal
class MujocoModel(parl.Model):
def __init__(self, obs_dim, act_dim):
super(MujocoModel, self).__init__()
self.actor = Actor(obs_dim, act_dim)
self.critic = Critic(obs_dim)
def policy(self, obs):
return self.actor(obs)
def value(self, obs):
return self.critic(obs)
class Actor(parl.Model):
def __init__(self, obs_dim, act_dim):
super(Actor, self).__init__()
self.fc1 = nn.Linear(obs_dim, 64)
self.fc2 = nn.Linear(64, 64)
self.fc_mean = nn.Linear(64, act_dim)
self.log_std = nn.Parameter(torch.zeros(act_dim))
def forward(self, obs):
x = torch.tanh(self.fc1(obs))
x = torch.tanh(self.fc2(x))
mean = self.fc_mean(x)
return mean, self.log_std
class Critic(parl.Model):
def __init__(self, obs_dim):
super(Critic, self).__init__()
self.fc1 = nn.Linear(obs_dim, 64)
self.fc2 = nn.Linear(64, 64)
self.fc3 = nn.Linear(64, 1)
def forward(self, obs):
x = torch.tanh(self.fc1(obs))
x = torch.tanh(self.fc2(x))
value = self.fc3(x)
return value
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler
class RolloutStorage(object):
def __init__(self, num_steps, obs_dim, act_dim):
self.num_steps = num_steps
self.obs_dim = obs_dim
self.act_dim = act_dim
self.obs = np.zeros((num_steps + 1, obs_dim), dtype='float32')
self.actions = np.zeros((num_steps, act_dim), dtype='float32')
self.value_preds = np.zeros((num_steps + 1, ), dtype='float32')
self.returns = np.zeros((num_steps + 1, ), dtype='float32')
self.action_log_probs = np.zeros((num_steps, ), dtype='float32')
self.rewards = np.zeros((num_steps, ), dtype='float32')
self.masks = np.ones((num_steps + 1, ), dtype='bool')
self.bad_masks = np.ones((num_steps + 1, ), dtype='bool')
self.step = 0
def append(self, obs, actions, action_log_probs, value_preds, rewards,
masks, bad_masks):
"""
print("obs")
print(obs)
print("masks")
print(masks)
print("rewards")
print(rewards)
exit()
"""
self.obs[self.step + 1] = obs
self.actions[self.step] = actions
self.rewards[self.step] = rewards
self.action_log_probs[self.step] = action_log_probs
self.value_preds[self.step] = value_preds
self.masks[self.step + 1] = masks
self.bad_masks[self.step + 1] = bad_masks
self.step = (self.step + 1) % self.num_steps
def sample_batch(self,
next_value,
gamma,
gae_lambda,
num_mini_batch,
mini_batch_size=None):
# calculate return and advantage first
self.compute_returns(next_value, gamma, gae_lambda)
advantages = self.returns[:-1] - self.value_preds[:-1]
advantages = (advantages - advantages.mean()) / (
advantages.std() + 1e-5)
# generate sample batch
mini_batch_size = self.num_steps // num_mini_batch
sampler = BatchSampler(
SubsetRandomSampler(range(self.num_steps)),
mini_batch_size,
drop_last=True)
for indices in sampler:
obs_batch = self.obs[:-1][indices]
actions_batch = self.actions[indices]
value_preds_batch = self.value_preds[:-1][indices]
returns_batch = self.returns[:-1][indices]
old_action_log_probs_batch = self.action_log_probs[indices]
value_preds_batch = value_preds_batch.reshape(-1, 1)
returns_batch = returns_batch.reshape(-1, 1)
old_action_log_probs_batch = old_action_log_probs_batch.reshape(
-1, 1)
adv_targ = advantages[indices]
adv_targ = adv_targ.reshape(-1, 1)
yield obs_batch, actions_batch, value_preds_batch, returns_batch, old_action_log_probs_batch, adv_targ
def after_update(self):
self.obs[0] = np.copy(self.obs[-1])
self.masks[0] = np.copy(self.masks[-1])
self.bad_masks[0] = np.copy(self.bad_masks[-1])
def compute_returns(self, next_value, gamma, gae_lambda):
self.value_preds[-1] = next_value
gae = 0
for step in reversed(range(self.rewards.size)):
delta = self.rewards[step] + gamma * self.value_preds[
step + 1] * self.masks[step + 1] - self.value_preds[step]
gae = delta + gamma * gae_lambda * self.masks[step + 1] * gae
gae = gae * self.bad_masks[step + 1]
self.returns[step] = gae + self.value_preds[step]
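# Reference-only sketch (not used by training) of the GAE(lambda) recursion that
# compute_returns implements:
#   delta_t  = r_t + gamma * V_{t+1} * mask_{t+1} - V_t
#   A_t      = delta_t + gamma * lambda * mask_{t+1} * A_{t+1}
#   return_t = A_t + V_t
# bad_masks, which reset the advantage at time-limit truncations, are omitted here.
def _reference_gae(rewards, values, masks, gamma, gae_lambda):
    """`values` has one extra trailing element: the bootstrap value for the last state."""
    rewards = np.asarray(rewards, dtype='float32')
    returns = np.zeros_like(rewards)
    gae = 0.0
    for t in reversed(range(len(rewards))):
        delta = rewards[t] + gamma * values[t + 1] * masks[t + 1] - values[t]
        gae = delta + gamma * gae_lambda * masks[t + 1] * gae
        returns[t] = gae + values[t]
    return returns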
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# modified from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail
import copy
import os
from collections import deque
import gym
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import utils
from arguments import get_args
from wrapper import make_env
from mujoco_model import MujocoModel
from parl.algorithms import PPO
from mujoco_agent import MujocoAgent
from storage import RolloutStorage
from evaluation import evaluate
def main():
args = get_args()
torch.manual_seed(args.seed)
torch.cuda.manual_seed_all(args.seed)
torch.set_num_threads(1)
device = torch.device("cuda:0" if args.cuda else "cpu")
envs = make_env(args.env_name, args.seed, args.gamma)
model = MujocoModel(envs.observation_space.shape[0],
envs.action_space.shape[0])
model.to(device)
algorithm = PPO(
model,
args.clip_param,
args.value_loss_coef,
args.entropy_coef,
initial_lr=args.lr,
eps=args.eps,
max_grad_norm=args.max_grad_norm)
agent = MujocoAgent(algorithm, device)
rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
envs.action_space.shape[0])
obs = envs.reset()
rollouts.obs[0] = np.copy(obs)
episode_rewards = deque(maxlen=10)
num_updates = int(args.num_env_steps) // args.num_steps
for j in range(num_updates):
if args.use_linear_lr_decay:
# decrease learning rate linearly
utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
args.lr)
for step in range(args.num_steps):
# Sample actions
with torch.no_grad():
value, action, action_log_prob = agent.sample(
rollouts.obs[step])  # act on the observation stored in rollouts at this step
# Observe reward and next obs
obs, reward, done, infos = envs.step(action)
for info in infos:
if 'episode' in info.keys():
episode_rewards.append(info['episode']['r'])
# If done then clean the history of observations.
masks = torch.FloatTensor(
[[0.0] if done_ else [1.0] for done_ in done])
bad_masks = torch.FloatTensor(
[[0.0] if 'bad_transition' in info.keys() else [1.0]
for info in infos])
rollouts.append(obs, action, action_log_prob, value, reward, masks,
bad_masks)
with torch.no_grad():
next_value = agent.value(rollouts.obs[-1])
value_loss, action_loss, dist_entropy = agent.learn(
next_value, args.gamma, args.gae_lambda, args.ppo_epoch,
args.num_mini_batch, rollouts)
rollouts.after_update()
if j % args.log_interval == 0 and len(episode_rewards) > 1:
total_num_steps = (j + 1) * args.num_steps
print(
"Updates {}, num timesteps {},\n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n"
.format(j, total_num_steps, len(episode_rewards),
np.mean(episode_rewards), np.median(episode_rewards),
np.min(episode_rewards), np.max(episode_rewards),
dist_entropy, value_loss, action_loss))
if (args.eval_interval is not None and len(episode_rewards) > 1
and j % args.eval_interval == 0):
ob_rms = utils.get_vec_normalize(envs).ob_rms
eval_mean_reward = evaluate(agent, ob_rms, args.env_name,
args.seed, device)
if __name__ == "__main__":
main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import glob
import os
import torch
import torch.nn as nn
from wrapper import VecNormalize
def get_vec_normalize(venv):
if isinstance(venv, VecNormalize):
return venv
elif hasattr(venv, 'venv'):
return get_vec_normalize(venv.venv)
return None
def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr):
"""Decreases the learning rate linearly"""
lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs)))
for param_group in optimizer.param_groups:
param_group['lr'] = lr
def init(module, weight_init, bias_init, gain=1):
weight_init(module.weight.data, gain=gain)
bias_init(module.bias.data)
return module
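# Hedged usage sketch of the helper above: build an orthogonally-initialized
# linear layer with zero bias (the layer sizes and gain are illustrative only).
def _example_init_usage():
    return init(
        nn.Linear(64, 2),
        nn.init.orthogonal_,
        lambda b: nn.init.constant_(b, 0),
        gain=0.01)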
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# Simplified version of https://github.com/ShangtongZhang/DeepRL/blob/master/deep_rl/component/envs.py
import numpy as np
import gym
from gym.core import Wrapper
import time
class TimeLimitMask(gym.Wrapper):
def step(self, action):
obs, rew, done, info = self.env.step(action)
if done and self.env._max_episode_steps == self.env._elapsed_steps:
info['bad_transition'] = True
return obs, rew, done, info
def reset(self, **kwargs):
return self.env.reset(**kwargs)
class MonitorEnv(gym.Wrapper):
def __init__(self, env):
Wrapper.__init__(self, env=env)
self.tstart = time.time()
self.rewards = None
def step(self, action):
ob, rew, done, info = self.env.step(action)
self.update(ob, rew, done, info)
return (ob, rew, done, info)
def update(self, ob, rew, done, info):
self.rewards.append(rew)
if done:
eprew = sum(self.rewards)
eplen = len(self.rewards)
epinfo = {
"r": round(eprew, 6),
"l": eplen,
"t": round(time.time() - self.tstart, 6)
}
assert isinstance(info, dict)
info['episode'] = epinfo
self.reset()
def reset(self, **kwargs):
self.rewards = []
return self.env.reset(**kwargs)
class VectorEnv(gym.Wrapper):
def step(self, action):
ob, rew, done, info = self.env.step(action)
ob = np.array(ob)
ob = ob[np.newaxis, :]
rew = np.array([rew])
done = np.array([done])
info = [info]
return (ob, rew, done, info)
class RunningMeanStd(object):
# https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm
def __init__(self, epsilon=1e-4, shape=()):
self.mean = np.zeros(shape, 'float64')
self.var = np.ones(shape, 'float64')
self.count = epsilon
def update(self, x):
batch_mean = np.mean(x, axis=0)
batch_var = np.var(x, axis=0)
batch_count = x.shape[0]
self.update_from_moments(batch_mean, batch_var, batch_count)
def update_from_moments(self, batch_mean, batch_var, batch_count):
self.mean, self.var, self.count = update_mean_var_count_from_moments(
self.mean, self.var, self.count, batch_mean, batch_var,
batch_count)
def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var,
batch_count):
delta = batch_mean - mean
tot_count = count + batch_count
new_mean = mean + delta * batch_count / tot_count
m_a = var * count
m_b = batch_var * batch_count
M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count
new_var = M2 / tot_count
new_count = tot_count
return new_mean, new_var, new_count
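# Illustrative self-check (not used in training): the streaming statistics should
# closely match np.mean/np.var over the concatenated batches; the epsilon used to
# initialize the count makes the match approximate rather than exact.
def _check_running_mean_std():
    rms = RunningMeanStd(shape=(3, ))
    batches = [np.random.randn(32, 3) for _ in range(5)]
    for batch in batches:
        rms.update(batch)
    stacked = np.concatenate(batches, axis=0)
    assert np.allclose(rms.mean, stacked.mean(axis=0), atol=1e-2)
    assert np.allclose(rms.var, stacked.var(axis=0), atol=1e-2)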
class VecNormalize(gym.Wrapper):
def __init__(self,
env,
ob=True,
ret=True,
clipob=10.,
cliprew=10.,
gamma=0.99,
epsilon=1e-8):
Wrapper.__init__(self, env=env)
observation_space = env.observation_space.shape[0]
self.ob_rms = RunningMeanStd(shape=observation_space) if ob else None
self.ret_rms = RunningMeanStd(shape=()) if ret else None
self.clipob = clipob
self.cliprew = cliprew
self.gamma = gamma
self.epsilon = epsilon
self.ret = np.zeros(1)
self.training = True
def step(self, action):
ob, rew, new, info = self.env.step(action)
self.ret = self.ret * self.gamma + rew
# normalize observation
ob = self._obfilt(ob)
# normalize reward
if self.ret_rms:
self.ret_rms.update(self.ret)
rew = np.clip(rew / np.sqrt(self.ret_rms.var + self.epsilon),
-self.cliprew, self.cliprew)
self.ret[new] = 0.
return ob, rew, new, info
def reset(self):
self.ret = np.zeros(1)
ob = self.env.reset()
return self._obfilt(ob)
def _obfilt(self, ob, update=True):
if self.ob_rms:
if self.training and update:
self.ob_rms.update(ob)
ob = np.clip((ob - self.ob_rms.mean) /
np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob,
self.clipob)
return ob
else:
return ob
def train(self):
self.training = True
def eval(self):
self.training = False
def make_env(env_name, seed, gamma):
env = gym.make(env_name)
env.seed(seed)
env = TimeLimitMask(env)
env = MonitorEnv(env)
env = VectorEnv(env)
if gamma is None:
env = VecNormalize(env, ret=False)
else:
env = VecNormalize(env, gamma=gamma)
return env
......@@ -15,7 +15,7 @@
import gym
import argparse
import numpy as np
from parl.utils import logger, tensorboard, ReplayMemory
from parl.utils import logger, summary, ReplayMemory
from mujoco_model import MujocoModel
from mujoco_agent import MujocoAgent
......@@ -103,8 +103,7 @@ def main():
train_reward, steps = run_train_episode(env, agent, rpm)
total_steps += steps
logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward))
tensorboard.add_scalar('train/episode_reward', train_reward,
total_steps)
summary.add_scalar('train/episode_reward', train_reward, total_steps)
if total_steps // args.test_every_steps >= test_flag:
while total_steps // args.test_every_steps >= test_flag:
......@@ -112,8 +111,8 @@ def main():
evaluate_reward = run_evaluate_episode(env, agent)
logger.info('Steps {}, Evaluate reward: {}'.format(
total_steps, evaluate_reward))
tensorboard.add_scalar('eval/episode_reward', evaluate_reward,
total_steps)
summary.add_scalar('eval/episode_reward', evaluate_reward,
total_steps)
if __name__ == '__main__':
......
minimal example
---------------------
``Goal of this tutorial:
demonstrate how to solve the classic CartPole problem with the EvoKit library.``
*This tutorial assumes that the reader has used PaddlePaddle before and knows the basic iteration loop of evolutionary algorithms.*
Introduction to CartPole
#########################
CartPole is also known as the inverted pendulum. A pole stands on a cart and tends to fall over because of gravity. To keep the pole upright, we have to move the cart, as shown in the figure below.
At every time step, the model receives a 4-dimensional vector describing the current state of the cart and the pole, and outputs a signal that moves the cart left or right. As long as the pole has not fallen, the environment gives a reward of 1 at each time step; once the pole falls, no reward is given and the episode ends.
.. image:: ../../examples/QuickStart/performance.gif
:width: 300px
Step 1: build the inference network
######################################
Following the environment description above, we need a neural network that takes a 4-dimensional vector as input and outputs a 2-dimensional probability distribution over moving left or right.
Here we implement the inference network with Paddle and save it to disk.
.. code-block:: python
from paddle import fluid
def net(obs, act_dim):
hid1 = fluid.layers.fc(obs, size=20)
prob = fluid.layers.fc(hid1, size=act_dim, act='softmax')
return prob
if __name__ == '__main__':
obs_dim = 4
act_dim = 2
obs = fluid.layers.data(name="obs", shape=[obs_dim], dtype='float32')
prob = net(obs, act_dim)
exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
fluid.io.save_inference_model(
dirname='init_model',
feeded_var_names=['obs'],
target_vars=[prob],
params_filename='params',
model_filename='model',
executor=exe)
Step 2: construct an ESAgent
##############################
- Call ``load_config`` to load the configuration file.
- Call ``load_inference_model`` to load the model parameters.
- Call ``init_solver`` to initialize the solver.
The configuration file mainly specifies the type of evolutionary algorithm (e.g. Gaussian sampling or CMA) and the optimizer to use (Adam or SGD).
.. code-block:: c++
ESAgent agent = ESAgent();
agent.load_config(config);
agent.load_inference_model(model_dir);
agent.init_solver();
// Appendix: example EvoKit configuration
solver {
type: BASIC_ES
optimizer { // offline Adam updates
type: ADAM
base_lr: 0.05
adam {
beta1: 0.9
beta2: 0.999
epsilon: 1e-08
}
}
sampling { // online Gaussian sampling
type: GAUSSIAN_SAMPLING
gaussian_sampling {
std: 0.5
cached: true
seed: 1024
cache_size : 100000
}
}
}
Step 3: create agents for sampling
####################################
Three interfaces matter here:
- Call ``clone`` to create an agent used for sampling.
- Call ``add_noise`` to perturb the agent's parameters; it also returns a unique record of the noise, which must be written to the log for the offline update.
- Call ``predict`` to run inference.
.. code-block:: c++
auto sampling_agent = agent.clone();
auto sampling_info = sampling_agent.add_noise();
sampling_agent.predict(feature);
Step 4: update the model parameters with the sampled data
############################################################
The user provides two sets of data:
- the sampling_info recorded during sampling, used to reproduce the sampling noise offline
- the evaluation results of the perturbed parameters
.. code-block:: c++
agent.update(sampling_infos, rewards);
Main code with comments
#########################
The following code samples with multiple threads in parallel to speed up training.
.. code-block:: c++
int main(int argc, char* argv[]) {
std::vector<CartPole> envs;
// create 10 environments for multi-threaded training
for (int i = 0; i < ITER; ++i) {
envs.push_back(CartPole());
}
// initialize the ESAgent
std::string model_dir = "./demo/cartpole/init_model";
std::string config_path = "./demo/cartpole/config.prototxt";
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(config_path); // load the configuration
agent->load_inference_model(FLAGS_model_dir); // load the initial inference model
agent->init_solver(); // initialize the solver; note: must be called after load_inference_model
// create 10 agents for parallel sampling
std::vector<std::shared_ptr<ESAgent>> sampling_agents;
for (int i = 0; i < ITER; ++i) {
sampling_agents.push_back(agent->clone());
}
std::vector<SamplingInfo> sampling_infos;
std::vector<float> rewards(ITER, 0.0f);
sampling_infos.resize(ITER);
omp_set_num_threads(10);
// run 100 epochs in total
for (int epoch = 0; epoch < 100; ++epoch) {
#pragma omp parallel for schedule(dynamic, 1)
for (int i = 0; i < ITER; ++i) {
std::shared_ptr<ESAgent> sampling_agent = sampling_agents[i];
SamplingInfo sampling_info;
sampling_agent->add_noise(sampling_info);
float reward = evaluate(envs[i], sampling_agent);
// save the sampled sampling_info and the corresponding evaluation reward
sampling_infos[i] = sampling_info;
rewards[i] = reward;
}
// update the model parameters; note: after the update they are automatically synced to the sampling agents
agent->update(sampling_infos, rewards);
int reward = evaluate(envs[0], agent);
LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; // 打印每一轮reward
}
}
How to run the demo
#####################
- Download the code
Clone the code from icode; the repository path is ``baidu/nlp/deep-es``. ``TO DO: update the repository path``
- Build the demo
Build on the bcloud cloud cluster with the command ``bb``.
- Run the demo
After the build finishes, add the shared-library search path:
``export LD_LIBRARY_PATH=./output/so/:$LD_LIBRARY_PATH``
Run the demo: ``./output/bin/cartpole/train``
Troubleshooting
####################
If you run into any problems, please join Hi group 1692822 (the official PARL support group); the developers will answer any usage questions directly.
Example for Online Products
############################
``Goal of this tutorial: show how to iterate the algorithm and update the model parameters after going online with the EvoKit library.``
In production, user logs cannot be collected online in real time. The usual workflow is to store user click/dwell-time logs, update the model offline from that data, and then push the new model online to complete an algorithm update.
This tutorial again uses the classic CartPole environment to show how to iterate the ES algorithm through online sampling and offline updates.
The complete demo code is in the ``demp/online_example`` folder.
``TO DO: folder``
Initialize the solver
---------------------
Construct the solver, initialize it, and save it to a file. Initializing the solver only needs to be done once, at the very beginning.
.. code-block:: c++
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(FLAGS_config_path);
agent->load_inference_model(FLAGS_model_dir);
agent->init_solver();
agent->save_solver(FLAGS_model_dir);
Online sampling
---------------------
Load the model and the solver, record the sampling_info returned by online sampling together with the evaluated rewards, and write them to a log file in binary format.
.. code-block:: c++
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(FLAGS_config_path);
agent->load_inference_model(FLAGS_model_dir);
agent->load_solver(FLAGS_model_dir);
#pragma omp parallel for schedule(dynamic, 1)
for (int i = 0; i < ITER; ++i) {
std::shared_ptr<ESAgent> sampling_agent = sampling_agents[i];
SamplingInfo sampling_info;
sampling_agent->add_noise(sampling_info);
float reward = evaluate(envs[i], sampling_agent);
sampling_infos[i] = sampling_info;
rewards[i] = reward;
}
// save the sampling information and rewards to the log in binary format
std::ofstream log_stream(FLAGS_log_path, std::ios::binary);
for (int i = 0; i < ITER; ++i) {
std::string data;
sampling_infos[i].SerializeToString(&data);
int size = data.size();
log_stream.write((char*) &rewards[i], sizeof(float));
log_stream.write((char*) &size, sizeof(int));
log_stream.write(data.c_str(), size);
}
log_stream.close();
Offline update
-----------------------
After loading the previously recorded log, call ``update`` to update the model, then save the updated parameters locally with ``save_inference_model`` and ``save_solver`` and push them online.
.. code-block:: c++
std::shared_ptr<ESAgent> agent = std::make_shared<ESAgent>();
agent->load_config(FLAGS_config_path);
agent->load_inference_model(FLAGS_model_dir);
agent->load_solver(FLAGS_model_dir);
// load training data
std::vector<SamplingInfo> sampling_infos;
std::vector<float> rewards(ITER, 0.0f);
sampling_infos.resize(ITER);
std::ifstream log_stream(FLAGS_log_path);
CHECK(log_stream.good()) << "[EvoKit] cannot open log: " << FLAGS_log_path;
char buffer[1000];
for (int i = 0; i < ITER; ++i) {
int size;
log_stream.read((char*) &rewards[i], sizeof(float));
log_stream.read((char*) &size, sizeof(int));
log_stream.read(buffer, size);
buffer[size] = 0;
std::string data(buffer);
sampling_infos[i].ParseFromString(data);
}
// update model and save parameter
agent->update(sampling_infos, rewards);
agent->save_inference_model(FLAGS_updated_model_dir);
agent->save_solver(FLAGS_updated_model_dir);
Main script
-----------------------
Compile the code above into three separate executables:
- initialize the solver: ``init_solver``
- online sampling: ``online_sampling``
- offline update: ``offline_update``
.. code-block:: shell
#------------------------init solver------------------------
./init_solver \
--model_dir="./model_warehouse/model_dir_0" \
--config_path="config.prototxt"
for ((epoch=0;epoch<200;++epoch));do
#------------------------online sampling------------------------
./online_sampling \
--log_path="./sampling_log" \
--model_dir="./model_warehouse/model_dir_$epoch" \
--config_path="./config.prototxt"
#------------------------offline update------------------------
next_epoch=$((epoch+1))
./offline_update \
--log_path='./sampling_log' \
--model_dir="./model_warehouse/model_dir_$epoch" \
--updated_model_dir="./model_warehouse/model_dir_${next_epoch}" \
--config_path="./config.prototxt"
done
Overview
------------------
``EvoKit`` is an evolution-strategy library that integrates multiple evolutionary algorithms and is compatible with a wide range of inference frameworks, with a focus on **fast online deployment and validation**.
.. image:: ../../evo_kit/DeepES.gif
:align: center
:width: 400px
Features
#########
**1. Multiple evolutionary algorithms.** Supports Gaussian sampling, CMA, GA and other algorithms, with more being added continuously.
**2. Mainstream optimizers.** Supports SGD/Momentum/Adam and other popular optimizers to improve convergence efficiency.
**3. One-stop deployment.** Integrates the online sampling and offline update workflow and provides Bcloud/CMake builds to help you go online quickly.
**4. Compatible with all major deep learning frameworks.** EvoKit works with hand-written networks as well as Paddle/Lego/Torch models.
**5. Synchronous/asynchronous updates.** Supports asynchronous updates from multiple sampling models and multiple batches of sampled data, matching real production scenarios.
......@@ -101,3 +101,37 @@ def setup(app):
add_module_names = False
latex_engine = 'xelatex'
latex_use_xindy = False
latex_elements = {
'preamble': '\\usepackage[UTF8]{ctex}\n',
}
latex_elements = {
# The paper size ('letterpaper' or 'a4paper').
#'papersize': 'letterpaper',
# The font size ('10pt', '11pt' or '12pt').
#'pointsize': '10pt',
# Additional stuff for the LaTeX preamble.
#'preamble': '',
'preamble':
r'''
\hypersetup{unicode=true}
\usepackage{CJKutf8}
\DeclareUnicodeCharacter{00A0}{\nobreakspace}
\DeclareUnicodeCharacter{2203}{\ensuremath{\exists}}
\DeclareUnicodeCharacter{2200}{\ensuremath{\forall}}
\DeclareUnicodeCharacter{2286}{\ensuremath{\subseteq}}
\DeclareUnicodeCharacter{2713}{x}
\DeclareUnicodeCharacter{27FA}{\ensuremath{\Longleftrightarrow}}
\DeclareUnicodeCharacter{221A}{\ensuremath{\sqrt{}}}
\DeclareUnicodeCharacter{221B}{\ensuremath{\sqrt[3]{}}}
\DeclareUnicodeCharacter{2295}{\ensuremath{\oplus}}
\DeclareUnicodeCharacter{2297}{\ensuremath{\otimes}}
\begin{CJK}{UTF8}{gbsn}
\AtEndDocument{\end{CJK}}
''',
}
......@@ -46,7 +46,7 @@ Abstractions
:maxdepth: 1
:caption: Installation
installation.rst
installation.rst
.. toctree::
:maxdepth: 1
......@@ -58,9 +58,10 @@ Abstractions
:maxdepth: 1
:caption: Tutorial
getting_started.rst
new_alg.rst
save_param.rst
tutorial/getting_started.rst
tutorial/new_alg.rst
tutorial/save_param.rst
tutorial/tensorboard.rst
.. toctree::
:maxdepth: 2
......@@ -83,3 +84,11 @@ Abstractions
model.rst
algorithm.rst
agent.rst
.. toctree::
:maxdepth: 2
:caption: EvoKit
EvoKit/overview.rst
EvoKit/minimal_example.rst
EvoKit/online_example.rst
......@@ -178,9 +178,9 @@ Then we use this agent to interact with the environment, and run around 1000 epi
Summary
-----------
.. image:: ../examples/QuickStart/performance.gif
.. image:: ../../examples/QuickStart/performance.gif
:width: 300px
.. image:: ./images/quickstart.png
.. image:: ../images/quickstart.png
:width: 300px
In this tutorial, we have shown how to build an agent step-by-step to solve the `Cartpole` problem.
......
......@@ -59,7 +59,6 @@ Within class ``DQN(Algorithm)``, we define the following methods:
Args:
model (parl.Model): model defining forward network of Q function
hyperparas (dict): (deprecated) dict of hyper parameters.
act_dim (int): dimension of the action space
gamma (float): discounted factor for reward computation.
lr (float): learning rate.
......
......@@ -22,5 +22,5 @@ Here is a demonstration of usage:
agent.restore('./model.ckpt')
# restore the parameters from ./model.ckpt to another_agent
another_agent = AtariAgent()
another_agent = AtariAgent()
another_agent.restore('./model.ckpt')
summary
===============
Visualize the results with tensorboard.
add_scalar
-------------
Commonly used arguments:
* summary.add_scalar(tag, scalar_value, global_step=None)
* tag *(string)* – Data identifier
* scalar_value *(float or string/blobname)* – Value to save
* global_step *(int)* – Global step value to record
Example:
.. code-block:: python
from parl.utils import summary
x = range(100)
for i in x:
summary.add_scalar('y=2x', i * 2, i)
Expected result:
.. image:: add_scalar.jpg
:scale: 50 %
add_histogram
----------------
Commonly used arguments:
* summary.add_histogram(tag, values, global_step=None)
* tag *(string)* – Data identifier
* values *(torch.Tensor, numpy.array, or string/blobname)* – Values to build histogram
* global_step *(int)* – Global step value to record
Example:
.. code-block:: python
from parl.utils import summary
import numpy as np
for i in range(10):
x = np.random.random(1000)
summary.add_histogram('distribution centers', x + i, i)
Expected result:
.. image:: add_histogram.jpg
:scale: 50 %
cmake_minimum_required (VERSION 2.6)
project (EvoKit)
########## options ##########
option(WITH_PADDLE "Compile EvoKit with PaddleLite framework." OFF)
option(WITH_TORCH "Compile EvoKit with Torch framework." OFF)
message("WITH_PADDLE: "${WITH_PADDLE})
message("WITH_TORCH: "${WITH_TORCH})
if (NOT (WITH_PADDLE OR WITH_TORCH))
message("ERROR: You should choose at least one framework to compile EvoKit.")
return()
elseif(WITH_PADDLE AND WITH_TORCH)
message("ERROR: You cannot choose more than one framework to compile EvoKit.")
return()
endif()
set(CMAKE_CXX_STANDARD 11)
set(CMAKE_CXX_STANDARD_REQUIRED ON)
set(CMAKE_CXX_EXTENSIONS OFF)
find_package(OpenMP)
if (OPENMP_FOUND)
set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}")
set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}")
set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}")
endif()
file(GLOB src "core/src/*.cc" "core/proto/evo_kit/*.cc")
include_directories("core/include")
include_directories("core/proto")
include_directories("benchmark")
########## PaddleLite config ##########
if (WITH_PADDLE)
add_definitions(-g -O3 -pthread)
include_directories("paddle/include")
include_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/include"
"${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/include")
link_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/lib"
"${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/lib")
file(GLOB framework_src "paddle/src/*.cc")
set(TARGET EvoKit_paddle)
########## Torch config ##########
elseif (WITH_TORCH)
# list(APPEND CMAKE_PREFIX_PATH "./libtorch")
# find_package(Torch REQUIRED ON) # TODO: not necessary for now
include_directories("torch/include")
file(GLOB framework_src "torch/src/*.cc")
set(TARGET EvoKit_torch)
else ()
message("ERROR: You should choose at least one framework to compile EvoKit.")
endif()
add_library(${TARGET} STATIC ${src} ${framework_src})
target_link_libraries(${TARGET} gflags protobuf pthread glog)
# ########## PaddleLite libraries ##########
# if (WITH_PADDLE)
# target_link_libraries(${TARGET} -lpaddle_full_api_shared)
# target_link_libraries(${TARGET} -lmklml_intel)
# target_link_libraries(${TARGET} -ldl)
# ########## Torch libraries ##########
# elseif (WITH_TORCH)
# target_link_libraries(${TARGET} "${TORCH_LIBRARIES}")
# endif()
file(GLOB include "core/include/evo_kit/*.h")
file(GLOB proto_include "core/proto/evo_kit/*.h")
file(GLOB torch_include "torch/include/evo_kit/*.h")
file(GLOB paddle_include "paddle/include/evo_kit/*.h")
file(GLOB benchmark_include "benchmark/*.h")
file(GLOB findcmake "cmake/Torch/*.cmake")
set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}/libevokit")
install(TARGETS ${TARGET} ARCHIVE DESTINATION "lib")
install(FILES ${include} ${proto_include} DESTINATION "include/evo_kit")
install(FILES ${torch_include} DESTINATION "torch/evo_kit")
install(FILES ${paddle_include} DESTINATION "paddle/evo_kit")
install(FILES ${benchmark_include} DESTINATION "include")
install(FILES ${findcmake} DESTINATION "cmake/Torch")
# EvoKit
EvoKit is an evolution-strategy library that integrates multiple evolutionary algorithms and is compatible with a wide range of inference frameworks, with a focus on fast online deployment and validation.
<p align="center">
<img src="DeepES.gif" alt="PARL" width="500"/>
</p>
## Usage example
```c++
//Instantiate an agent; the config file specifies the model to load, the sampling method (Gaussian/CMA sampling, ...), the update method (SGD/Adam), etc.
auto agent = ESAgent(config);
for (int i = 0; i < 10; ++i) {
auto sampling_agent = agent->clone(); // clone a sampling agent
SamplingInfo info;
sampling_agent->add_noise(info); // perturb the parameters and record the random seed in info
int reward = evaluate(env, sampling_agent); // evaluate the perturbed parameters
noisy_info.push_back(info); // record the seed of the sampled noise
noisy_rewards.push_back(reward); // record the evaluation result
}
//Update the parameters from the evaluation results and the random seeds, then repeat the loop above until convergence.
agent->update(noisy_info, noisy_rewards);
```
## One-click demos
- **PaddleLite**: sh ./scripts/build.sh paddle
- **Torch**: sh ./scripts/build.sh torch
- **Hand-written networks**
## Dependencies
- Protobuf2
- OpenMP
- [glog](https://github.com/google/glog)
- [gflags](https://github.com/gflags/gflags/blob/master/INSTALL.md)
## Additional dependencies
### Using PaddleLite
Download the prebuilt X86 PaddleLite library, or build PaddleLite from source, to obtain the inference_lite_lib folder and put it in the current directory. (See: [Deploying PaddleLite on X86](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html))
### Using Torch
Download [libtorch](https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip) or build Torch from source to obtain the libtorch folder and put it in the current directory.
// Third party code
// This code is copied or modified from openai/gym's cartpole.py
#include <iostream>
#include <random>
#include <cassert>
#include <vector>
const double kPi = 3.1415926535898;
class CartPole {
public:
double gravity = 9.8;
double masscart = 1.0;
double masspole = 0.1;
double total_mass = (masspole + masscart);
double length = 0.5; // actually half the pole's length;
double polemass_length = (masspole * length);
double force_mag = 10.0;
double tau = 0.02; // seconds between state updates;
// Angle at which to fail the episode
double theta_threshold_radians = 12 * 2 * kPi / 360;
double x_threshold = 2.4;
int steps_beyond_done = -1;
std::vector<float> state = {0, 0, 0, 0};
double reward;
bool done;
int step_ = 0;
const float* getState() {
return state.data();
}
double getReward() {
return reward;
}
double isDone() {
return done;
}
void reset() {
std::random_device rd;
std::default_random_engine generator(rd());
std::uniform_real_distribution<float> distribution(-0.05, 0.05);
for (int i = 0; i < 4; ++i) {
state[i] = distribution(generator);
}
steps_beyond_done = -1;
step_ = 0;
}
CartPole() {
reset();
}
void step(int action) {
float x = state[0];
float x_dot = state[1];
float theta = state[2];
float theta_dot = state[3];
auto force = (action == 1) ? force_mag : -force_mag;
auto costheta = std::cos(theta);
auto sintheta = std::sin(theta);
auto temp = (force + polemass_length * theta_dot * theta_dot * sintheta) /
total_mass;
auto thetaacc = (gravity * sintheta - costheta * temp) /
(length * (4.0 / 3.0 - masspole * costheta * costheta / total_mass));
auto xacc = temp - polemass_length * thetaacc * costheta / total_mass;
x = x + tau * x_dot;
x_dot = x_dot + tau * xacc;
theta = theta + tau * theta_dot;
theta_dot = theta_dot + tau * thetaacc;
state = {x, x_dot, theta, theta_dot};
done = x < -x_threshold || x > x_threshold ||
theta < -theta_threshold_radians || theta > theta_threshold_radians ||
step_ > 200;
if (!done) {
reward = 1.0;
} else if (steps_beyond_done == -1) {
// Pole just fell!
steps_beyond_done = 0;
reward = 0;
} else {
if (steps_beyond_done == 0) {
assert(false); // Can't do this
}
}
step_++;
}
};
# FindEvoKit
# -------
#
# Finds the EvoKit library
#
# This will define the following variables:
#
# EVOKIT_FOUND -- True if the system has the EvoKit library
# EVOKIT_INCLUDE_DIRS -- The include directories for EvoKit
# EVOKIT_LIBRARY -- Libraries to link against
#
# and the following imported targets:
#
# EvoKit
include(FindPackageHandleStandardArgs)
if (DEFINED ENV{EVOKIT_INSTALL_PREFIX})
set(EVOKIT_INSTALL_PREFIX $ENV{EVOKIT_INSTALL_PREFIX})
else()
# Assume we are in <install-prefix>/cmake/Torch/EvoKitConfig.cmake
get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH)
get_filename_component(EVOKIT_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE)
endif()
# Include directories.
if (EXISTS "${EVOKIT_INSTALL_PREFIX}/include")
set(EVOKIT_INCLUDE_DIRS
${EVOKIT_INSTALL_PREFIX}/include
${EVOKIT_INSTALL_PREFIX}/torch)
else()
set(EVOKIT_INCLUDE_DIRS
${EVOKIT_INSTALL_PREFIX}/include
${EVOKIT_INSTALL_PREFIX}/torch)
endif()
find_library(EVOKIT_LIBRARY libEvoKit_torch.a PATHS "${EVOKIT_INSTALL_PREFIX}/lib")
include_directories("${EVOKIT_INSTALL_PREFIX}/torch")
include_directories("${EVOKIT_INSTALL_PREFIX}/include")
find_package_handle_standard_args(EvoKit DEFAULT_MSG EVOKIT_LIBRARY EVOKIT_INCLUDE_DIRS)
message(STATUS "EVOKIT_FOUND: ${EVOKIT_FOUND}")
message(STATUS "EVOKIT_INCLUDE_DIRS: ${EVOKIT_INCLUDE_DIRS}")
message(STATUS "EVOKIT_LIBRARY: ${EVOKIT_LIBRARY}")
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_ADAM_OPTIMIZER_H
#define EVO_KIT_ADAM_OPTIMIZER_H
#include <cmath>
#include <unordered_map>
#include "evo_kit/optimizer.h"
namespace evo_kit {
/*@brief AdamOptimizer.
* Implements Adam algorithm.
*
*@Args:
* base_lr: learning rate (default: 1e-3).
* beta1: coefficient used for computing the running average of the gradient (default: 0.9).
* beta2: coefficient used for computing the running average of the squared gradient (default: 0.999).
* epsilon: term added to the denominator to improve numerical stability (default: 1e-8).
*/
class AdamOptimizer: public Optimizer {
public:
AdamOptimizer(float base_lr, float beta1 = 0.9, float beta2 = 0.999,
float epsilon = 1e-8): Optimizer(base_lr), \
_beta1(beta1), _beta2(beta2), _epsilon(epsilon) {}
~AdamOptimizer();
protected:
void compute_step(float* gradient, int size, std::string param_name);
private:
float _beta1;
float _beta2;
float _epsilon;
std::unordered_map<std::string, float*> _momentum;
std::unordered_map<std::string, float*> _velocity;
};
}//namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H
#define EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H
#include <glog/logging.h>
#include <random>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "sampling_method.h"
#include "utils.h"
namespace evo_kit {
class CachedGaussianSampling: public SamplingMethod {
public:
CachedGaussianSampling();
~CachedGaussianSampling();
/*Initialize the sampling algorithm given the config in protobuf format.
*The EvoKit library uses a single configuration file for all sampling algorithms.
A default configuration file can be found at: . // TODO: where?
Usually you won't have to modify the configuration items of algorithms
you are not using.
*/
bool load_config(const EvoKitConfig& config);
/*@brief generate Gaussian noise and the related key.
*
*@Args:
* key: a unique key associated with the sampled noise.
* noise: a pointer to the memory that stores the noise.
* size: the number of floats to be sampled.
*
*@return:
* success: whether the Gaussian noise was generated successfully.
*/
bool sampling(int* key, float* noise, int64_t size);
/*@brief reconstruct the Gaussian noise given the key.
* This function is typically used to update the neural network parameters in an offline environment.
*
*@Args:
* key: a unique key associated with the sampled noise.
* noise: a pointer to the memory that stores the noise.
* size: the number of floats to be sampled.
*
*@return:
* success: whether the Gaussian noise was reconstructed successfully.
*/
bool resampling(int key, float* noise, int64_t size);
private:
float _std;
int _cache_size;
float* _noise_cache = nullptr;
bool _create_noise_cache();
};
}
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
//
#ifndef EVO_KIT_GAUSSIAN_SAMPLING_H
#define EVO_KIT_GAUSSIAN_SAMPLING_H
#include <random>
#include <stdio.h>
#include <stdlib.h>
#include <time.h>
#include "evo_kit/sampling_method.h"
#include "evo_kit/utils.h"
namespace evo_kit {
class GaussianSampling: public SamplingMethod {
public:
GaussianSampling() {}
~GaussianSampling() {}
/*Initialize the sampling algorithm given the config in protobuf format.
*The EvoKit library uses a single configuration file for all sampling algorithms.
A default configuration file can be found at: . // TODO: where?
Usually you won't have to modify the configuration items of algorithms
you are not using.
*/
bool load_config(const EvoKitConfig& config);
/*@brief generate Gaussian noise and the related key.
*
*@Args:
* key: a unique key associated with the sampled noise.
* noise: a pointer to the memory that stores the noise.
* size: the number of floats to be sampled.
*
*@return:
* success: whether the Gaussian noise was generated successfully.
*/
bool sampling(int* key, float* noise, int64_t size);
/*@brief reconstruct the Gaussian noise given the key.
* This function is typically used to update the neural network parameters in an offline environment.
*
*@Args:
* key: a unique key associated with the sampled noise.
* noise: a pointer to the memory that stores the noise.
* size: the number of floats to be sampled.
*
*@return:
* success: whether the Gaussian noise was reconstructed successfully.
*/
bool resampling(int key, float* noise, int64_t size);
private:
float _std;
};
}
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_OPTIMIZER_H
#define EVO_KIT_OPTIMIZER_H
#include <glog/logging.h>
#include <unordered_map>
namespace evo_kit {
/*@brief Optimizer. Base class for optimizers.
*
*@Args:
* base_lr: learning rate (default: 1e-3).
*
* .. warning: update() operates at the parameter level;
* you need to call update() once for each parameter.
*
* Subclasses are required to implement the following function:
* 1. compute_step
*/
class Optimizer {
public:
Optimizer() : _base_lr(1e-3), _update_times(0) {}
Optimizer(float base_lr) : _base_lr(base_lr), _update_times(0) {}
virtual ~Optimizer() {
_params_size.clear();
}
template<typename T>
bool update(T weights, float* gradient, int size, std::string param_name = "") {
/*@ Performs a single optimization step (parameter update) at the parameter level.
*
*@Args:
* weights (array): parameter weights.
* gradient (array): gradient for updating weights.
* size: size of gradient.
* param_name: the name corresponding to the weights.
*/
if (_params_size.count(param_name) == 0) {
_params_size[param_name] = size;
} else if (_params_size[param_name] != size) {
LOG(WARNING) << "[Warning] Update times: " << int(_update_times / _params_size.size()) \
<< ". Size of weights[" << param_name << "] is " << _params_size[param_name] << ", not " << size;
return false;
}
++_update_times;
compute_step(gradient, size, param_name);
for (int i = 0; i < size; ++i) {
weights[i] -= _base_lr * gradient[i];
}
return true;
} // template function
protected:
virtual void compute_step(float* gradient, int size, std::string param_name = "") = 0;
float _base_lr;
float _update_times;
std::unordered_map<std::string, int> _params_size;
};
}//namespace
#endif
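Since update() keeps per-parameter state keyed by name, each parameter of a model gets its own update() call. A hedged sketch follows, assuming an AdamOptimizer constructed as declared earlier and caller-owned weight/gradient buffers; the names "fc.weight" and "fc.bias" are placeholders.
// Sketch: parameter-level updates with a stable name per parameter, so the
// optimizer can keep separate momentum/velocity buffers for each one.
#include "evo_kit/adam_optimizer.h"

void apply_gradients(evo_kit::AdamOptimizer& opt,
                     float* fc_weight, float* fc_weight_grad, int weight_size,
                     float* fc_bias, float* fc_bias_grad, int bias_size) {
    opt.update(fc_weight, fc_weight_grad, weight_size, "fc.weight");
    opt.update(fc_bias, fc_bias_grad, bias_size, "fc.bias");
}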
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_OPTIMIZER_FACTORY_H
#define EVO_KIT_OPTIMIZER_FACTORY_H
#include <algorithm>
#include <glog/logging.h>
#include <memory>
#include "evo_kit/adam_optimizer.h"
#include "evo_kit/evo_kit.pb.h"
#include "evo_kit/optimizer.h"
#include "evo_kit/sgd_optimizer.h"
namespace evo_kit {
/* @brief: create an optimizer according to the configuration.
* @args:
* config: configuration for the optimizer.
*
*/
std::shared_ptr<Optimizer> create_optimizer(const OptimizerConfig& optimizer_config);
} // namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_SAMPLING_FACTORY_H
#define EVO_KIT_SAMPLING_FACTORY_H
#include <algorithm>
#include <glog/logging.h>
#include <memory>
#include "evo_kit/cached_gaussian_sampling.h"
#include "evo_kit/evo_kit.pb.h"
#include "evo_kit/gaussian_sampling.h"
#include "evo_kit/sampling_method.h"
namespace evo_kit {
/* @brief: create a sampling_method according to the configuration.
* @args:
* config: configuration for EvoKit.
*
*/
std::shared_ptr<SamplingMethod> create_sampling_method(const EvoKitConfig& Config);
} // namespace
#endif
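Both factories are typically driven by the same EvoKitConfig. A minimal wiring sketch, with error handling reduced to null checks:
// Sketch: build the sampling method and the optimizer from one EvoKitConfig.
#include <memory>
#include "evo_kit/optimizer_factory.h"
#include "evo_kit/sampling_factory.h"

bool init_from_config(const evo_kit::EvoKitConfig& config,
                      std::shared_ptr<evo_kit::SamplingMethod>& sampling,
                      std::shared_ptr<evo_kit::Optimizer>& optimizer) {
    sampling = evo_kit::create_sampling_method(config);          // cached or plain Gaussian
    optimizer = evo_kit::create_optimizer(config.optimizer());   // "SGD" or "Adam"
    return sampling != nullptr && optimizer != nullptr;
}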
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_SAMPLING_METHOD_H
#define EVO_KIT_SAMPLING_METHOD_H
#include <cstdlib>  // srand
#include <string>
#include <random>
#include "evo_kit/evo_kit.pb.h"
namespace evo_kit {
/*Base class for sampling algorithms. All algorithms are required to override the following functions:
*
* 1. load_config
* 2. sampling
* 3. resampling
*
* See gaussian_sampling.h for a demonstrative implementation.
* */
class SamplingMethod {
public:
SamplingMethod(): _seed(0) {}
virtual ~SamplingMethod() {}
/*Initialize the sampling algorithm given the config in protobuf format.
*The EvoKit library uses a single configuration file for all sampling algorithms.
A default configuration file can be found at: . // TODO: where?
Usually you won't have to modify the configuration items of algorithms
you are not using.
*/
virtual bool load_config(const EvoKitConfig& config) = 0;
/*@brief generate Gaussian noise and the related key.
*
*@Args:
* key: a unique key associated with the sampled noise.
* noise: a pointer to the memory that stores the noise.
* size: the number of floats to be sampled.
*
*@return:
* success: whether the Gaussian noise was generated successfully.
*/
virtual bool sampling(int* key, float* noise, int64_t size) = 0;
/*@brief reconstruct the Gaussian noise given the key.
* This function is typically used to update the neural network parameters in an offline environment.
*
*@Args:
* key: a unique key associated with the sampled noise.
* noise: a pointer to the memory that stores the noise.
* size: the number of floats to be sampled.
*
*@return:
* success: whether the Gaussian noise was reconstructed successfully.
*/
virtual bool resampling(int key, float* noise, int64_t size) = 0;
bool set_seed(int seed) {
_seed = seed;
srand(_seed);
return true;
}
int get_seed() {
return _seed;
}
protected:
int _seed;
};
}
#endif
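The sampling/resampling contract is easiest to see as a round trip: sampling() draws noise and hands back a key, and resampling() rebuilds exactly that noise from the key. A sketch with GaussianSampling; the buffer size is arbitrary.
// Sketch: noise drawn by sampling() is reproduced exactly by resampling(key).
#include <vector>
#include "evo_kit/gaussian_sampling.h"

bool round_trip(const evo_kit::EvoKitConfig& config) {
    evo_kit::GaussianSampling sampler;
    if (!sampler.load_config(config)) {
        return false;
    }
    const int64_t size = 256;            // arbitrary parameter size for illustration
    std::vector<float> noise(size);
    std::vector<float> rebuilt(size);
    int key = 0;
    if (!sampler.sampling(&key, noise.data(), size)) {
        return false;
    }
    // Typically executed elsewhere (e.g. on the learner side) with the same config.
    return sampler.resampling(key, rebuilt.data(), size);
}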
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_SGD_OPTIMIZER_H
#define EVO_KIT_SGD_OPTIMIZER_H
#include <cmath>
#include <unordered_map>
#include "evo_kit/optimizer.h"
namespace evo_kit {
/*@brief SGDOptimizer.
* Implements stochastic gradient descent (optionally with momentum).
*
*@Args:
* base_lr: learning rate (default: 1e-3).
* momentum: momentum factor (default: 0.9).
*/
class SGDOptimizer: public Optimizer {
public:
SGDOptimizer(float base_lr, float momentum = 0.9): Optimizer(base_lr), _momentum(momentum) {}
~SGDOptimizer();
protected:
void compute_step(float* gradient, int size, std::string param_name);
private:
float _momentum;
std::unordered_map<std::string, float*> _velocity;
};
} // namespace
#endif
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#ifndef EVO_KIT_UTILS_H
#define EVO_KIT_UTILS_H
#include <algorithm>
#include <fstream>
#include <glog/logging.h>
#include <google/protobuf/text_format.h>
#include <string>
#include "evo_kit/evo_kit.pb.h"
namespace evo_kit {
/*Return ranks normalized to [-0.5, 0.5], computed from the input rewards.
Args:
reward: an array of rewards
*/
bool compute_centered_ranks(std::vector<float>& reward);
std::string read_file(const std::string& filename);
/* Load a protobuf-based configuration from a file.
* Args:
* config_file: file path.
* proto_config: protobuf message for the configuration.
* Return:
* true on success, false otherwise.
*/
template<typename T>
bool load_proto_conf(const std::string& config_file, T& proto_config) {
bool success = true;
std::ifstream fin(config_file);
if (!fin || fin.fail()) {
LOG(ERROR) << "open prototxt config failed: " << config_file;
success = false;
} else {
fin.seekg(0, std::ios::end);
size_t file_size = fin.tellg();
fin.seekg(0, std::ios::beg);
char* file_content_buffer = new char[file_size];
fin.read(file_content_buffer, file_size);
std::string proto_str(file_content_buffer, file_size);
if (!google::protobuf::TextFormat::ParseFromString(proto_str, &proto_config)) {
LOG(ERROR) << "Failed to load config: " << config_file;
success = false;
}
delete[] file_content_buffer;
fin.close();
}
return success;
}
template<typename T>
bool save_proto_conf(const std::string& config_file, T& proto_config) {
bool success = true;
std::ofstream ofs(config_file, std::ofstream::out);
if (!ofs || ofs.fail()) {
LOG(ERROR) << "open prototxt config failed: " << config_file;
success = false;
} else {
std::string config_str;
success = google::protobuf::TextFormat::PrintToString(proto_config, &config_str);
if (!success) {
return success;
}
ofs << config_str;
}
return success;
}
std::vector<std::string> list_all_model_dirs(std::string path);
}
#endif
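A hedged usage sketch of the two template helpers above; the file path is a placeholder.
// Sketch: load an EvoKitConfig from a prototxt file, tweak one field, save it back.
#include <string>
#include "evo_kit/evo_kit.pb.h"
#include "evo_kit/utils.h"

bool reload_config(const std::string& path) {
    evo_kit::EvoKitConfig config;
    if (!evo_kit::load_proto_conf(path, config)) {
        return false;
    }
    config.mutable_optimizer()->set_base_lr(0.01f);   // example edit only
    return evo_kit::save_proto_conf(path, config);
}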
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
syntax = "proto2";
package evo_kit;
message EvoKitConfig {
//sampling configuration
optional int32 seed = 1 [default = 18];
optional int32 buffer_size = 2 [default = 100000];
optional GaussianSamplingConfig gaussian_sampling = 3;
// Optimizer Configuration
optional OptimizerConfig optimizer = 4;
// AsyncESAgent Configuration
optional AsyncESConfig async_es = 5;
}
message GaussianSamplingConfig {
optional float std = 1 [default = 1.0];
optional bool cached = 2 [default = false];
optional int32 cache_size = 3 [default = 100000];
}
message OptimizerConfig{
optional string type = 1 [default = "SGD"];
optional float base_lr = 2 [default = 1e-3]; // The base learning rate.
optional float momentum = 3 [default = 0.9]; // The momentum value for SGD.
// ------------Adam Optimizer---------
optional float beta1 = 4 [default = 0.9];
optional float beta2 = 5 [default = 0.999];
optional float epsilon = 6 [default = 1e-8];
}
message SamplingInfo{
repeated int32 key = 1;
optional int32 model_iter_id = 2;
}
message AsyncESConfig{
optional string model_warehouse = 1 [default = "./model_warehouse"];
repeated string model_md5 = 2;
optional int32 max_to_keep = 3 [default = 5];
optional int32 model_iter_id = 4 [default = 0];
}
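For reference, proto2 generates plain accessors for the fields above; they are what the samplers, optimizers and factories read. A short sketch (the noted values are the declared defaults when a field is unset):
// Sketch: reading a few EvoKitConfig fields through the generated accessors.
#include <cstdio>
#include <string>
#include "evo_kit/evo_kit.pb.h"

void dump_config(const evo_kit::EvoKitConfig& config) {
    std::string opt_type = config.optimizer().type();             // "SGD" by default
    std::printf("seed=%d std=%f cached=%d type=%s base_lr=%f\n",
                config.seed(),                                    // default 18
                config.gaussian_sampling().std(),                 // default 1.0
                static_cast<int>(config.gaussian_sampling().cached()),
                opt_type.c_str(),
                config.optimizer().base_lr());                    // default 1e-3
}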
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/adam_optimizer.h"
namespace evo_kit {
AdamOptimizer::~AdamOptimizer() {
for (auto iter = _momentum.begin(); iter != _momentum.end(); iter++) {
delete[] iter->second;
}
for (auto iter = _velocity.begin(); iter != _velocity.end(); iter++) {
delete[] iter->second;
}
_momentum.clear();
_velocity.clear();
}
void AdamOptimizer::compute_step(float* gradient, int size, std::string param_name = "") {
if (_momentum.count(param_name) == 0) {
_momentum[param_name] = new float [size];
memset(_momentum[param_name], 0, size * sizeof(float));
}
if (_velocity.count(param_name) == 0) {
_velocity[param_name] = new float [size];
memset(_velocity[param_name], 0, size * sizeof(float));
}
int true_update_times = int(_update_times / _velocity.size());
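// Bias-corrected step size: sqrt(1 - beta2^t) / (1 - beta1^t). It is folded into
// the gradient below; the base class then applies base_lr in update().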
float alpha = std::sqrt(1 - std::pow(_beta2, _update_times)) / (1 - std::pow(_beta1,
_update_times));
for (int i = 0; i < size; ++i) {
_momentum[param_name][i] = _beta1 * _momentum[param_name][i] + (1 - _beta1) * gradient[i];
_velocity[param_name][i] = _beta2 * _velocity[param_name][i] + (1 - _beta2) * gradient[i] *
gradient[i];
gradient[i] = alpha * _momentum[param_name][i] / (std::sqrt(_velocity[param_name][i]) + _epsilon);
}
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/cached_gaussian_sampling.h"
namespace evo_kit {
CachedGaussianSampling::CachedGaussianSampling() {}
CachedGaussianSampling::~CachedGaussianSampling() {
delete[] _noise_cache;
}
bool CachedGaussianSampling::load_config(const EvoKitConfig& config) {
bool success = true;
_std = config.gaussian_sampling().std();
success = set_seed(config.seed());
CHECK(success) << "[EvoKit] Fail to set seed while load config.";
_cache_size = config.gaussian_sampling().cache_size();
_noise_cache = new float [_cache_size];
memset(_noise_cache, 0, _cache_size * sizeof(float));
success = _create_noise_cache();
CHECK(success) << "[EvoKit] Fail to create noise_cache while load config.";
return success;
}
bool CachedGaussianSampling::sampling(int* key, float* noise, int64_t size) {
bool success = true;
if (_noise_cache == nullptr) {
LOG(ERROR) << "[EvoKit] Please use load_config() first.";
success = false;
return success;
}
if (noise == nullptr) {
LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
success = false;
return success;
}
if ((size >= _cache_size) || (size < 0)) {
LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size <<
"), cache_size: " << _cache_size;
success = false;
return success;
}
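// The key is simply a random offset into the pre-generated noise cache;
// resampling() later copies the same slice [key, key + size) back out.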
int rand_key = rand();
std::default_random_engine generator(rand_key);
std::uniform_int_distribution<unsigned int> uniform(0, _cache_size - size);
int index = uniform(generator);
*key = index;
for (int64_t i = 0; i < size; ++i) {
*(noise + i) = *(_noise_cache + index + i);
}
return success;
}
bool CachedGaussianSampling::resampling(int key, float* noise, int64_t size) {
bool success = true;
if (_noise_cache == nullptr) {
LOG(ERROR) << "[EvoKit] Please use load_config() first.";
success = false;
return success;
}
if (noise == nullptr) {
LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
success = false;
return success;
}
if ((size >= _cache_size) || (size < 0)) {
LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size <<
"), cache_size: " << _cache_size;
success = false;
return success;
}
if ((key > _cache_size - size) || (key < 0)) {
LOG(ERROR) << "[EvoKit] Resampling key " << key << " is out of bounds [0, "
<< _cache_size - size <<
"], cache_size: " << _cache_size << ", size: " << size;
success = false;
return success;
}
for (int64_t i = 0; i < size; ++i) {
*(noise + i) = *(_noise_cache + key + i);
}
return success;
}
bool CachedGaussianSampling::_create_noise_cache() {
std::default_random_engine generator(_seed);
std::normal_distribution<float> norm;
for (int64_t i = 0; i < _cache_size; ++i) {
*(_noise_cache + i) = norm(generator) * _std;
}
return true;
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/gaussian_sampling.h"
namespace evo_kit {
bool GaussianSampling::load_config(const EvoKitConfig& config) {
bool success = true;
_std = config.gaussian_sampling().std();
success = set_seed(config.seed());
return success;
}
bool GaussianSampling::sampling(int* key, float* noise, int64_t size) {
bool success = true;
if (noise == nullptr) {
LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
success = false;
return success;
}
int rand_key = rand();
*key = rand_key;
std::default_random_engine generator(rand_key);
std::normal_distribution<float> norm;
for (int64_t i = 0; i < size; ++i) {
*(noise + i) = norm(generator) * _std;
}
return success;
}
bool GaussianSampling::resampling(int key, float* noise, int64_t size) {
bool success = true;
if (noise == nullptr) {
LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr.";
success = false;
} else {
std::default_random_engine generator(key);
std::normal_distribution<float> norm;
for (int64_t i = 0; i < size; ++i) {
*(noise + i) = norm(generator) * _std;
}
}
return success;
}
}
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/optimizer_factory.h"
namespace evo_kit {
std::shared_ptr<Optimizer> create_optimizer(const OptimizerConfig& optimizer_config) {
std::shared_ptr<Optimizer> optimizer;
std::string opt_type = optimizer_config.type();
std::transform(opt_type.begin(), opt_type.end(), opt_type.begin(), ::tolower);
if (opt_type == "sgd") {
optimizer = std::make_shared<SGDOptimizer>(optimizer_config.base_lr(), \
optimizer_config.momentum());
} else if (opt_type == "adam") {
optimizer = std::make_shared<AdamOptimizer>(optimizer_config.base_lr(), \
optimizer_config.beta1(), \
optimizer_config.beta2(), \
optimizer_config.epsilon());
} else {
LOG(ERROR) << "type of OptimizerConfig must be SGD or Adam."; // NotImplementedError
}
return optimizer;
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/sampling_factory.h"
namespace evo_kit {
std::shared_ptr<SamplingMethod> create_sampling_method(const EvoKitConfig& config) {
std::shared_ptr<SamplingMethod> sampling_method;
bool cached = config.gaussian_sampling().cached();
if (cached) {
sampling_method = std::make_shared<CachedGaussianSampling>();
} else {
sampling_method = std::make_shared<GaussianSampling>();
}
bool success = sampling_method->load_config(config);
if (success) {
return sampling_method;
} else {
LOG(ERROR) << "[EvoKit] Fail to create sampling_method";
return nullptr;
}
}
}//namespace
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "evo_kit/sgd_optimizer.h"
namespace evo_kit {
SGDOptimizer::~SGDOptimizer() {
for (auto iter = _velocity.begin(); iter != _velocity.end(); iter++) {
delete[] iter->second;
}
_velocity.clear();
}
void SGDOptimizer::compute_step(float* gradient, int size, std::string param_name = "") {
if (_velocity.count(param_name) == 0) {
_velocity[param_name] = new float [size];
memset(_velocity[param_name], 0, size * sizeof(float));
}
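// Exponential moving average of the gradient (momentum); the result is written
// back into `gradient`, and the base class applies base_lr in update().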
for (int i = 0; i < size; ++i) {
_velocity[param_name][i] = _momentum * _velocity[param_name][i] + (1 - _momentum) * gradient[i];
gradient[i] = _velocity[param_name][i];
}
}
}//namespace
seed: 1024
gaussian_sampling {
std: 0.5
cached: true
cache_size: 100000
}
optimizer {
type: "Adam"
base_lr: 0.05
momentum: 0.9
beta1: 0.9
beta2: 0.999
epsilon: 1e-08
}
async_es {
model_iter_id: 0
}