diff --git a/.copyright.hook b/.copyright.hook index 3be6d0ae5bf352aa08ee44ab2144670f1bf03510..1b0acacb97a1b3059fcc88fb44b6168fa0419473 100644 --- a/.copyright.hook +++ b/.copyright.hook @@ -1,6 +1,5 @@ from __future__ import absolute_import from __future__ import print_function -from __future__ import unicode_literals import argparse import io, re diff --git a/.teamcity/Dockerfile b/.teamcity/Dockerfile index c3d1c209eb04bf7379969a28d0be4ce1bfe10c0d..99eec25ba86ed4d2acf77faf25f14d9092b09595 100644 --- a/.teamcity/Dockerfile +++ b/.teamcity/Dockerfile @@ -18,3 +18,7 @@ FROM parl/parl-test:cuda9.0-cudnn7-v2 COPY ./requirements.txt /root/ + +RUN apt-get install -y libgflags-dev libgoogle-glog-dev libomp-dev unzip +RUN apt-get install -y libgtest-dev && cd /usr/src/gtest && mkdir build \ + && cd build && cmake .. && make && cp libgtest*.a /usr/local/lib diff --git a/.teamcity/build.sh b/.teamcity/build.sh index 6a33424797690bcd088381bd8173ae7d881c2dbc..1f3c0cd20e3dfc0fa3eb378d21d5e490d8afea33 100755 --- a/.teamcity/build.sh +++ b/.teamcity/build.sh @@ -69,7 +69,7 @@ function run_test_with_gpu() { Running unit tests with GPU... ======================================== EOF - ctest --output-on-failure -j10 + ctest --output-on-failure -j20 --verbose cd ${REPO_ROOT} rm -rf ${REPO_ROOT}/build } @@ -90,7 +90,7 @@ function run_test_with_cpu() { ===================================================== EOF if [ $# -eq 1 ];then - ctest --output-on-failure -j10 + ctest --output-on-failure -j20 --verbose else ctest --output-on-failure fi @@ -145,7 +145,8 @@ function main() { ;; test) # test code compability in environments with various python versions - declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37") + #declare -a envs=("py36_torch" "py37_torch" "py27" "py36" "py37") + declare -a envs=("py27" "py36") for env in "${envs[@]}";do cd /work source ~/.bashrc @@ -158,7 +159,7 @@ function main() { echo ======================================== pip install . 
if [ \( $env == "py27" -o $env == "py36" -o $env == "py37" \) ] - then + then pip install -r .teamcity/requirements.txt run_test_with_cpu $env run_test_with_cpu $env "DIS_TESTING_SERIALLY" @@ -169,6 +170,10 @@ function main() { pip install -r .teamcity/requirements_torch.txt run_test_with_cpu $env "DIS_TESTING_TORCH" fi + # clean env + export LC_ALL=C.UTF-8 + export LANG=C.UTF-8 + xparl stop done run_test_with_gpu diff --git a/.teamcity/requirements.txt b/.teamcity/requirements.txt index 354e3632e02ce8e678df2024a6d16657281c1a0e..8ed94543532fee0c02b048a36dba05832ae3d161 100644 --- a/.teamcity/requirements.txt +++ b/.teamcity/requirements.txt @@ -3,4 +3,3 @@ paddlepaddle-gpu==1.6.1.post97 gym details parameterized -timeout_decorator diff --git a/.teamcity/requirements_torch.txt b/.teamcity/requirements_torch.txt index dd2808a12eaab7e3158d09334ffb916917427417..5cdd9ea56ad6cc2db2ecd1fc6f7e046ff84507b7 100644 --- a/.teamcity/requirements_torch.txt +++ b/.teamcity/requirements_torch.txt @@ -2,4 +2,3 @@ gym details parameterized -timeout_decorator diff --git a/.scripts/update_readme_paddle_version.py b/.teamcity/update_readme_paddle_version.py similarity index 94% rename from .scripts/update_readme_paddle_version.py rename to .teamcity/update_readme_paddle_version.py index 56d56914c65956a2bb753bc58269d59034766b1c..901d2d672d9f3eff1021241ac80b6e9f75d0886a 100644 --- a/.scripts/update_readme_paddle_version.py +++ b/.teamcity/update_readme_paddle_version.py @@ -37,7 +37,8 @@ if __name__ == '__main__': exclude_examples = [ 'NeurIPS2019-Learn-to-Move-Challenge', - 'NeurIPS2018-AI-for-Prosthetics-Challenge', 'EagerMode' + 'NeurIPS2018-AI-for-Prosthetics-Challenge', 'LiftSim_baseline', + 'EagerMode' ] for example in os.listdir('../examples/'): if example not in exclude_examples: diff --git a/.teamcity/windows_test.sh b/.teamcity/windows_test.sh new file mode 100755 index 0000000000000000000000000000000000000000..a6d12a6f6c9c212e406f8e900a03c3f4f0cfc44b --- /dev/null +++ b/.teamcity/windows_test.sh @@ -0,0 +1,76 @@ +#!/usr/bin/env bash + +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# NOTE: You need install mingw-cmake. + +function init() { + RED='\033[0;31m' + BLUE='\033[0;34m' + BOLD='\033[1m' + NONE='\033[0m' + + REPO_ROOT=`pwd` +} + + +function abort(){ + echo "Your change doesn't follow PaddlePaddle's code style." 1>&2 + echo "Please use pre-commit to check what is wrong." 1>&2 + exit 1 +} + +function run_test_with_cpu() { + export CUDA_VISIBLE_DEVICES="-1" + + mkdir -p ${REPO_ROOT}/build + cd ${REPO_ROOT}/build + if [ $# -eq 1 ];then + cmake -G "MinGW Makefiles" .. + else + cmake -G "MinGW Makefiles" .. 
-$2=ON + fi + cat < [English](./README.md) | 简体中文 -[**文档**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md) +[**文档**](https://parl.readthedocs.io/en/stable/index.html) > PARL 是一个高性能、灵活的强化学习框架。 # 特点 @@ -48,7 +48,7 @@ class Agent(object): parl.connect('localhost:8037') agent = Agent() agent.say_hello() -ans = agent.sum(1,5) # run remotely and not comsume any local computation resources +ans = agent.sum(1,5) # run remotely and not comsume any local computation resources ``` 两步调度外部的计算资源: 1. 使用`parl.remote_class`修饰一个类,之后这个类就被转化为可以运行在其他CPU或者机器上的类。 @@ -61,8 +61,8 @@ ans = agent.sum(1,5) # run remotely and not comsume any local computation resour # 安装: ### 依赖 -- Python 2.7 or 3.5+. -- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**非必须的**,如果你只用并行部分的接口不需要安装paddle) +- Python 2.7 or 3.5+. (**Windows系统**目前仅支持python3.6+以上的环境) +- [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**非必须的**,如果你只用并行部分的接口不需要安装paddle) ``` @@ -83,6 +83,6 @@ pip install parl - [冠军解决方案:NIPS2018强化学习假肢挑战赛](examples/NeurIPS2018-AI-for-Prosthetics-Challenge/) - [冠军解决方案:NIPS2019强化学习仿生人控制赛事](examples/NeurIPS2019-Learn-to-Move-Challenge/) -NeurlIPS2018 Half-Cheetah Breakout +NeurlIPS2018 Half-Cheetah Breakout
NeurlIPS2018 diff --git a/README.md b/README.md index a5cbdd76a71c01a04c33f79fe701322a57795010..ed8ae1e28a6864e0a1d171a172d17dfe1bc03b8f 100644 --- a/README.md +++ b/README.md @@ -3,7 +3,7 @@

English | [简体中文](./README.cn.md) -[**Documentation**](https://parl.readthedocs.io) | [**中文文档**](docs/zh_CN/Overview.md) +[**Documentation**](https://parl.readthedocs.io/en/stable/index.html) > PARL is a flexible and high-efficient reinforcement learning framework. @@ -64,7 +64,7 @@ For users, they can write code in a simple way, just like writing multi-thread c # Install: ### Dependencies -- Python 2.7 or 3.5+. +- Python 2.7 or 3.5+(On **Windows**, PARL only supprorts the enviroment with python3.6+). - [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) (**Optional**, if you only want to use APIs related to parallelization alone) diff --git a/benchmark/torch/AlphaZero/.pic/good_moves.png b/benchmark/torch/AlphaZero/.pic/good_moves.png new file mode 100644 index 0000000000000000000000000000000000000000..f007fc4a6f2dbc9df9a6a8163de08dcf59cb82dc Binary files /dev/null and b/benchmark/torch/AlphaZero/.pic/good_moves.png differ diff --git a/benchmark/torch/AlphaZero/.pic/perfect_moves.png b/benchmark/torch/AlphaZero/.pic/perfect_moves.png new file mode 100644 index 0000000000000000000000000000000000000000..72c3913ea58498446e92d170255c71606e194fe0 Binary files /dev/null and b/benchmark/torch/AlphaZero/.pic/perfect_moves.png differ diff --git a/benchmark/torch/AlphaZero/Arena.py b/benchmark/torch/AlphaZero/Arena.py new file mode 100644 index 0000000000000000000000000000000000000000..a0791803eb1061485f2f6a647540d9bc9d4f45ee --- /dev/null +++ b/benchmark/torch/AlphaZero/Arena.py @@ -0,0 +1,105 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +from tqdm import tqdm +from parl.utils import logger + + +class Arena(): + """ + An Arena class where any 2 agents can be pit against each other. + """ + + def __init__(self, player1, player2, game, display=None): + """ + Input: + player 1,2: two functions that takes board as input, return action + game: Game object + display: a function that takes board as input and prints it (e.g. + display in othello/OthelloGame). Is necessary for verbose + mode. + + see othello/OthelloPlayers.py for an example. See pit.py for pitting + human players/other baselines with each other. + """ + self.player1 = player1 + self.player2 = player2 + self.game = game + self.display = display + + def playGame(self, verbose=False): + """ + Executes one episode of a game. + + Returns: + either + winner: player who won the game (1 if player1, -1 if player2) + or + draw result returned from the game that is neither 1, -1, nor 0. 
+ """ + players = [self.player2, None, self.player1] + curPlayer = 1 + board = self.game.getInitBoard() + it = 0 + while self.game.getGameEnded(board, curPlayer) == 0: + it += 1 + if verbose: + assert self.display + print("Turn ", str(it), "Player ", str(curPlayer)) + self.display(board) + action = players[curPlayer + 1](self.game.getCanonicalForm( + board, curPlayer)) + + valids = self.game.getValidMoves( + self.game.getCanonicalForm(board, curPlayer), 1) + + if valids[action] == 0: + logger.error('Action {} is not valid!'.format(action)) + logger.debug('valids = {}'.format(valids)) + assert valids[action] > 0 + board, curPlayer = self.game.getNextState(board, curPlayer, action) + if verbose: + assert self.display + print("Game over: Turn ", str(it), "Result ", + str(self.game.getGameEnded(board, 1))) + self.display(board) + return curPlayer * self.game.getGameEnded(board, curPlayer) + + def playGames(self, num, verbose=False): + """ + Plays num games in which player1 starts num/2 games and player2 starts + num/2 games. + + Returns: + oneWon: games won by player1 + twoWon: games won by player2 + draws: games won by nobody + """ + + num = int(num / 2) + oneWon = 0 + twoWon = 0 + draws = 0 + for _ in tqdm(range(num), desc="Arena.playGames (1)"): + gameResult = self.playGame(verbose=verbose) + if gameResult == 1: + oneWon += 1 + elif gameResult == -1: + twoWon += 1 + else: + draws += 1 + + self.player1, self.player2 = self.player2, self.player1 + + for _ in tqdm(range(num), desc="Arena.playGames (2)"): + gameResult = self.playGame(verbose=verbose) + if gameResult == -1: + oneWon += 1 + elif gameResult == 1: + twoWon += 1 + else: + draws += 1 + + return oneWon, twoWon, draws diff --git a/benchmark/torch/AlphaZero/Coach.py b/benchmark/torch/AlphaZero/Coach.py new file mode 100644 index 0000000000000000000000000000000000000000..01394b076db969db42a7277b5d95f82bd661db3d --- /dev/null +++ b/benchmark/torch/AlphaZero/Coach.py @@ -0,0 +1,246 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import sys +import threading +import queue +import pickle +from pickle import Pickler, Unpickler +from random import shuffle +from parl.utils import tensorboard + +import numpy as np +from tqdm import tqdm + +import parl +from parl.utils import logger + +from actor import Actor +from utils import split_group, get_test_dataset +from alphazero_agent import create_agent + + +class Coach(): + """ + This class executes the self-play, learning and evaluating. 
+ """ + + def __init__(self, game, args): + self.game = game + self.args = args + + # neural network of current generation + self.current_agent = create_agent(self.game) + # neural network of previous generation + self.previous_agent = create_agent(self.game) + + # history of examples from args.numItersForTrainExamplesHistory latest iterations + self.trainExamplesHistory = [] + + self.remote_actors_signal_queues = [] + self.remote_actors_return_queue = queue.Queue() + + self.test_dataset = get_test_dataset() + + def _run_remote_tasks(self, signal_queue): + # The remote actor will actually run on the local machine or other machines of xparl cluster + remote_actor = Actor(self.game, self.args) + + while True: + # receive running task signal + # signal: specify task type and task input data (optional) + signal = signal_queue.get() + + if signal["task"] == "self-play": + episode_num_each_actor = self.args.numEps // self.args.actors_num + result = remote_actor.self_play( + self.current_agent.get_weights(), episode_num_each_actor) + self.remote_actors_return_queue.put({"self-play": result}) + + elif signal["task"] == "pitting": + games_num_each_actor = self.args.arenaCompare // self.args.actors_num + result = remote_actor.pitting( + self.previous_agent.get_weights(), + self.current_agent.get_weights(), games_num_each_actor) + self.remote_actors_return_queue.put({"pitting": result}) + + elif signal["task"] == "evaluate_test_dataset": + test_dataset = signal["test_dataset"] + result = remote_actor.evaluate_test_dataset( + self.current_agent.get_weights(), test_dataset) + self.remote_actors_return_queue.put({ + "evaluate_test_dataset": + result + }) + else: + raise NotImplementedError + + def _create_remote_actors(self): + # connect to xparl cluster to submit jobs + parl.connect(self.args.master_address) + + for i in range(self.args.actors_num): + signal_queue = queue.Queue() + self.remote_actors_signal_queues.append(signal_queue) + + remote_thread = threading.Thread( + target=self._run_remote_tasks, args=(signal_queue, )) + remote_thread.setDaemon(True) + remote_thread.start() + + def learn(self): + """Each iteration: + 1. Performs numEps episodes of self-play. + 2. Retrains neural network with examples in trainExamplesHistory + (which has a maximum length of numItersForTrainExamplesHistory). + 3. Evaluates the new neural network with the test dataset. + 4. Pits the new neural network against the old one and accepts it + only if it wins >= updateThreshold fraction of games. + """ + + # create remote actors to run tasks (self-play/pitting/evaluate_test_dataset) in parallel. 
+ self._create_remote_actors() + + for iteration in range(1, self.args.numIters + 1): + logger.info('Starting Iter #{} ...'.format(iteration)) + + #################### + logger.info('Step1: self-play in parallel...') + iterationTrainExamples = [] + # update weights of remote actors to the latest weights, and ask them to run self-play task + for signal_queue in self.remote_actors_signal_queues: + signal_queue.put({"task": "self-play"}) + # wait for all remote actors (a total of self.args.actors_num) to return the self-play results + for _ in range(self.args.actors_num): + result = self.remote_actors_return_queue.get() + iterationTrainExamples.extend(result["self-play"]) + + # save the iteration examples to the history + self.trainExamplesHistory.append(iterationTrainExamples) + if len(self.trainExamplesHistory + ) > self.args.numItersForTrainExamplesHistory: + logger.warning("Removing the oldest entry in trainExamples.") + self.trainExamplesHistory.pop(0) + self.saveTrainExamples(iteration) # backup history to a file + + #################### + logger.info('Step2: train neural network...') + # shuffle examples before training + trainExamples = [] + for e in self.trainExamplesHistory: + trainExamples.extend(e) + shuffle(trainExamples) + + # training new network, keeping a copy of the old one + self.current_agent.save( + os.path.join(self.args.checkpoint, 'temp.pth.tar')) + self.previous_agent.restore( + os.path.join(self.args.checkpoint, 'temp.pth.tar')) + + self.current_agent.learn(trainExamples) + + #################### + logger.info('Step3: evaluate test dataset in parallel...') + cnt = 0 + # update weights of remote actors to the latest weights, and ask them to evaluate assigned test dataset + for i, data in enumerate( + split_group( + self.test_dataset, + len(self.test_dataset) // self.args.actors_num)): + self.remote_actors_signal_queues[i].put({ + "task": + "evaluate_test_dataset", + "test_dataset": + data + }) + cnt += len(data) + perfect_moves_cnt, good_moves_cnt = 0, 0 + # wait for all remote actors (a total of self.args.actors_num) to return the evaluating results + for _ in range(self.args.actors_num): + (perfect_moves, + good_moves) = self.remote_actors_return_queue.get( + )["evaluate_test_dataset"] + perfect_moves_cnt += perfect_moves + good_moves_cnt += good_moves + logger.info('perfect moves rate: {}, good moves rate: {}'.format( + perfect_moves_cnt / cnt, good_moves_cnt / cnt)) + tensorboard.add_scalar('perfect_moves_rate', + perfect_moves_cnt / cnt, iteration) + tensorboard.add_scalar('good_moves_rate', good_moves_cnt / cnt, + iteration) + + #################### + logger.info( + 'Step4: pitting against previous generation in parallel...') + # transfer weights of previous generation and current generation to the remote actors, and ask them to pit. 
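+            # Acceptance rule used below: with the default updateThreshold of 0.6, the new
+            # network is kept only if it wins at least 60% of the decided (non-draw) games.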
+ for signal_queue in self.remote_actors_signal_queues: + signal_queue.put({"task": "pitting"}) + previous_wins, current_wins, draws = 0, 0, 0 + for _ in range(self.args.actors_num): + (pwins_, cwins_, + draws_) = self.remote_actors_return_queue.get()["pitting"] + previous_wins += pwins_ + current_wins += cwins_ + draws += draws_ + + logger.info('NEW/PREV WINS : %d / %d ; DRAWS : %d' % + (current_wins, previous_wins, draws)) + if previous_wins + current_wins == 0 or float(current_wins) / ( + previous_wins + current_wins) < self.args.updateThreshold: + logger.info('REJECTING NEW MODEL') + self.current_agent.restore( + os.path.join(self.args.checkpoint, 'temp.pth.tar')) + else: + logger.info('ACCEPTING NEW MODEL') + self.current_agent.save( + os.path.join(self.args.checkpoint, 'best.pth.tar')) + self.current_agent.save( + os.path.join(self.args.checkpoint, + self.getCheckpointFile(iteration))) + + def getCheckpointFile(self, iteration): + return 'checkpoint_' + str(iteration) + '.pth.tar' + + def saveTrainExamples(self, iteration): + folder = self.args.checkpoint + if not os.path.exists(folder): + os.makedirs(folder) + filename = os.path.join( + folder, + self.getCheckpointFile(iteration) + ".examples") + with open(filename, "wb+") as f: + Pickler(f).dump(self.trainExamplesHistory) + f.closed + + def loadModel(self): + self.current_agent.restore( + os.path.join(self.args.load_folder_file[0], + self.args.load_folder_file[1])) + + def loadTrainExamples(self): + modelFile = os.path.join(self.args.load_folder_file[0], + self.args.load_folder_file[1]) + examplesFile = modelFile + ".examples" + if not os.path.isfile(examplesFile): + logger.warning( + "File {} with trainExamples not found!".format(examplesFile)) + r = input("Continue? [y|n]") + if r != "y": + sys.exit() + else: + logger.info("File with trainExamples found. Loading it...") + with open(examplesFile, "rb") as f: + self.trainExamplesHistory = Unpickler(f).load() + logger.info('Loading done!') diff --git a/benchmark/torch/AlphaZero/MCTS.py b/benchmark/torch/AlphaZero/MCTS.py new file mode 100644 index 0000000000000000000000000000000000000000..b011efe15dbdc10ccbe2c07e6d30b2e2aaa82d9d --- /dev/null +++ b/benchmark/torch/AlphaZero/MCTS.py @@ -0,0 +1,164 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +import math +import time + +import numpy as np + +EPS = 1e-8 + + +class MCTS(): + """ + This class handles the MCTS tree. + """ + + def __init__(self, game, nn_agent, args, dirichlet_noise=False): + self.game = game + self.nn_agent = nn_agent + self.args = args + self.dirichlet_noise = dirichlet_noise + self.Qsa = {} # stores Q values for s,a (as defined in the paper) + self.Nsa = {} # stores #times edge s,a was visited + self.Ns = {} # stores #times board s was visited + self.Ps = {} # stores initial policy (returned by neural net) + + self.Es = {} # stores game.getGameEnded ended for board s + self.Vs = {} # stores game.getValidMoves for board s + + def getActionProb(self, canonicalBoard, temp=1): + """ + This function performs numMCTSSims simulations of MCTS starting from + canonicalBoard. 
+ + Returns: + probs: a policy vector where the probability of the ith action is + proportional to Nsa[(s,a)]**(1./temp) + """ + for i in range(self.args.numMCTSSims): + dir_noise = (i == 0 and self.dirichlet_noise) + self.search(canonicalBoard, dirichlet_noise=dir_noise) + + s = self.game.stringRepresentation(canonicalBoard) + counts = [ + self.Nsa[(s, a)] if (s, a) in self.Nsa else 0 + for a in range(self.game.getActionSize()) + ] + + if temp == 0: + bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten() + bestA = np.random.choice(bestAs) + probs = [0] * len(counts) + probs[bestA] = 1 + return probs + + counts = [x**(1. / temp) for x in counts] + counts_sum = float(sum(counts)) + probs = [x / counts_sum for x in counts] + return probs + + def search(self, canonicalBoard, dirichlet_noise=False): + """ + This function performs one iteration of MCTS. It is recursively called + till a leaf node is found. The action chosen at each node is one that + has the maximum upper confidence bound as in the paper. + + Once a leaf node is found, the neural network is called to return an + initial policy P and a value v for the state. This value is propagated + up the search path. In case the leaf node is a terminal state, the + outcome is propagated up the search path. The values of Ns, Nsa, Qsa are + updated. + + NOTE: the return values are the negative of the value of the current + state. This is done since v is in [-1,1] and if v is the value of a + state for the current player, then its value is -v for the other player. + + Returns: + v: the negative of the value of the current canonicalBoard + """ + + s = self.game.stringRepresentation(canonicalBoard) + + if s not in self.Es: + self.Es[s] = self.game.getGameEnded(canonicalBoard, 1) + if self.Es[s] != 0: + # terminal node + return -self.Es[s] + + if s not in self.Ps: + # leaf node + self.Ps[s], v = self.nn_agent.predict(canonicalBoard) + + valids = self.game.getValidMoves(canonicalBoard, 1) + self.Ps[s] = self.Ps[s] * valids # masking invalid moves + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + if sum_Ps_s > 0: + self.Ps[s] /= sum_Ps_s # renormalize + else: + # if all valid moves were masked make all valid moves equally probable + + # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else. + # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process. + print("All valid moves were masked, doing a workaround.") + self.Ps[s] = self.Ps[s] + valids + self.Ps[s] /= np.sum(self.Ps[s]) + + self.Vs[s] = valids + self.Ns[s] = 0 + return -v + + valids = self.Vs[s] + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + self.Ps[s] /= sum_Ps_s # renormalize + cur_best = -float('inf') + best_act = -1 + + # pick the action with the highest upper confidence bound + for a in range(self.game.getActionSize()): + if valids[a]: + if (s, a) in self.Qsa: + u = self.Qsa[ + (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s]) / (1 + self.Nsa[(s, a)]) + else: + u = self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s] + EPS) # Q = 0 ? 
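+                    # Upper confidence bound computed above (PUCT, as in the AlphaZero paper):
+                    #   u(s, a) = Q(s, a) + cpuct * P(s, a) * sqrt(N(s)) / (1 + N(s, a))
+                    # where Q(s, a) is taken as 0 for an edge that has not been visited yet.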
+ + if u > cur_best: + cur_best = u + best_act = a + + a = best_act + next_s, next_player = self.game.getNextState(canonicalBoard, 1, a) + next_s = self.game.getCanonicalForm(next_s, next_player) + + v = self.search(next_s) + + if (s, a) in self.Qsa: + self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[ + (s, a)] + v) / (self.Nsa[(s, a)] + 1) + self.Nsa[(s, a)] += 1 + + else: + self.Qsa[(s, a)] = v + self.Nsa[(s, a)] = 1 + + self.Ns[s] += 1 + return -v + + def applyDirNoise(self, s, valids): + dir_values = np.random.dirichlet( + [self.args.dirichletAlpha] * np.count_nonzero(valids)) + dir_idx = 0 + for idx in range(len(self.Ps[s])): + if self.Ps[s][idx]: + self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + ( + 0.25 * dir_values[dir_idx]) + dir_idx += 1 diff --git a/benchmark/torch/AlphaZero/README.md b/benchmark/torch/AlphaZero/README.md new file mode 100644 index 0000000000000000000000000000000000000000..72d9c807fb5066c51b49520b8aca3a5e666e133c --- /dev/null +++ b/benchmark/torch/AlphaZero/README.md @@ -0,0 +1,58 @@ +## AlphaZero baseline for Connect4 game (distributed version) +- In this example, we provide a fine-tuned AlphaZero baseline to solve the Connect4 game, based on the code of [alpha-zero-general](https://github.com/suragnair/alpha-zero-general) repo. +- We take advantage of the parallelism capacity of [PARL](https://github.com/PaddlePaddle/PARL) to support running self-play and evaluating tasks in parallel. +- We also provide scripts to pack your well-trained model to a submission file, which can be submitted to the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition directly. + +### Dependencies +- python3 +- [parl==1.3](https://github.com/PaddlePaddle/PARL) +- torch +- tqdm + +### Training +1. Download the [1k connect4 validation set](https://www.kaggle.com/petercnudde/1k-connect4-validation-set) to the current directory. (filename: `refmoves1k_kaggle`) + +2. Start xparl cluster +```bash +# You can change following `cpu_num` and `args.actor_nums` in the main.py +# based on the CPU number of your machine. + +xparl start --port 8010 --cpu_num 25 +``` + +```bash +# [OPTIONAL] You can also run the following script in other machines to add more CPU resource +# to the xparl cluster, so you can increase the parallelism (args.actor_nums). + +xparl connect --address MASTER_IP:8010 --cpu_num [CPU_NUM] +``` + +3. Run training script +```bash +python main.py +``` + +4. Visualize (good moves rate and perfect moves rate) +``` +tensorboard --logdir . +``` + +### Submitting +To submit the well-trained model to the Kaggle, you can use our provided script to generate `submission.py`, for example: +```bash +python gen_submission.py saved_model/best.pth.tar +``` + +### Performance +- Following are `good moves rate` and `perfect moves rate` indicators in tensorbaord, please refer to the [link](https://www.kaggle.com/petercnudde/scoring-connect-x-agents) for specific meaning. + +good moves rate perfect moves rate + +> It takes about 1 day to run 25 iterations on the machine with 25 cpus. + +- It can reach about score 1368 (rank 5 on 2020/06/04) in the Kaggle [Connect X](https://www.kaggle.com/c/connectx/leaderboard) competition. 
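+
+- Before uploading, you can sanity-check the generated `submission.py` locally. A minimal sketch, assuming the `kaggle-environments` package is installed (it is not part of the training dependencies listed above):
+
+```python
+from kaggle_environments import make
+
+env = make("connectx", debug=True)
+# pit the generated submission file against the built-in random agent
+env.run(["submission.py", "random"])
+print(env.render(mode="ansi"))
+```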
+ + +### Reference +- [suragnair/alpha-zero-general](https://github.com/suragnair/alpha-zero-general) +- [Scoring connect-x agents](https://www.kaggle.com/petercnudde/scoring-connect-x-agents) diff --git a/benchmark/torch/AlphaZero/actor.py b/benchmark/torch/AlphaZero/actor.py new file mode 100644 index 0000000000000000000000000000000000000000..5ed719b92d292903f81f7c92a983927bf5c9cab5 --- /dev/null +++ b/benchmark/torch/AlphaZero/actor.py @@ -0,0 +1,165 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import numpy as np +import parl +import os +from alphazero_agent import create_agent +from MCTS import MCTS +from Arena import Arena +from utils import win_loss_draw + + +@parl.remote_class +class Actor(object): + def __init__(self, game, args): + os.environ['OMP_NUM_THREADS'] = "1" + self.game = game + self.args = args + + # neural network of previous generation + self.previous_agent = create_agent(self.game, cuda=False) + # neural network of current generation + self.current_agent = create_agent(self.game, cuda=False) + + # MCTS of previous generation + self.previous_mcts = MCTS( + self.game, self.previous_agent, self.args, dirichlet_noise=True) + # MCTS of current generation + self.current_mcts = MCTS( + self.game, self.current_agent, self.args, dirichlet_noise=True) + + def self_play(self, current_weights, game_num): + """Collecting training data by self-play. 
+ + Args: + current_weights (numpy.array): latest weights of neural network + game_num (int): game number of self-play + + Returns: + train_examples (list): examples of the form (canonicalBoard, currPlayer, pi,v) + """ + + # update weights of current neural network with latest weights + self.current_agent.set_weights(current_weights) + + train_examples = [] + for _ in range(game_num): + # reset node state of MCTS + self.current_mcts = MCTS( + self.game, self.current_agent, self.args, dirichlet_noise=True) + train_examples.extend(self._executeEpisode()) + return train_examples + + def pitting(self, previous_weights, current_weights, games_num): + """Fighting between previous generation agent and current generation agent + + Args: + previous_weights (numpy.array): weights of previous generation neural network + current_weights (numpy.array): weights of current generation neural network + game_num (int): game number of fighting + + Returns: + tuple of (game number of previous agent won, game number of current agent won, game number of draw) + """ + # update weights of previous and current neural network + self.previous_agent.set_weights(previous_weights) + self.current_agent.set_weights(current_weights) + + # reset node state of MCTS + self.previous_mcts = MCTS(self.game, self.previous_agent, self.args) + self.current_mcts = MCTS(self.game, self.current_agent, self.args) + + arena = Arena( + lambda x: np.argmax(self.previous_mcts.getActionProb(x, temp=0)), + lambda x: np.argmax(self.current_mcts.getActionProb(x, temp=0)), + self.game) + previous_wins, current_wins, draws = arena.playGames(games_num) + + return (previous_wins, current_wins, draws) + + def evaluate_test_dataset(self, current_weights, test_dataset): + """Evaluate performance of latest neural nerwork + + Args: + current_weights (numpy.array): latest weights of neural network + test_dataset (list): game number of self-play + + Returns: + tuple of (number of perfect moves, number of good moves) + """ + # update weights of current neural network with latest weights + self.current_agent.set_weights(current_weights) + + perfect_move_count, good_move_count = 0, 0 + for data in test_dataset: + self.current_mcts = MCTS(self.game, self.current_agent, self.args) + + x = self.game.getCanonicalForm(data['board'], data['player']) + agent_move = int( + np.argmax(self.current_mcts.getActionProb(x, temp=0))) + + moves = data["move_score"] + perfect_score = max(moves) + perfect_moves = [i for i in range(7) if moves[i] == perfect_score] + + if agent_move in perfect_moves: + perfect_move_count += 1 + if win_loss_draw( + moves[agent_move]) == win_loss_draw(perfect_score): + good_move_count += 1 + + return (perfect_move_count, good_move_count) + + def _executeEpisode(self): + """ + + This function executes one episode of self-play, starting with player 1. + As the game goes on, each turn is added as a training example to + trainExamples. The game is played till the game ends. After the game + ends, the outcome of the game is used to assign values to each example + in trainExamples. + + It uses a temp=1 if episodeStep < tempThresholdStep, and thereafter + uses temp=0. + + Returns: + trainExamples: a list of examples of the form (canonicalBoard, currPlayer, pi,v) + pi is the MCTS informed policy vector, v is +1 if + the player eventually won the game, else -1. 
+ """ + trainExamples = [] + board = self.game.getInitBoard() + self.curPlayer = 1 + episodeStep = 0 + + while True: + episodeStep += 1 + canonicalBoard = self.game.getCanonicalForm(board, self.curPlayer) + temp = int(episodeStep < self.args.tempThresholdStep) + + pi = self.current_mcts.getActionProb(canonicalBoard, temp=temp) + sym = self.game.getSymmetries(canonicalBoard, pi) + for b, p in sym: # board, pi + trainExamples.append([b, self.curPlayer, p, None]) + + action = np.random.choice(len(pi), p=pi) + board, self.curPlayer = self.game.getNextState( + board, self.curPlayer, action) + + r = self.game.getGameEnded(board, self.curPlayer) + + if r != 0: + return [(x[0], x[2], r * ((-1)**(x[1] != self.curPlayer))) + for x in trainExamples] diff --git a/benchmark/torch/AlphaZero/alphazero_agent.py b/benchmark/torch/AlphaZero/alphazero_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..9e7e497e4818f30ae8d71bee109f4ff6f9795962 --- /dev/null +++ b/benchmark/torch/AlphaZero/alphazero_agent.py @@ -0,0 +1,150 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import numpy as np +import parl +import torch +import torch.optim as optim + +from tqdm import tqdm +from utils import * +from connect4_model import Connect4Model + +args = dotdict({ + 'lr': 0.001, + 'dropout': 0.3, + 'epochs': 5, + 'batch_size': 64, + 'num_channels': 64, +}) + + +class AlphaZero(parl.Algorithm): + def __init__(self, model): + self.model = model + + def learn(self, boards, target_pis, target_vs, optimizer): + self.model.train() # train mode + + # compute model output + out_log_pi, out_v = self.model(boards) + + pi_loss = -torch.sum(target_pis * out_log_pi) / target_pis.size()[0] + + v_loss = torch.sum( + (target_vs - out_v.view(-1))**2) / target_vs.size()[0] + + total_loss = pi_loss + v_loss + + # compute gradient and do SGD step + optimizer.zero_grad() + total_loss.backward() + optimizer.step() + + return total_loss, pi_loss, v_loss + + def predict(self, board): + self.model.eval() # eval mode + + with torch.no_grad(): + log_pi, v = self.model(board) + + pi = torch.exp(log_pi) + return pi, v + + +def create_agent(game, cuda=True): + cuda = cuda and torch.cuda.is_available() + + model = Connect4Model(game, args) + if cuda: + model.cuda() + + algorithm = AlphaZero(model) + + alphazero_agent = AlphaZeroAgent(algorithm, game, cuda) + return alphazero_agent + + +class AlphaZeroAgent(parl.Agent): + def __init__(self, algorithm, game, cuda): + super(AlphaZeroAgent, self).__init__(algorithm) + self.cuda = cuda + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + + def learn(self, examples): + """ + Args: + examples: list of examples, each example is of form (board, pi, v) + """ + optimizer = optim.Adam(self.algorithm.model.parameters(), lr=args.lr) + + for epoch in range(args.epochs): + print('EPOCH ::: ' + str(epoch + 1)) + + batch_count = int(len(examples) / args.batch_size) + + pbar = 
tqdm(range(batch_count), desc='Training Net') + for _ in pbar: + sample_ids = np.random.randint( + len(examples), size=args.batch_size) + boards, pis, vs = list(zip(*[examples[i] for i in sample_ids])) + boards = torch.FloatTensor(np.array(boards).astype(np.float64)) + target_pis = torch.FloatTensor(np.array(pis)) + target_vs = torch.FloatTensor(np.array(vs).astype(np.float64)) + + if self.cuda: + boards, target_pis, target_vs = boards.contiguous().cuda( + ), target_pis.contiguous().cuda(), target_vs.contiguous( + ).cuda() + + total_loss, pi_loss, v_loss = self.algorithm.learn( + boards, target_pis, target_vs, optimizer) + + # record loss with tqdm + pbar.set_postfix(Loss_pi=pi_loss.item(), Loss_v=v_loss.item()) + + def predict(self, board): + """ + Args: + board (np.array): input board + + Return: + pi (np.array): probability of actions + v (np.array): estimated value of input + """ + # preparing input + board = torch.FloatTensor(board.astype(np.float64)) + if self.cuda: + board = board.contiguous().cuda() + board = board.view(1, self.board_x, self.board_y) + + pi, v = self.algorithm.predict(board) + + return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0] + + +def create_agent(game, cuda=True): + cuda = cuda and torch.cuda.is_available() + + model = Connect4Model(game, args) + if cuda: + model.cuda() + + algorithm = AlphaZero(model) + + alphazero_agent = AlphaZeroAgent(algorithm, game, cuda) + return alphazero_agent diff --git a/benchmark/torch/AlphaZero/connect4_game.py b/benchmark/torch/AlphaZero/connect4_game.py new file mode 100644 index 0000000000000000000000000000000000000000..c10e8ca4afbca839ef71b18fd8f39f7493f30a4d --- /dev/null +++ b/benchmark/torch/AlphaZero/connect4_game.py @@ -0,0 +1,239 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +import numpy as np +from collections import namedtuple + +DEFAULT_HEIGHT = 6 +DEFAULT_WIDTH = 7 +DEFAULT_WIN_LENGTH = 4 + +WinState = namedtuple('WinState', 'is_ended winner') + + +class Board(): + """ + Connect4 Board. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + "Set up initial board configuration." + self.height = height or DEFAULT_HEIGHT + self.width = width or DEFAULT_WIDTH + self.win_length = win_length or DEFAULT_WIN_LENGTH + + if np_pieces is None: + self.np_pieces = np.zeros([self.height, self.width], dtype=np.int) + else: + self.np_pieces = np_pieces + assert self.np_pieces.shape == (self.height, self.width) + + def add_stone(self, column, player): + "Create copy of board containing new stone." + available_idx, = np.where(self.np_pieces[:, column] == 0) + if len(available_idx) == 0: + raise ValueError( + "Can't play column %s on board %s" % (column, self)) + + self.np_pieces[available_idx[-1]][column] = player + + def get_valid_moves(self): + "Any zero value in top row in a valid move" + return self.np_pieces[0] == 0 + + def get_win_state(self): + for player in [-1, 1]: + player_pieces = self.np_pieces == -player + # Check rows & columns for win + if (self._is_straight_winner(player_pieces) + or self._is_straight_winner(player_pieces.transpose()) + or self._is_diagonal_winner(player_pieces)): + return WinState(True, -player) + + # draw has very little value. + if not self.get_valid_moves().any(): + return WinState(True, None) + + # Game is not ended yet. 
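+        # WinState examples: WinState(True, 1) means player 1 has won, WinState(True, -1)
+        # means player -1 has won, WinState(True, None) is a draw, and the value returned
+        # below, WinState(False, None), means the game is still in progress.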
+ return WinState(False, None) + + def with_np_pieces(self, np_pieces): + """Create copy of board with specified pieces.""" + if np_pieces is None: + np_pieces = self.np_pieces + return Board(self.height, self.width, self.win_length, np_pieces) + + def _is_diagonal_winner(self, player_pieces): + """Checks if player_pieces contains a diagonal win.""" + win_length = self.win_length + for i in range(len(player_pieces) - win_length + 1): + for j in range(len(player_pieces[0]) - win_length + 1): + if all(player_pieces[i + x][j + x] for x in range(win_length)): + return True + for j in range(win_length - 1, len(player_pieces[0])): + if all(player_pieces[i + x][j - x] for x in range(win_length)): + return True + return False + + def _is_straight_winner(self, player_pieces): + """Checks if player_pieces contains a vertical or horizontal win.""" + run_lengths = [ + player_pieces[:, i:i + self.win_length].sum(axis=1) + for i in range(len(player_pieces) - self.win_length + 2) + ] + return max([x.max() for x in run_lengths]) >= self.win_length + + def __str__(self): + return str(self.np_pieces) + + +class Connect4Game(object): + """ + Connect4 Game class implementing the alpha-zero-general Game interface. + + Use 1 for player1 and -1 for player2. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + self._base_board = Board(height, width, win_length, np_pieces) + + def getInitBoard(self): + """ + Returns: + startBoard: a representation of the board (ideally this is the form + that will be the input to your neural network) + """ + return self._base_board.np_pieces + + def getBoardSize(self): + """ + Returns: + (x,y): a tuple of board dimensions + """ + return (self._base_board.height, self._base_board.width) + + def getActionSize(self): + """ + Returns: + actionSize: number of all possible actions + """ + return self._base_board.width + + def getNextState(self, board, player, action): + """Returns a copy of the board with updated move, original board is unmodified. + + Input: + board: current board + player: current player (1 or -1) + action: action taken by current player + + Returns: + nextBoard: board after applying action + nextPlayer: player who plays in the next turn (should be -player) + + """ + b = self._base_board.with_np_pieces(np_pieces=np.copy(board)) + b.add_stone(action, player) + return b.np_pieces, -player + + def getValidMoves(self, board, player): + """Any zero value in top row in a valid move. + + Input: + board: current board + player: current player + + Returns: + validMoves: a binary vector of length self.getActionSize(), 1 for + moves that are valid from the current board and player, + 0 for invalid moves + """ + return self._base_board.with_np_pieces( + np_pieces=board).get_valid_moves() + + def getGameEnded(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + r: 0 if game has not ended. 1 if player won, -1 if player lost, + small non-zero value for draw. + + """ + b = self._base_board.with_np_pieces(np_pieces=board) + winstate = b.get_win_state() + if winstate.is_ended: + if winstate.winner is None: + # draw has very little value. + return 1e-4 + elif winstate.winner == player: + return +1 + elif winstate.winner == -player: + return -1 + else: + raise ValueError('Unexpected winstate found: ', winstate) + else: + # 0 used to represent unfinished game. 
+ return 0 + + def getCanonicalForm(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + canonicalBoard: returns canonical form of board. The canonical form + should be independent of player. For e.g. in chess, + the canonical form can be chosen to be from the pov + of white. When the player is white, we can return + board as is. When the player is black, we can invert + the colors and return the board. + """ + return board * player + + def getSymmetries(self, board, pi): + """Board is left/right board symmetric + + Input: + board: current board + pi: policy vector of size self.getActionSize() + + Returns: + symmForms: a list of [(board,pi)] where each tuple is a symmetrical + form of the board and the corresponding pi vector. This + is used when training the neural network from examples. + """ + return [(board, pi), + (np.array(board[:, ::-1], copy=True), + np.array(pi[::-1], copy=True))] + + def stringRepresentation(self, board): + """ + Input: + board: current board + + Returns: + boardString: a quick conversion of board to a string format. + Required by MCTS for hashing. + """ + return board.tostring() + + @staticmethod + def display(board): + print(" -----------------------") + print(' '.join(map(str, range(len(board[0]))))) + print(board) + print(" -----------------------") diff --git a/benchmark/torch/AlphaZero/connect4_model.py b/benchmark/torch/AlphaZero/connect4_model.py new file mode 100644 index 0000000000000000000000000000000000000000..6c0f7705bfc40d1645d77c79ac7e47f1f721a317 --- /dev/null +++ b/benchmark/torch/AlphaZero/connect4_model.py @@ -0,0 +1,86 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
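+
+# The model below is a small policy-value network for the Connect4 board: four 3x3
+# convolutional layers (the last two, without padding, shrink the board by 4 in each
+# direction), two fully-connected layers, and two heads that output log-softmax action
+# probabilities over the columns and a tanh value estimate in [-1, 1].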
+ +import parl + +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + + +class Connect4Model(parl.Model): + def __init__(self, game, args): + # game params + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + self.args = args + + super(Connect4Model, self).__init__() + self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1) + self.conv2 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1, padding=1) + self.conv3 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + self.conv4 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + + self.bn1 = nn.BatchNorm2d(args.num_channels) + self.bn2 = nn.BatchNorm2d(args.num_channels) + self.bn3 = nn.BatchNorm2d(args.num_channels) + self.bn4 = nn.BatchNorm2d(args.num_channels) + + self.fc1 = nn.Linear( + args.num_channels * (self.board_x - 4) * (self.board_y - 4), 128) + self.fc_bn1 = nn.BatchNorm1d(128) + + self.fc2 = nn.Linear(128, 64) + self.fc_bn2 = nn.BatchNorm1d(64) + + self.fc3 = nn.Linear(64, self.action_size) + + self.fc4 = nn.Linear(64, 1) + + def forward(self, s): + """ + Args: + s(torch.Tensor): batch_size x board_x x board_y + """ + # batch_size x 1 x board_x x board_y + s = s.view(-1, 1, self.board_x, self.board_y) + # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn1(self.conv1(s))) + # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn2(self.conv2(s))) + # batch_size x num_channels x (board_x-2) x (board_y-2) + s = F.relu(self.bn3(self.conv3(s))) + # batch_size x num_channels x (board_x-4) x (board_y-4) + s = F.relu(self.bn4(self.conv4(s))) + s = s.view( + -1, + self.args.num_channels * (self.board_x - 4) * (self.board_y - 4)) + + s = F.dropout( + F.relu(self.fc_bn1(self.fc1(s))), + p=self.args.dropout, + training=self.training) # batch_size x 128 + s = F.dropout( + F.relu(self.fc_bn2(self.fc2(s))), + p=self.args.dropout, + training=self.training) # batch_size x 64 + + pi = self.fc3(s) # batch_size x action_size + v = self.fc4(s) # batch_size x 1 + + return F.log_softmax(pi, dim=1), torch.tanh(v) diff --git a/parl/framework/model_base.py b/benchmark/torch/AlphaZero/gen_submission.py similarity index 50% rename from parl/framework/model_base.py rename to benchmark/torch/AlphaZero/gen_submission.py index e4057a7706c2e26e66db340128679919290cb1bd..03728ec2cda4f155229ba7b4d18c7f2a22734e05 100644 --- a/parl/framework/model_base.py +++ b/benchmark/torch/AlphaZero/gen_submission.py @@ -12,13 +12,29 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings +import sys +import base64 +import inspect +import os -warnings.simplefilter('default') +assert len(sys.argv) == 2, "please specify model path." 
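+# Example usage (as in the README): python gen_submission.py saved_model/best.pth.tar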
+model_path = sys.argv[1] -warnings.warn( - "module `parl.framework.model_base.Model` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.Model` instead.", - DeprecationWarning, - stacklevel=2) +with open(model_path, 'rb') as f: + raw_bytes = f.read() + encoded_weights = base64.encodebytes(raw_bytes) -from parl.core.fluid.model import * +# encode weights of model to byte string +submission_file = """ +import base64 +decoded = base64.b64decode({}) + +""".format(encoded_weights) + +# insert code snippet of loading weights +with open('submission_template.py', 'r') as f: + submission_file += ''.join(f.readlines()) + +# generate final submission file +with open('submission.py', 'w') as f: + f.write(submission_file) diff --git a/benchmark/torch/AlphaZero/main.py b/benchmark/torch/AlphaZero/main.py new file mode 100644 index 0000000000000000000000000000000000000000..433e2ff0efb35e6a39df53a845a25a8110b20993 --- /dev/null +++ b/benchmark/torch/AlphaZero/main.py @@ -0,0 +1,78 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from Coach import Coach +from connect4_game import Connect4Game +from utils import * + +from parl.utils import logger + +args = dotdict({ + # master address of xparl cluster + 'master_address': 'localhost:8010', + # number of remote actors (execute tasks [self-play/pitting/evaluate_test_dataset] in parallel). + 'actors_num': 25, + + # total number of iteration + 'numIters': 200, + # Number of complete self-play games to simulate during a new iteration. + 'numEps': 500, + # Number of games to play during arena (pitting) play to determine if new neural network will be accepted. + 'arenaCompare': 50, + # Number of games moves for MCTS to simulate. + 'numMCTSSims': 800, + # temp=1 (Temperature, τ (tau)) if episodeStep < tempThresholdStep, and thereafter uses temp=0. + 'tempThresholdStep': 15, + # During arena playoff, new neural net will be accepted if threshold or more of games are won. + 'updateThreshold': 0.6, + # CPUCT parameter + 'cpuct': 4, + # alpha parameter of dirichlet noise which is added to the policy (pi) + 'dirichletAlpha': 1.0, + # history of examples from numItersForTrainExamplesHistory latest iterations (training data) + 'numItersForTrainExamplesHistory': 20, + + # folder to save model and training examples + 'checkpoint': './saved_model/', + # whether to load saved model and training examples + 'load_model': False, + 'load_folder_file': ('./saved_model', 'checkpoint_1.pth.tar'), +}) + +# Plays arenaCompare games in which player1 starts arenaCompare/2 games and player2 starts arenaCompare/2 games. 
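+# For example, with the default settings above (numEps=500, arenaCompare=50, actors_num=25),
+# each remote actor runs 500 / 25 = 20 self-play games, 50 / 25 = 2 arena games (one starting
+# as each player), and evaluates 1000 / 25 = 40 boards of the test dataset.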
+assert args.arenaCompare % 2 == 0 + +# make sure the tasks can be split evenly among different remote actors +assert args.numEps % args.actors_num == 0 +assert (args.arenaCompare // 2) % args.actors_num == 0 +assert 1000 % args.actors_num == 0 # there are 1000 boards state in test_dataset + + +def main(): + game = Connect4Game() + + c = Coach(game, args) + + if args.load_model: + logger.info('Loading checkpoint {}...'.format(args.load_folder_file)) + c.loadModel() + logger.info("Loading 'trainExamples' from file {}...".format( + args.load_folder_file)) + c.loadTrainExamples() + + c.learn() + + +if __name__ == "__main__": + main() diff --git a/benchmark/torch/AlphaZero/submission_template.py b/benchmark/torch/AlphaZero/submission_template.py new file mode 100644 index 0000000000000000000000000000000000000000..d9ba9e7eb85b0815403d98ae015c80f07f068334 --- /dev/null +++ b/benchmark/torch/AlphaZero/submission_template.py @@ -0,0 +1,559 @@ +# Third party code +# +# The following code are copied or modified from: +# https://github.com/suragnair/alpha-zero-general + +import os +os.environ['OMP_NUM_THREADS'] = "1" + + +# ===== utils.py ===== +class dotdict(dict): + def __getattr__(self, name): + return self[name] + + +# ===== MCTS.py ====== +import math +import time +import numpy as np + +EPS = 1e-8 + + +class MCTS(): + """ + This class handles the MCTS tree. + """ + + def __init__(self, game, nn_agent, args, dirichlet_noise=False): + self.game = game + self.nn_agent = nn_agent + self.args = args + self.dirichlet_noise = dirichlet_noise + self.Qsa = {} # stores Q values for s,a (as defined in the paper) + self.Nsa = {} # stores #times edge s,a was visited + self.Ns = {} # stores #times board s was visited + self.Ps = {} # stores initial policy (returned by neural net) + + self.Es = {} # stores game.getGameEnded ended for board s + self.Vs = {} # stores game.getValidMoves for board s + + def getActionProb(self, canonicalBoard, temp=1, timelimit=4.9): + """ + This function performs numMCTSSims simulations of MCTS starting from + canonicalBoard. + + Returns: + probs: a policy vector where the probability of the ith action is + proportional to Nsa[(s,a)]**(1./temp) + """ + dir_noise = self.dirichlet_noise + start_time = time.time() + while time.time() - start_time < timelimit: + self.search(canonicalBoard, dirichlet_noise=dir_noise) + + s = self.game.stringRepresentation(canonicalBoard) + counts = [ + self.Nsa[(s, a)] if (s, a) in self.Nsa else 0 + for a in range(self.game.getActionSize()) + ] + + if temp == 0: + bestAs = np.array(np.argwhere(counts == np.max(counts))).flatten() + bestA = np.random.choice(bestAs) + probs = [0] * len(counts) + probs[bestA] = 1 + return probs + + counts = [x**(1. / temp) for x in counts] + counts_sum = float(sum(counts)) + probs = [x / counts_sum for x in counts] + return probs + + def search(self, canonicalBoard, dirichlet_noise=False): + """ + This function performs one iteration of MCTS. It is recursively called + till a leaf node is found. The action chosen at each node is one that + has the maximum upper confidence bound as in the paper. + + Once a leaf node is found, the neural network is called to return an + initial policy P and a value v for the state. This value is propagated + up the search path. In case the leaf node is a terminal state, the + outcome is propagated up the search path. The values of Ns, Nsa, Qsa are + updated. + + NOTE: the return values are the negative of the value of the current + state. 
This is done since v is in [-1,1] and if v is the value of a + state for the current player, then its value is -v for the other player. + + Returns: + v: the negative of the value of the current canonicalBoard + """ + + s = self.game.stringRepresentation(canonicalBoard) + + if s not in self.Es: + self.Es[s] = self.game.getGameEnded(canonicalBoard, 1) + if self.Es[s] != 0: + # terminal node + return -self.Es[s] + + if s not in self.Ps: + # leaf node + self.Ps[s], v = self.nn_agent.predict(canonicalBoard) + + valids = self.game.getValidMoves(canonicalBoard, 1) + self.Ps[s] = self.Ps[s] * valids # masking invalid moves + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + if sum_Ps_s > 0: + self.Ps[s] /= sum_Ps_s # renormalize + else: + # if all valid moves were masked make all valid moves equally probable + + # NB! All valid moves may be masked if either your NNet architecture is insufficient or you've get overfitting or something else. + # If you have got dozens or hundreds of these messages you should pay attention to your NNet and/or training process. + print("All valid moves were masked, doing a workaround.") + self.Ps[s] = self.Ps[s] + valids + self.Ps[s] /= np.sum(self.Ps[s]) + + self.Vs[s] = valids + self.Ns[s] = 0 + return -v + + valids = self.Vs[s] + if dirichlet_noise: + self.applyDirNoise(s, valids) + sum_Ps_s = np.sum(self.Ps[s]) + self.Ps[s] /= sum_Ps_s # renormalize + cur_best = -float('inf') + best_act = -1 + + # pick the action with the highest upper confidence bound + for a in range(self.game.getActionSize()): + if valids[a]: + if (s, a) in self.Qsa: + u = self.Qsa[ + (s, a)] + self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s]) / (1 + self.Nsa[(s, a)]) + else: + u = self.args.cpuct * self.Ps[s][a] * math.sqrt( + self.Ns[s] + EPS) # Q = 0 ? + + if u > cur_best: + cur_best = u + best_act = a + + a = best_act + next_s, next_player = self.game.getNextState(canonicalBoard, 1, a) + next_s = self.game.getCanonicalForm(next_s, next_player) + + v = self.search(next_s) + + if (s, a) in self.Qsa: + self.Qsa[(s, a)] = (self.Nsa[(s, a)] * self.Qsa[ + (s, a)] + v) / (self.Nsa[(s, a)] + 1) + self.Nsa[(s, a)] += 1 + + else: + self.Qsa[(s, a)] = v + self.Nsa[(s, a)] = 1 + + self.Ns[s] += 1 + return -v + + def applyDirNoise(self, s, valids): + dir_values = np.random.dirichlet( + [self.args.dirichletAlpha] * np.count_nonzero(valids)) + dir_idx = 0 + for idx in range(len(self.Ps[s])): + if self.Ps[s][idx]: + self.Ps[s][idx] = (0.75 * self.Ps[s][idx]) + ( + 0.25 * dir_values[dir_idx]) + dir_idx += 1 + + +# ===== connect4_game.py ====== +import numpy as np +from collections import namedtuple + +DEFAULT_HEIGHT = 6 +DEFAULT_WIDTH = 7 +DEFAULT_WIN_LENGTH = 4 + +WinState = namedtuple('WinState', 'is_ended winner') + + +class Board(): + """ + Connect4 Board. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + "Set up initial board configuration." + self.height = height or DEFAULT_HEIGHT + self.width = width or DEFAULT_WIDTH + self.win_length = win_length or DEFAULT_WIN_LENGTH + + if np_pieces is None: + self.np_pieces = np.zeros([self.height, self.width], dtype=np.int) + else: + self.np_pieces = np_pieces + assert self.np_pieces.shape == (self.height, self.width) + + def add_stone(self, column, player): + "Create copy of board containing new stone." 
+ available_idx, = np.where(self.np_pieces[:, column] == 0) + if len(available_idx) == 0: + raise ValueError( + "Can't play column %s on board %s" % (column, self)) + + self.np_pieces[available_idx[-1]][column] = player + + def get_valid_moves(self): + "Any zero value in top row in a valid move" + return self.np_pieces[0] == 0 + + def get_win_state(self): + for player in [-1, 1]: + player_pieces = self.np_pieces == -player + # Check rows & columns for win + if (self._is_straight_winner(player_pieces) + or self._is_straight_winner(player_pieces.transpose()) + or self._is_diagonal_winner(player_pieces)): + return WinState(True, -player) + + # draw has very little value. + if not self.get_valid_moves().any(): + return WinState(True, None) + + # Game is not ended yet. + return WinState(False, None) + + def with_np_pieces(self, np_pieces): + """Create copy of board with specified pieces.""" + if np_pieces is None: + np_pieces = self.np_pieces + return Board(self.height, self.width, self.win_length, np_pieces) + + def _is_diagonal_winner(self, player_pieces): + """Checks if player_pieces contains a diagonal win.""" + win_length = self.win_length + for i in range(len(player_pieces) - win_length + 1): + for j in range(len(player_pieces[0]) - win_length + 1): + if all(player_pieces[i + x][j + x] for x in range(win_length)): + return True + for j in range(win_length - 1, len(player_pieces[0])): + if all(player_pieces[i + x][j - x] for x in range(win_length)): + return True + return False + + def _is_straight_winner(self, player_pieces): + """Checks if player_pieces contains a vertical or horizontal win.""" + run_lengths = [ + player_pieces[:, i:i + self.win_length].sum(axis=1) + for i in range(len(player_pieces) - self.win_length + 2) + ] + return max([x.max() for x in run_lengths]) >= self.win_length + + def __str__(self): + return str(self.np_pieces) + + +class Connect4Game(object): + """ + Connect4 Game class implementing the alpha-zero-general Game interface. + + Use 1 for player1 and -1 for player2. + """ + + def __init__(self, + height=None, + width=None, + win_length=None, + np_pieces=None): + self._base_board = Board(height, width, win_length, np_pieces) + + def getInitBoard(self): + """ + Returns: + startBoard: a representation of the board (ideally this is the form + that will be the input to your neural network) + """ + return self._base_board.np_pieces + + def getBoardSize(self): + """ + Returns: + (x,y): a tuple of board dimensions + """ + return (self._base_board.height, self._base_board.width) + + def getActionSize(self): + """ + Returns: + actionSize: number of all possible actions + """ + return self._base_board.width + + def getNextState(self, board, player, action): + """Returns a copy of the board with updated move, original board is unmodified. + + Input: + board: current board + player: current player (1 or -1) + action: action taken by current player + + Returns: + nextBoard: board after applying action + nextPlayer: player who plays in the next turn (should be -player) + + """ + b = self._base_board.with_np_pieces(np_pieces=np.copy(board)) + b.add_stone(action, player) + return b.np_pieces, -player + + def getValidMoves(self, board, player): + """Any zero value in top row in a valid move. 
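+        (A column is playable as long as its top cell is still empty.)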
+ + Input: + board: current board + player: current player + + Returns: + validMoves: a binary vector of length self.getActionSize(), 1 for + moves that are valid from the current board and player, + 0 for invalid moves + """ + return self._base_board.with_np_pieces( + np_pieces=board).get_valid_moves() + + def getGameEnded(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + r: 0 if game has not ended. 1 if player won, -1 if player lost, + small non-zero value for draw. + + """ + b = self._base_board.with_np_pieces(np_pieces=board) + winstate = b.get_win_state() + if winstate.is_ended: + if winstate.winner is None: + # draw has very little value. + return 1e-4 + elif winstate.winner == player: + return +1 + elif winstate.winner == -player: + return -1 + else: + raise ValueError('Unexpected winstate found: ', winstate) + else: + # 0 used to represent unfinished game. + return 0 + + def getCanonicalForm(self, board, player): + """ + Input: + board: current board + player: current player (1 or -1) + + Returns: + canonicalBoard: returns canonical form of board. The canonical form + should be independent of player. For e.g. in chess, + the canonical form can be chosen to be from the pov + of white. When the player is white, we can return + board as is. When the player is black, we can invert + the colors and return the board. + """ + return board * player + + def getSymmetries(self, board, pi): + """Board is left/right board symmetric + + Input: + board: current board + pi: policy vector of size self.getActionSize() + + Returns: + symmForms: a list of [(board,pi)] where each tuple is a symmetrical + form of the board and the corresponding pi vector. This + is used when training the neural network from examples. + """ + return [(board, pi), + (np.array(board[:, ::-1], copy=True), + np.array(pi[::-1], copy=True))] + + def stringRepresentation(self, board): + """ + Input: + board: current board + + Returns: + boardString: a quick conversion of board to a string format. + Required by MCTS for hashing. 
+ """ + return board.tostring() + + @staticmethod + def display(board): + print(" -----------------------") + print(' '.join(map(str, range(len(board[0]))))) + print(board) + print(" -----------------------") + + +# ===== connect4_model ====== +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim + + +#class Connect4Model(parl.Model): # Kaggle doesn't support parl package +class Connect4Model(nn.Module): + def __init__(self, game, args): + # game params + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + self.args = args + + super(Connect4Model, self).__init__() + self.conv1 = nn.Conv2d(1, args.num_channels, 3, stride=1, padding=1) + self.conv2 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1, padding=1) + self.conv3 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + self.conv4 = nn.Conv2d( + args.num_channels, args.num_channels, 3, stride=1) + + self.bn1 = nn.BatchNorm2d(args.num_channels) + self.bn2 = nn.BatchNorm2d(args.num_channels) + self.bn3 = nn.BatchNorm2d(args.num_channels) + self.bn4 = nn.BatchNorm2d(args.num_channels) + + self.fc1 = nn.Linear( + args.num_channels * (self.board_x - 4) * (self.board_y - 4), 128) + self.fc_bn1 = nn.BatchNorm1d(128) + + self.fc2 = nn.Linear(128, 64) + self.fc_bn2 = nn.BatchNorm1d(64) + + self.fc3 = nn.Linear(64, self.action_size) + + self.fc4 = nn.Linear(64, 1) + + def forward(self, s): + # s: batch_size x board_x x board_y + s = s.view(-1, 1, self.board_x, + self.board_y) # batch_size x 1 x board_x x board_y + s = F.relu(self.bn1( + self.conv1(s))) # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn2( + self.conv2(s))) # batch_size x num_channels x board_x x board_y + s = F.relu(self.bn3(self.conv3( + s))) # batch_size x num_channels x (board_x-2) x (board_y-2) + s = F.relu(self.bn4(self.conv4( + s))) # batch_size x num_channels x (board_x-4) x (board_y-4) + s = s.view( + -1, + self.args.num_channels * (self.board_x - 4) * (self.board_y - 4)) + + s = F.dropout( + F.relu(self.fc_bn1(self.fc1(s))), + p=self.args.dropout, + training=self.training) # batch_size x 128 + s = F.dropout( + F.relu(self.fc_bn2(self.fc2(s))), + p=self.args.dropout, + training=self.training) # batch_size x 64 + + pi = self.fc3(s) # batch_size x action_size + v = self.fc4(s) # batch_size x 1 + + return F.log_softmax(pi, dim=1), torch.tanh(v) + + +# ===== simple agent ====== +args = dotdict({ + 'dropout': 0.3, + 'num_channels': 64, +}) + + +class SimpleAgent(): + def __init__(self, game, cuda=True): + self.cuda = cuda and torch.cuda.is_available() + self.model = Connect4Model(game, args) + if self.cuda: + self.model.cuda() + + self.board_x, self.board_y = game.getBoardSize() + self.action_size = game.getActionSize() + + def predict(self, board): + """ + Args: + board (np.array): input board + + Return: + pi (np.array): probability of actions + v (np.array): estimated value of input + """ + # preparing input + board = torch.FloatTensor(board.astype(np.float64)) + if self.cuda: + board = board.contiguous().cuda() + board = board.view(1, self.board_x, self.board_y) + + self.model.eval() # eval mode + + with torch.no_grad(): + log_pi, v = self.model(board) + + pi = torch.exp(log_pi) + + return pi.data.cpu().numpy()[0], v.data.cpu().numpy()[0] + + def load_checkpoint(self, buffer): + map_location = None if self.cuda else 'cpu' + checkpoint = torch.load(buffer, map_location=map_location) + self.model.load_state_dict(checkpoint) + + +# ===== predict 
function ====== +import base64 +import io + +game = Connect4Game() + +# AlphaZero players +agent = SimpleAgent(game) +buffer = io.BytesIO(decoded) +agent.load_checkpoint(buffer) +mcts_args = dotdict({'numMCTSSims': 800, 'cpuct': 1.0}) +mcts = MCTS(game, agent, mcts_args) + + +def alphazero_agent(obs, config): + board = np.reshape(obs.board.copy(), game.getBoardSize()).astype(int) + board[np.where(board == 2)] = -1 + + player = 1 + if obs.mark == 2: + player = -1 + + x = game.getCanonicalForm(board, player) + + action = np.argmax( + mcts.getActionProb(x, temp=0, timelimit=config.timeout - 0.5)) + return int(action) diff --git a/benchmark/torch/AlphaZero/utils.py b/benchmark/torch/AlphaZero/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..5ae500cdae19f002538c563b6cbae725c7b0d9af --- /dev/null +++ b/benchmark/torch/AlphaZero/utils.py @@ -0,0 +1,65 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + + +class dotdict(dict): + def __getattr__(self, name): + try: + return self[name] + except KeyError: + raise AttributeError(name) + + +def win_loss_draw(score): + if score > 0: + return 'win' + if score < 0: + return 'loss' + return 'draw' + + +""" +split one list to multiple lists +""" +split_group = lambda the_list, group_size: zip(*(iter(the_list), ) * group_size) + +import numpy as np +import json +from connect4_game import Connect4Game + + +def get_test_dataset(): + game = Connect4Game() + test_dataset = [] + with open("refmoves1k_kaggle") as f: + for line in f: + data = json.loads(line) + + board = data["board"] + board = np.reshape(board, game.getBoardSize()).astype(int) + board[np.where(board == 2)] = -1 + + # find out how many moves are played to set the correct mark. 
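+            # e.g. ply == 3 (odd) means three stones are already on the board,
+            # so it is the second player's turn and the canonical mark is -1.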
+ ply = len([x for x in data["board"] if x > 0]) + if ply & 1: + player = -1 + else: + player = 1 + + test_dataset.append({ + 'board': board, + 'player': player, + 'move_score': data['move score'], + }) + return test_dataset diff --git a/benchmark/torch/a2c/train.py b/benchmark/torch/a2c/train.py index f2985367f8304edb6bccc93f894a7d04f5f305c8..9a498023988bc72a0a0aa43d4850c25ced8d2856 100644 --- a/benchmark/torch/a2c/train.py +++ b/benchmark/torch/a2c/train.py @@ -27,7 +27,7 @@ from parl.env.atari_wrappers import wrap_deepmind from parl.utils.window_stat import WindowStat from parl.utils.time_stat import TimeStat from parl.utils import machine_info -from parl.utils import logger, get_gpu_count, tensorboard +from parl.utils import logger, get_gpu_count, summary from parl.algorithms import A2C from atari_model import ActorCritic @@ -205,19 +205,19 @@ class Learner(object): } if metric['mean_episode_rewards'] is not None: - tensorboard.add_scalar('train/mean_reward', - metric['mean_episode_rewards'], - self.sample_total_steps) - tensorboard.add_scalar('train/total_loss', metric['total_loss'], - self.sample_total_steps) - tensorboard.add_scalar('train/pi_loss', metric['pi_loss'], - self.sample_total_steps) - tensorboard.add_scalar('train/vf_loss', metric['vf_loss'], - self.sample_total_steps) - tensorboard.add_scalar('train/entropy', metric['entropy'], - self.sample_total_steps) - tensorboard.add_scalar('train/learn_rate', metric['lr'], - self.sample_total_steps) + summary.add_scalar('train/mean_reward', + metric['mean_episode_rewards'], + self.sample_total_steps) + summary.add_scalar('train/total_loss', metric['total_loss'], + self.sample_total_steps) + summary.add_scalar('train/pi_loss', metric['pi_loss'], + self.sample_total_steps) + summary.add_scalar('train/vf_loss', metric['vf_loss'], + self.sample_total_steps) + summary.add_scalar('train/entropy', metric['entropy'], + self.sample_total_steps) + summary.add_scalar('train/learn_rate', metric['lr'], + self.sample_total_steps) logger.info(metric) diff --git a/benchmark/torch/dqn/replay_memory.py b/benchmark/torch/dqn/replay_memory.py index ea8c6565155ddacae568e901566f9b390ee3a8b8..25ef0e50a7e53fd68d19b6ac3bffb807b18d6d29 100644 --- a/benchmark/torch/dqn/replay_memory.py +++ b/benchmark/torch/dqn/replay_memory.py @@ -16,16 +16,16 @@ import numpy as np import copy from collections import deque, namedtuple -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) class ReplayMemory(object): - def __init__(self, max_size, state_shape, context_len): + def __init__(self, max_size, obs_shape, context_len): self.max_size = int(max_size) - self.state_shape = state_shape + self.obs_shape = obs_shape self.context_len = int(context_len) - self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') @@ -48,42 +48,41 @@ class ReplayMemory(object): else: self._context.append(exp) - def recent_state(self): - """ maintain recent state for training""" + def recent_obs(self): + """ maintain recent obs for training""" lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ (self._context.maxlen - len(lst)) - states.extend([k.state for k 
in lst]) - return states + obs.extend([k.obs for k in lst]) + return obs def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size # confirm that no frame was generated from last episode has_last_episode = False for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] + to_check_idx = obs_idx[k] if self.isOver[to_check_idx]: has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = self.state[state_idx] + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] break if not has_last_episode: - state = self.state[state_idx] + obs = self.obs[obs_idx] real_idx = (idx + self.context_len - 1) % self._curr_size action = self.action[real_idx] reward = self.reward[real_idx] isOver = self.isOver[real_idx] - return state, reward, action, isOver + return obs, reward, action, isOver def __len__(self): return self._curr_size @@ -92,7 +91,7 @@ class ReplayMemory(object): return self._curr_size def _assign(self, pos, exp): - self.state[pos] = exp.state + self.obs[pos] = exp.obs self.reward[pos] = exp.reward self.action[pos] = exp.action self.isOver[pos] = exp.isOver @@ -107,8 +106,8 @@ class ReplayMemory(object): return self._process_batch(batch_exp) def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return [obs, action, reward, isOver] diff --git a/benchmark/torch/dqn/train.py b/benchmark/torch/dqn/train.py index 9db3b8f776fa669772bb2748cbfed0a7067f5909..ba64b95c93a9b4879621331ad30cce3cbcbcac16 100644 --- a/benchmark/torch/dqn/train.py +++ b/benchmark/torch/dqn/train.py @@ -22,11 +22,11 @@ import parl import numpy as np from tqdm import tqdm -from parl.utils import tensorboard, logger +from parl.utils import summary, logger from parl.algorithms import DQN, DDQN from agent import AtariAgent -from atari_wrapper import FireResetEnv, FrameStack, LimitLength, MapState +from atari_wrapper import FireResetEnv, FrameStack, LimitLength from model import AtariModel from replay_memory import ReplayMemory, Experience from utils import get_player @@ -43,57 +43,57 @@ GAMMA = 0.99 def run_train_episode(env, agent, rpm): total_reward = 0 all_cost = [] - state = env.reset() + obs = env.reset() steps = 0 while True: steps += 1 - context = rpm.recent_state() - context.append(state) + context = rpm.recent_obs() + context.append(obs) context = np.stack(context, axis=0) action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - rpm.append(Experience(state, action, reward, isOver)) + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) if rpm.size() > MEMORY_WARMUP_SIZE: if steps % UPDATE_FREQ == 0: - batch_all_state, 
batch_action, batch_reward, batch_isOver = rpm.sample_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) all_cost.append(cost) total_reward += reward - state = next_state + obs = next_obs if isOver: mean_loss = np.mean(all_cost) if all_cost else None return total_reward, steps, mean_loss def run_evaluate_episode(env, agent): - state = env.reset() + obs = env.reset() total_reward = 0 while True: - pred_Q = agent.predict(state) + pred_Q = agent.predict(obs) action = pred_Q.max(1)[1].item() - state, reward, isOver, _ = env.step(action) + obs, reward, isOver, _ = env.step(action) total_reward += reward if isOver: return total_reward -def get_fixed_states(rpm, batch_size): - states = [] +def get_fixed_obs(rpm, batch_size): + obs = [] for _ in range(3): - batch_all_state = rpm.sample_batch(batch_size)[0] - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - states.append(batch_state) - fixed_states = np.concatenate(states, axis=0) - return fixed_states + batch_all_obs = rpm.sample_batch(batch_size)[0] + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + obs.append(batch_obs) + fixed_obs = np.concatenate(obs, axis=0) + return fixed_obs -def evaluate_fixed_Q(agent, states): +def evaluate_fixed_Q(agent, obs): with torch.no_grad(): - max_pred_Q = agent.alg.model(states).max(1)[0].mean() + max_pred_Q = agent.alg.model(obs).max(1)[0].mean() return max_pred_Q.item() @@ -131,9 +131,9 @@ def main(): total_reward, steps, _ = run_train_episode(env, agent, rpm) pbar.update(steps) - # Get fixed states to check value function. - fixed_states = get_fixed_states(rpm, args.batch_size) - fixed_states = torch.tensor(fixed_states, dtype=torch.float, device=device) + # Get fixed obs to check value function. 
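+    # (This batch stays fixed for the whole run; the average max-Q the model
+    # assigns to it is logged via summary as a training-stability diagnostic.)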
+ fixed_obs = get_fixed_obs(rpm, args.batch_size) + fixed_obs = torch.tensor(fixed_obs, dtype=torch.float, device=device) # train test_flag = 0 @@ -152,18 +152,17 @@ def main(): for _ in range(3): eval_rewards.append(run_evaluate_episode(test_env, agent)) - tensorboard.add_scalar('dqn/eval', np.mean(eval_rewards), - total_steps) - tensorboard.add_scalar('dqn/score', total_reward, total_steps) - tensorboard.add_scalar('dqn/loss', loss, total_steps) - tensorboard.add_scalar('dqn/exploration', agent.exploration, - total_steps) - tensorboard.add_scalar('dqn/Q value', - evaluate_fixed_Q(agent, fixed_states), - total_steps) - tensorboard.add_scalar('dqn/grad_norm', - get_grad_norm(agent.alg.model), - total_steps) + summary.add_scalar('dqn/eval', np.mean(eval_rewards), + total_steps) + summary.add_scalar('dqn/score', total_reward, total_steps) + summary.add_scalar('dqn/loss', loss, total_steps) + summary.add_scalar('dqn/exploration', agent.exploration, + total_steps) + summary.add_scalar('dqn/Q value', + evaluate_fixed_Q(agent, fixed_obs), + total_steps) + summary.add_scalar('dqn/grad_norm', + get_grad_norm(agent.alg.model), total_steps) if __name__ == '__main__': diff --git a/benchmark/torch/ppo/arguments.py b/benchmark/torch/ppo/arguments.py new file mode 100644 index 0000000000000000000000000000000000000000..b7d5d33df54b4652a416f0f9bbb49c3d1bd4a522 --- /dev/null +++ b/benchmark/torch/ppo/arguments.py @@ -0,0 +1,103 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
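+
+# Illustrative usage (a minimal sketch; train.py in this directory is the real
+# entry point and simply calls get_args()):
+#
+#   from arguments import get_args
+#   args = get_args()        # e.g. launched as: python train.py --env-name Hopper-v2
+#   print(args.env_name, args.lr, args.cuda)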
+ +import argparse +import torch + + +def get_args(): + parser = argparse.ArgumentParser(description='RL') + parser.add_argument( + '--lr', type=float, default=3e-4, help='learning rate (default: 3e-4)') + parser.add_argument( + '--eps', + type=float, + default=1e-5, + help='RMSprop optimizer epsilon (default: 1e-5)') + parser.add_argument( + '--gamma', + type=float, + default=0.99, + help='discount factor for rewards (default: 0.99)') + parser.add_argument( + '--gae-lambda', + type=float, + default=0.95, + help='gae lambda parameter (default: 0.95)') + parser.add_argument( + '--entropy-coef', + type=float, + default=0., + help='entropy term coefficient (default: 0.)') + parser.add_argument( + '--value-loss-coef', + type=float, + default=0.5, + help='value loss coefficient (default: 0.5)') + parser.add_argument( + '--max-grad-norm', + type=float, + default=0.5, + help='max norm of gradients (default: 0.5)') + parser.add_argument( + '--seed', type=int, default=1, help='random seed (default: 1)') + parser.add_argument( + '--num-steps', + type=int, + default=2048, + help='number of maximum forward steps in ppo (default: 2048)') + parser.add_argument( + '--ppo-epoch', + type=int, + default=10, + help='number of ppo epochs (default: 10)') + parser.add_argument( + '--num-mini-batch', + type=int, + default=32, + help='number of batches for ppo (default: 32)') + parser.add_argument( + '--clip-param', + type=float, + default=0.2, + help='ppo clip parameter (default: 0.2)') + parser.add_argument( + '--log-interval', + type=int, + default=1, + help='log interval, one log per n updates (default: 1)') + parser.add_argument( + '--eval-interval', + type=int, + default=10, + help='eval interval, one eval per n updates (default: 10)') + parser.add_argument( + '--num-env-steps', + type=int, + default=10e5, + help='number of environment steps to train (default: 10e5)') + parser.add_argument( + '--env-name', + default='Hopper-v2', + help='environment to train on (default: Hopper-v2)') + parser.add_argument( + '--use-linear-lr-decay', + action='store_true', + default=False, + help='use a linear schedule on the learning rate') + args = parser.parse_args() + + args.cuda = torch.cuda.is_available() + + return args diff --git a/benchmark/torch/ppo/evaluation.py b/benchmark/torch/ppo/evaluation.py new file mode 100644 index 0000000000000000000000000000000000000000..8aa020ca66a0c3a97d8deea55e37dabc4cf7512b --- /dev/null +++ b/benchmark/torch/ppo/evaluation.py @@ -0,0 +1,56 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import torch + +import utils +from wrapper import make_env + + +def evaluate(agent, ob_rms, env_name, seed, device): + if seed != None: + seed += 1 + eval_envs = make_env(env_name, seed, None) + vec_norm = utils.get_vec_normalize(eval_envs) + if vec_norm is not None: + vec_norm.eval() + vec_norm.ob_rms = ob_rms + + eval_episode_rewards = [] + + obs = eval_envs.reset() + eval_masks = torch.zeros(1, 1, device=device) + + while len(eval_episode_rewards) < 10: + with torch.no_grad(): + action = agent.predict(obs) + + # Obser reward and next obs + obs, _, done, infos = eval_envs.step(action) + + eval_masks = torch.tensor( + [[0.0] if done_ else [1.0] for done_ in done], + dtype=torch.float32, + device=device) + + for info in infos: + if 'episode' in info.keys(): + eval_episode_rewards.append(info['episode']['r']) + + eval_envs.close() + + print(" Evaluation using {} episodes: mean reward {:.5f}\n".format( + len(eval_episode_rewards), np.mean(eval_episode_rewards))) + return np.mean(eval_episode_rewards) diff --git a/benchmark/torch/ppo/mujoco_agent.py b/benchmark/torch/ppo/mujoco_agent.py new file mode 100644 index 0000000000000000000000000000000000000000..096f683f958829c0780ecc59d9ed144367c15f38 --- /dev/null +++ b/benchmark/torch/ppo/mujoco_agent.py @@ -0,0 +1,78 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+
+import parl
+import torch
+
+
+class MujocoAgent(parl.Agent):
+    def __init__(self, algorithm, device):
+        self.alg = algorithm
+        self.device = device
+
+    def predict(self, obs):
+        obs = torch.from_numpy(obs).float().to(self.device)
+        action = self.alg.predict(obs)
+        return action.cpu().numpy()
+
+    def sample(self, obs):
+        obs = torch.from_numpy(obs).to(self.device)
+        value, action, action_log_probs = self.alg.sample(obs)
+        return value.cpu().numpy(), action.cpu().numpy(), \
+            action_log_probs.cpu().numpy()
+
+    def learn(self, next_value, gamma, gae_lambda, ppo_epoch, num_mini_batch,
+              rollouts):
+        value_loss_epoch = 0
+        action_loss_epoch = 0
+        dist_entropy_epoch = 0
+
+        for e in range(ppo_epoch):
+            data_generator = rollouts.sample_batch(next_value, gamma,
+                                                   gae_lambda, num_mini_batch)
+
+            for sample in data_generator:
+                obs_batch, actions_batch, \
+                    value_preds_batch, return_batch, old_action_log_probs_batch, \
+                        adv_targ = sample
+
+                # move the sampled numpy batches onto the agent's device
+                # (self.device rather than a hard-coded 'cuda', so CPU runs also work)
+                obs_batch = torch.from_numpy(obs_batch).to(self.device)
+                actions_batch = torch.from_numpy(actions_batch).to(
+                    self.device)
+                value_preds_batch = torch.from_numpy(value_preds_batch).to(
+                    self.device)
+                return_batch = torch.from_numpy(return_batch).to(self.device)
+                old_action_log_probs_batch = torch.from_numpy(
+                    old_action_log_probs_batch).to(self.device)
+                adv_targ = torch.from_numpy(adv_targ).to(self.device)
+
+                value_loss, action_loss, dist_entropy = self.alg.learn(
+                    obs_batch, actions_batch, value_preds_batch, return_batch,
+                    old_action_log_probs_batch, adv_targ)
+
+                value_loss_epoch += value_loss
+                action_loss_epoch += action_loss
+                dist_entropy_epoch += dist_entropy
+
+        num_updates = ppo_epoch * num_mini_batch
+
+        value_loss_epoch /= num_updates
+        action_loss_epoch /= num_updates
+        dist_entropy_epoch /= num_updates
+
+        return value_loss_epoch, action_loss_epoch, dist_entropy_epoch
+
+    def value(self, obs):
+        obs = torch.from_numpy(obs).to(self.device)
+        return self.alg.value(obs).cpu().numpy()
diff --git a/benchmark/torch/ppo/mujoco_model.py b/benchmark/torch/ppo/mujoco_model.py
new file mode 100644
index 0000000000000000000000000000000000000000..83b762da2bd5a922d2a20605df641b6aec0ad949
--- /dev/null
+++ b/benchmark/torch/ppo/mujoco_model.py
@@ -0,0 +1,64 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
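+
+# Interface sketch (illustrative, not executed here): policy() returns the action
+# mean plus a state-independent log-std, and value() returns V(s). The assumption
+# is that the PPO algorithm consumes them roughly as:
+#
+#   mean, log_std = model.policy(obs)                     # obs: FloatTensor [batch, obs_dim]
+#   dist = torch.distributions.Normal(mean, log_std.exp())
+#   action = dist.sample()
+#   value = model.value(obs)                              # [batch, 1]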
+ +import parl +import torch +import torch.nn as nn +import torch.nn.functional as F +from torch.distributions import Normal + + +class MujocoModel(parl.Model): + def __init__(self, obs_dim, act_dim): + super(MujocoModel, self).__init__() + self.actor = Actor(obs_dim, act_dim) + self.critic = Critic(obs_dim) + + def policy(self, obs): + return self.actor(obs) + + def value(self, obs): + return self.critic(obs) + + +class Actor(parl.Model): + def __init__(self, obs_dim, act_dim): + super(Actor, self).__init__() + self.fc1 = nn.Linear(obs_dim, 64) + self.fc2 = nn.Linear(64, 64) + + self.fc_mean = nn.Linear(64, act_dim) + self.log_std = nn.Parameter(torch.zeros(act_dim)) + + def forward(self, obs): + x = torch.tanh(self.fc1(obs)) + x = torch.tanh(self.fc2(x)) + + mean = self.fc_mean(x) + return mean, self.log_std + + +class Critic(parl.Model): + def __init__(self, obs_dim): + super(Critic, self).__init__() + self.fc1 = nn.Linear(obs_dim, 64) + self.fc2 = nn.Linear(64, 64) + self.fc3 = nn.Linear(64, 1) + + def forward(self, obs): + x = torch.tanh(self.fc1(obs)) + x = torch.tanh(self.fc2(x)) + value = self.fc3(x) + + return value diff --git a/benchmark/torch/ppo/storage.py b/benchmark/torch/ppo/storage.py new file mode 100644 index 0000000000000000000000000000000000000000..b986b670d545fb88938785fc812a320103023d5d --- /dev/null +++ b/benchmark/torch/ppo/storage.py @@ -0,0 +1,107 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
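+
+# compute_returns() below implements Generalized Advantage Estimation (GAE):
+#
+#   delta_t  = r_t + gamma * V(s_{t+1}) * mask_{t+1} - V(s_t)
+#   A_t      = delta_t + gamma * lambda * mask_{t+1} * A_{t+1}
+#   return_t = A_t + V(s_t)
+#
+# mask is 0 right after an episode terminates, and bad_mask additionally zeroes
+# the accumulated advantage when the termination came from a time limit rather
+# than a real terminal state.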
+ +import numpy as np + +from torch.utils.data.sampler import BatchSampler, SubsetRandomSampler + + +class RolloutStorage(object): + def __init__(self, num_steps, obs_dim, act_dim): + self.num_steps = num_steps + self.obs_dim = obs_dim + self.act_dim = act_dim + + self.obs = np.zeros((num_steps + 1, obs_dim), dtype='float32') + self.actions = np.zeros((num_steps, act_dim), dtype='float32') + self.value_preds = np.zeros((num_steps + 1, ), dtype='float32') + self.returns = np.zeros((num_steps + 1, ), dtype='float32') + self.action_log_probs = np.zeros((num_steps, ), dtype='float32') + self.rewards = np.zeros((num_steps, ), dtype='float32') + + self.masks = np.ones((num_steps + 1, ), dtype='bool') + self.bad_masks = np.ones((num_steps + 1, ), dtype='bool') + + self.step = 0 + + def append(self, obs, actions, action_log_probs, value_preds, rewards, + masks, bad_masks): + """ + print("obs") + print(obs) + print("masks") + print(masks) + print("rewards") + print(rewards) + exit() + """ + self.obs[self.step + 1] = obs + self.actions[self.step] = actions + self.rewards[self.step] = rewards + self.action_log_probs[self.step] = action_log_probs + self.value_preds[self.step] = value_preds + self.masks[self.step + 1] = masks + self.bad_masks[self.step + 1] = bad_masks + + self.step = (self.step + 1) % self.num_steps + + def sample_batch(self, + next_value, + gamma, + gae_lambda, + num_mini_batch, + mini_batch_size=None): + # calculate return and advantage first + self.compute_returns(next_value, gamma, gae_lambda) + advantages = self.returns[:-1] - self.value_preds[:-1] + advantages = (advantages - advantages.mean()) / ( + advantages.std() + 1e-5) + + # generate sample batch + mini_batch_size = self.num_steps // num_mini_batch + sampler = BatchSampler( + SubsetRandomSampler(range(self.num_steps)), + mini_batch_size, + drop_last=True) + for indices in sampler: + obs_batch = self.obs[:-1][indices] + actions_batch = self.actions[indices] + value_preds_batch = self.value_preds[:-1][indices] + returns_batch = self.returns[:-1][indices] + old_action_log_probs_batch = self.action_log_probs[indices] + + value_preds_batch = value_preds_batch.reshape(-1, 1) + returns_batch = returns_batch.reshape(-1, 1) + old_action_log_probs_batch = old_action_log_probs_batch.reshape( + -1, 1) + + adv_targ = advantages[indices] + adv_targ = adv_targ.reshape(-1, 1) + + yield obs_batch, actions_batch, value_preds_batch, returns_batch, old_action_log_probs_batch, adv_targ + + def after_update(self): + self.obs[0] = np.copy(self.obs[-1]) + self.masks[0] = np.copy(self.masks[-1]) + self.bad_masks[0] = np.copy(self.bad_masks[-1]) + + def compute_returns(self, next_value, gamma, gae_lambda): + self.value_preds[-1] = next_value + gae = 0 + for step in reversed(range(self.rewards.size)): + delta = self.rewards[step] + gamma * self.value_preds[ + step + 1] * self.masks[step + 1] - self.value_preds[step] + gae = delta + gamma * gae_lambda * self.masks[step + 1] * gae + gae = gae * self.bad_masks[step + 1] + self.returns[step] = gae + self.value_preds[step] diff --git a/benchmark/torch/ppo/train.py b/benchmark/torch/ppo/train.py new file mode 100644 index 0000000000000000000000000000000000000000..6bb5dafbf4fbc6b96dc664030910446a7cfd46e1 --- /dev/null +++ b/benchmark/torch/ppo/train.py @@ -0,0 +1,128 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+# modified from https://github.com/ikostrikov/pytorch-a2c-ppo-acktr-gail
+
+import copy
+import os
+from collections import deque
+
+import gym
+import numpy as np
+import torch
+import torch.nn as nn
+import torch.nn.functional as F
+import torch.optim as optim
+
+import utils
+from arguments import get_args
+from wrapper import make_env
+from mujoco_model import MujocoModel
+from parl.algorithms import PPO
+from mujoco_agent import MujocoAgent
+from storage import RolloutStorage
+from evaluation import evaluate
+
+
+def main():
+    args = get_args()
+
+    torch.manual_seed(args.seed)
+    torch.cuda.manual_seed_all(args.seed)
+
+    torch.set_num_threads(1)
+    device = torch.device("cuda:0" if args.cuda else "cpu")
+
+    envs = make_env(args.env_name, args.seed, args.gamma)
+
+    model = MujocoModel(envs.observation_space.shape[0],
+                        envs.action_space.shape[0])
+    model.to(device)
+
+    algorithm = PPO(
+        model,
+        args.clip_param,
+        args.value_loss_coef,
+        args.entropy_coef,
+        initial_lr=args.lr,
+        eps=args.eps,
+        max_grad_norm=args.max_grad_norm)
+
+    agent = MujocoAgent(algorithm, device)
+
+    rollouts = RolloutStorage(args.num_steps, envs.observation_space.shape[0],
+                              envs.action_space.shape[0])
+
+    obs = envs.reset()
+    rollouts.obs[0] = np.copy(obs)
+
+    episode_rewards = deque(maxlen=10)
+
+    num_updates = int(args.num_env_steps) // args.num_steps
+    for j in range(num_updates):
+
+        if args.use_linear_lr_decay:
+            # decrease learning rate linearly
+            utils.update_linear_schedule(algorithm.optimizer, j, num_updates,
+                                         args.lr)
+
+        for step in range(args.num_steps):
+            # Sample actions
+            with torch.no_grad():
+                value, action, action_log_prob = agent.sample(
+                    rollouts.obs[step])  # rollouts.obs[step] holds the latest env obs (stored by append below)
+
+            # Observe reward and next obs
+            obs, reward, done, infos = envs.step(action)
+
+            for info in infos:
+                if 'episode' in info.keys():
+                    episode_rewards.append(info['episode']['r'])
+
+            # If done then clean the history of observations.
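+            # masks == 0 marks a real episode termination (stops value bootstrapping),
+            # while bad_masks == 0 marks a termination injected by the TimeLimit
+            # wrapper; RolloutStorage uses it to discard the GAE term at that step.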
+ masks = torch.FloatTensor( + [[0.0] if done_ else [1.0] for done_ in done]) + bad_masks = torch.FloatTensor( + [[0.0] if 'bad_transition' in info.keys() else [1.0] + for info in infos]) + rollouts.append(obs, action, action_log_prob, value, reward, masks, + bad_masks) + + with torch.no_grad(): + next_value = agent.value(rollouts.obs[-1]) + + value_loss, action_loss, dist_entropy = agent.learn( + next_value, args.gamma, args.gae_lambda, args.ppo_epoch, + args.num_mini_batch, rollouts) + + rollouts.after_update() + + if j % args.log_interval == 0 and len(episode_rewards) > 1: + total_num_steps = (j + 1) * args.num_steps + print( + "Updates {}, num timesteps {},\n Last {} training episodes: mean/median reward {:.1f}/{:.1f}, min/max reward {:.1f}/{:.1f}\n" + .format(j, total_num_steps, len(episode_rewards), + np.mean(episode_rewards), np.median(episode_rewards), + np.min(episode_rewards), np.max(episode_rewards), + dist_entropy, value_loss, action_loss)) + + if (args.eval_interval is not None and len(episode_rewards) > 1 + and j % args.eval_interval == 0): + ob_rms = utils.get_vec_normalize(envs).ob_rms + eval_mean_reward = evaluate(agent, ob_rms, args.env_name, + args.seed, device) + + +if __name__ == "__main__": + main() diff --git a/benchmark/torch/ppo/utils.py b/benchmark/torch/ppo/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..2e276a7f0779cfb55b3ef92012f22a61b7937c62 --- /dev/null +++ b/benchmark/torch/ppo/utils.py @@ -0,0 +1,43 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import glob +import os + +import torch +import torch.nn as nn + +from wrapper import VecNormalize + + +def get_vec_normalize(venv): + if isinstance(venv, VecNormalize): + return venv + elif hasattr(venv, 'venv'): + return get_vec_normalize(venv.venv) + + return None + + +def update_linear_schedule(optimizer, epoch, total_num_epochs, initial_lr): + """Decreases the learning rate linearly""" + lr = initial_lr - (initial_lr * (epoch / float(total_num_epochs))) + for param_group in optimizer.param_groups: + param_group['lr'] = lr + + +def init(module, weight_init, bias_init, gain=1): + weight_init(module.weight.data, gain=gain) + bias_init(module.bias.data) + return module diff --git a/benchmark/torch/ppo/wrapper.py b/benchmark/torch/ppo/wrapper.py new file mode 100644 index 0000000000000000000000000000000000000000..a890db1d0e5ee2cc2131794d9317a76a55e16e83 --- /dev/null +++ b/benchmark/torch/ppo/wrapper.py @@ -0,0 +1,180 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Simplified version of https://github.com/ShangtongZhang/DeepRL/blob/master/deep_rl/component/envs.py + +import numpy as np +import gym +from gym.core import Wrapper +import time + + +class TimeLimitMask(gym.Wrapper): + def step(self, action): + obs, rew, done, info = self.env.step(action) + if done and self.env._max_episode_steps == self.env._elapsed_steps: + info['bad_transition'] = True + return obs, rew, done, info + + def reset(self, **kwargs): + return self.env.reset(**kwargs) + + +class MonitorEnv(gym.Wrapper): + def __init__(self, env): + Wrapper.__init__(self, env=env) + self.tstart = time.time() + self.rewards = None + + def step(self, action): + ob, rew, done, info = self.env.step(action) + self.update(ob, rew, done, info) + return (ob, rew, done, info) + + def update(self, ob, rew, done, info): + self.rewards.append(rew) + if done: + eprew = sum(self.rewards) + eplen = len(self.rewards) + epinfo = { + "r": round(eprew, 6), + "l": eplen, + "t": round(time.time() - self.tstart, 6) + } + assert isinstance(info, dict) + info['episode'] = epinfo + self.reset() + + def reset(self, **kwargs): + self.rewards = [] + return self.env.reset(**kwargs) + + +class VectorEnv(gym.Wrapper): + def step(self, action): + ob, rew, done, info = self.env.step(action) + ob = np.array(ob) + ob = ob[np.newaxis, :] + rew = np.array([rew]) + + done = np.array([done]) + + info = [info] + return (ob, rew, done, info) + + +class RunningMeanStd(object): + # https://en.wikipedia.org/wiki/Algorithms_for_calculating_variance#Parallel_algorithm + def __init__(self, epsilon=1e-4, shape=()): + self.mean = np.zeros(shape, 'float64') + self.var = np.ones(shape, 'float64') + self.count = epsilon + + def update(self, x): + batch_mean = np.mean(x, axis=0) + batch_var = np.var(x, axis=0) + batch_count = x.shape[0] + self.update_from_moments(batch_mean, batch_var, batch_count) + + def update_from_moments(self, batch_mean, batch_var, batch_count): + self.mean, self.var, self.count = update_mean_var_count_from_moments( + self.mean, self.var, self.count, batch_mean, batch_var, + batch_count) + + +def update_mean_var_count_from_moments(mean, var, count, batch_mean, batch_var, + batch_count): + delta = batch_mean - mean + tot_count = count + batch_count + + new_mean = mean + delta * batch_count / tot_count + m_a = var * count + m_b = batch_var * batch_count + M2 = m_a + m_b + np.square(delta) * count * batch_count / tot_count + new_var = M2 / tot_count + new_count = tot_count + + return new_mean, new_var, new_count + + +class VecNormalize(gym.Wrapper): + def __init__(self, + env, + ob=True, + ret=True, + clipob=10., + cliprew=10., + gamma=0.99, + epsilon=1e-8): + Wrapper.__init__(self, env=env) + observation_space = env.observation_space.shape[0] + + self.ob_rms = RunningMeanStd(shape=observation_space) if ob else None + self.ret_rms = RunningMeanStd(shape=()) if ret else None + + self.clipob = clipob + self.cliprew = cliprew + self.gamma = gamma + self.epsilon = epsilon + self.ret = np.zeros(1) + self.training = True + + def step(self, action): + ob, rew, new, info = self.env.step(action) + self.ret = self.ret * 
self.gamma + rew + # normalize observation + ob = self._obfilt(ob) + # normalize reward + if self.ret_rms: + self.ret_rms.update(self.ret) + rew = np.clip(rew / np.sqrt(self.ret_rms.var + self.epsilon), + -self.cliprew, self.cliprew) + self.ret[new] = 0. + return ob, rew, new, info + + def reset(self): + self.ret = np.zeros(1) + ob = self.env.reset() + return self._obfilt(ob) + + def _obfilt(self, ob, update=True): + if self.ob_rms: + if self.training and update: + self.ob_rms.update(ob) + ob = np.clip((ob - self.ob_rms.mean) / + np.sqrt(self.ob_rms.var + self.epsilon), -self.clipob, + self.clipob) + return ob + else: + return ob + + def train(self): + self.training = True + + def eval(self): + self.trainint = False + + +def make_env(env_name, seed, gamma): + env = gym.make(env_name) + env.seed(seed) + env = TimeLimitMask(env) + env = MonitorEnv(env) + env = VectorEnv(env) + if gamma is None: + env = VecNormalize(env, ret=False) + else: + env = VecNormalize(env, gamma=gamma) + + return env diff --git a/benchmark/torch/td3/train.py b/benchmark/torch/td3/train.py index c844d8c079a4b10e1e0ade957202cd7d2dcd27fb..48bd1f77103f1e50bd28f55cc12bee09315496e7 100644 --- a/benchmark/torch/td3/train.py +++ b/benchmark/torch/td3/train.py @@ -15,7 +15,7 @@ import gym import argparse import numpy as np -from parl.utils import logger, tensorboard, ReplayMemory +from parl.utils import logger, summary, ReplayMemory from mujoco_model import MujocoModel from mujoco_agent import MujocoAgent @@ -103,8 +103,7 @@ def main(): train_reward, steps = run_train_episode(env, agent, rpm) total_steps += steps logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) - tensorboard.add_scalar('train/episode_reward', train_reward, - total_steps) + summary.add_scalar('train/episode_reward', train_reward, total_steps) if total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag: @@ -112,8 +111,8 @@ def main(): evaluate_reward = run_evaluate_episode(env, agent) logger.info('Steps {}, Evaluate reward: {}'.format( total_steps, evaluate_reward)) - tensorboard.add_scalar('eval/episode_reward', evaluate_reward, - total_steps) + summary.add_scalar('eval/episode_reward', evaluate_reward, + total_steps) if __name__ == '__main__': diff --git a/docs/EvoKit/minimal_example.rst b/docs/EvoKit/minimal_example.rst new file mode 100644 index 0000000000000000000000000000000000000000..0eb7c66902fe71ebe097586f8385f43952068860 --- /dev/null +++ b/docs/EvoKit/minimal_example.rst @@ -0,0 +1,190 @@ +minimal example +--------------------- + +``本教程的目标: +演示如何通过EvoKit库来解决经典的CartPole 问题。`` + +*本教程假定读者曾经使用过PaddlePaddle, 了解基本的进化算法迭代流程。* + +CartPole 介绍 +############# +CartPole又叫倒立摆。小车上放了一根杆,杆会因重力而倒下。为了不让杆倒下,我们要通过移动小车,来保持其是直立的。如下图所示。 +在每一个时间步,模型的输入是一个4维的向量,表示当前小车和杆的状态,模型输出的信号用于控制小车往左或者右移动。当杆没有倒下的时候,每个时间步,环境会给1分的奖励;当杆倒下后,环境不会给任何的奖励,游戏结束。 + +.. image:: ../../examples/QuickStart/performance.gif + :width: 300px + +step1: 生成预测网络 +######################## +根据上面的环境介绍,我们需要构造一个神经网络,输入为4维的向量,输出为2维的概率分布向量(表示左/右)移动的概率。 +在这里,我们使用Paddle来实现预测网络,并保存到本地。 + +.. 
code-block:: python + + from paddle import fluid + + def net(obs, act_dim): + hid1 = fluid.layers.fc(obs, size=20) + prob = fluid.layers.fc(hid1, size=act_dim, act='softmax') + return prob + + if __name__ == '__main__': + obs_dim = 4 + act_dim = 2 + obs = fluid.layers.data(name="obs", shape=[obs_dim], dtype='float32') + prob = net(obs, act_dim) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + fluid.io.save_inference_model( + dirname='init_model', + feeded_var_names=['obs'], + target_vars=[prob], + params_filename='params', + model_filename='model', + executor=exe) + +step2: 构造ESAgent +################### + +- 调用 ``load_config`` 加载配置文件。 +- 调用 ``load_inference_model`` 函数加载模型参数。 +- 调用 ``init_solver`` 初始化solver。 + +配置文件主要是用于指定进化算法类型(比如Gaussian或者CMA),使用的optimizer类型(Adam或者SGD)。 + +.. code-block:: c++ + + ESAgent agent = ESAgent(); + agent.load_config(config); + agent.load_inference_model(model_dir); + agent.init_solver(); + + // 附:EvoKit配置项示范 + solver { + type: BASIC_ES + optimizer { // 线下Adam更新 + type: ADAM + base_lr: 0.05 + adam { + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-08 + } + } + sampling { // 线上高斯采样 + type: GAUSSIAN_SAMPLING + gaussian_sampling { + std: 0.5 + cached: true + seed: 1024 + cache_size : 100000 + } + } + } + + +step3: 生成用于采样的Agent +################### + +主要关注三个接口: + +- 调用 ``clone`` 生成一个用于sampling的agent。 +- 调用 ``add_noise`` 给这个agent的参数空间增加噪声,同时返回该噪声对应的唯一信息,这个信息得记录在log中,用于线下更新。 +- 调用 ``predict`` 提供预测接口。 + +.. code-block:: c++ + + auto sampling_agent = agent.clone(); + auto sampling_info = sampling_agent.add_noise(); + sampling_agent.predict(feature); + +step4: 用采样的数据更新模型参数 +################### + +用户提供两组数据: + +- 采样参数过程中用于线下复现采样噪声的sampling_info +- 扰动参数后,新参数的评估结果 + +.. code-block:: c++ + + agent.update(sampling_infos, rewards); + +主代码以及注释 +################# + +以下的代码演示通过多线程同时采样, 提升解决问题的效率。 + +.. 
code-block:: c++ + + int main(int argc, char* argv[]) { + std::vector envs; + // 构造10个环境,用于多线程训练 + for (int i = 0; i < ITER; ++i) { + envs.push_back(CartPole()); + } + + // 初始化ESAgent + std::string model_dir = "./demo/cartpole/init_model"; + std::string config_path = "./demo/cartpole/config.prototxt"; + std::shared_ptr agent = std::make_shared(); + agent->load_config(config_path); // 加载配置 + + agent->load_inference_model(FLAGS_model_dir); // 加载初始预测模型 + agent->init_solver(); // 初始化solver,注意要在load_inference_model后执行 + + // 生成10个agent用于同时采样 + std::vector> sampling_agents; + for (int i = 0; i < ITER; ++i) { + sampling_agents.push_back(agent->clone()); + } + + std::vector sampling_infos; + std::vector rewards(ITER, 0.0f); + sampling_infos.resize(ITER); + omp_set_num_threads(10); + + // 共迭代100轮 + for (int epoch = 0; epoch < 100; ++epoch) { + #pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < ITER; ++i) { + std::shared_ptr sampling_agent = sampling_agents[i]; + SamplingInfo sampling_info; + sampling_agent->add_noise(sampling_info); + float reward = evaluate(envs[i], sampling_agent); + // 保存采样的sampling_info以及对应的评估结果reward + sampling_infos[i] = sampling_info; + rewards[i] = reward; + } + // 更新模型参数,注意:参数更新后会自动同步到sampling_agent中 + agent->update(sampling_infos, rewards); + + int reward = evaluate(envs[0], agent); + LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; // 打印每一轮reward + } + } + +如何运行demo +################# + +- 下载代码 + + 在icode上clone代码,我们的仓库路径是: ``baidu/nlp/deep-es`` ``TO DO: 修改库路径`` + +- 编译demo + + 通过bcloud的云端集群编译即可,命令为: ``bb`` + +- 运行demo + + 编译完成后,我们需要增加动态库查找路径: + + ``export LD_LIBRARY_PATH=./output/so/:$LD_LIBRARY_PATH`` + + 运行demo: ``./output/bin/cartpole/train`` + +问题解决 +#################### + +在使用过程中有任何问题,请加hi群: 1692822 (PARL官方答疑群)进行咨询,开发同学会直接回答任何的使用问题。 diff --git a/docs/EvoKit/online_example.rst b/docs/EvoKit/online_example.rst new file mode 100644 index 0000000000000000000000000000000000000000..c4963f8cb909a240f318b1e85c77ba310c460160 --- /dev/null +++ b/docs/EvoKit/online_example.rst @@ -0,0 +1,124 @@ +Example for Online Products +######################### + +``本教程的目标: 演示通过EvoKit库上线后,如何迭代算法,更新模型参数。`` + +在产品线中,线上无法实时拿到用户日志,经常是通过保存用户点击/时长日志,在线下根据用户数据更新模型,然后再推送到线上,完成算法的更新。 +本教程继续围绕经典的CartPole环境,展示如何通过在线采样/离线更新的方式,来更新迭代ES算法。 + +demo的完整代码示例放在demp/online_example文件夹中。 +``TO DO: 文件夹`` + +初始化solver +--------------------- +构造solver,对它初始化,并保存到文件。初始化solver仅需在开始时调用一次。 + +.. code-block:: c++ + + std::shared_ptr agent = std::make_shared(); + agent->load_config(FLAGS_config_path); + agent->load_inference_model(FLAGS_model_dir); + agent->init_solver(); + agent->save_solver(FLAGS_model_dir); + + +线上采样 +--------------------- +加载模型和solver,记录线上采样返回的sampling_info以及评估的reward,并通过二进制的方式记录到log文件中。 + +.. 
code-block:: c++ + + std::shared_ptr agent = std::make_shared(); + agent->load_config(FLAGS_config_path); + agent->load_inference_model(FLAGS_model_dir); + agent->load_solver(FLAGS_model_dir); + + #pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < ITER; ++i) { + std::shared_ptr sampling_agent = sampling_agents[i]; + SamplingInfo sampling_info; + sampling_agent->add_noise(sampling_info); + float reward = evaluate(envs[i], sampling_agent); + sampling_infos[i] = sampling_info; + rewards[i] = reward; + } + + // save sampling information and log in binary fomrat + std::ofstream log_stream(FLAGS_log_path, std::ios::binary); + for (int i = 0; i < ITER; ++i) { + std::string data; + sampling_infos[i].SerializeToString(&data); + int size = data.size(); + log_stream.write((char*) &rewards[i], sizeof(float)); + log_stream.write((char*) &size, sizeof(int)); + log_stream.write(data.c_str(), size); + } + log_stream.close(); + + +线下更新 +----------------------- +在加载好之前记录的log之后,调用 ``update`` 函数进行更新,然后通过 ``save_inference_model`` 和 ``save_solver`` 函数保存更新后的参数到本地,推送到线上。 + +.. code-block:: c++ + + std::shared_ptr agent = std::make_shared(); + agent->load_config(FLAGS_config_path); + agent->load_inference_model(FLAGS_model_dir); + agent->load_solver(FLAGS_model_dir); + + // load training data + std::vector sampling_infos; + std::vector rewards(ITER, 0.0f); + sampling_infos.resize(ITER); + std::ifstream log_stream(FLAGS_log_path); + CHECK(log_stream.good()) << "[EvoKit] cannot open log: " << FLAGS_log_path; + char buffer[1000]; + for (int i = 0; i < ITER; ++i) { + int size; + log_stream.read((char*) &rewards[i], sizeof(float)); + log_stream.read((char*) &size, sizeof(int)); + log_stream.read(buffer, size); + buffer[size] = 0; + std::string data(buffer); + sampling_infos[i].ParseFromString(data); + } + + // update model and save parameter + agent->update(sampling_infos, rewards); + agent->save_inference_model(FLAGS_updated_model_dir); + agent->save_solver(FLAGS_updated_model_dir); + + +主代码 +----------------------- + +将以上代码分别编译成可执行文件。 + +- 初始化solver: ``init_solver`` 。 +- 线上采样: ``online_sampling`` 。 +- 线下更新: ``offline update`` 。 + +.. code-block:: shell + + #------------------------init solver------------------------ + ./init_solver \ + --model_dir="./model_warehouse/model_dir_0" \ + --config_path="config.prototxt" + + + for ((epoch=0;epoch<200;++epoch));do + #------------------------online sampling------------------------ + ./online_sampling \ + --log_path="./sampling_log" \ + --model_dir="./model_warehouse/model_dir_$epoch" \ + --config_path="./config.prototxt" + + #------------------------offline update------------------------ + next_epoch=$((epoch+1)) + ./offline_update \ + --log_path='./sampling_log' \ + --model_dir="./model_warehouse/model_dir_$epoch" \ + --updated_model_dir="./model_warehouse/model_dir_${next_epoch}" \ + --config_path="./config.prototxt" + done diff --git a/docs/EvoKit/overview.rst b/docs/EvoKit/overview.rst new file mode 100644 index 0000000000000000000000000000000000000000..ce6fa07211456e12a0fbc29f6ecc37b501e45f24 --- /dev/null +++ b/docs/EvoKit/overview.rst @@ -0,0 +1,21 @@ +Overview +------------------ + +``EvoKit`` 是一个集合了多种进化算法、兼容多种类预测框架的进化算法库,主打 **快速上线验证** 。 + +.. image:: ../../evo_kit/DeepES.gif + :align: center + :width: 400px + +特性 +######### + +**1. 多种进化算法支持。** 支持高斯采样、CMA、GA等算法,更多算法持续接入中。 + +**2. 主流优化器支持。** 支持SGD/Momentum/Adam等多个主流优化器,有效提升算法收敛效率。 + +**3. 一站式上线。** 整合了线上采样和线下更新流程, 提供Bcloud/Cmake等编译方式, 助力快速上线。 + +**4. 
深度学习框架全系列兼容。** 裸写的网络,paddle/lego/Torch等深度学习框架,EvoKit都支持。 + +**5. 同步/异步更新方式。** 支持多个采样模型/多份采样数据异步更新,完美契合业务场景。 diff --git a/docs/conf.py b/docs/conf.py index e4e009f0d8d2edc5ae158b0ab5d680c9c45fcdc2..29f697d1db5fc60304f1da625ed92cf14f2f819b 100644 --- a/docs/conf.py +++ b/docs/conf.py @@ -101,3 +101,37 @@ def setup(app): add_module_names = False + +latex_engine = 'xelatex' +latex_use_xindy = False +latex_elements = { + 'preamble': '\\usepackage[UTF8]{ctex}\n', +} + +latex_elements = { + # The paper size ('letterpaper' or 'a4paper'). + #'papersize': 'letterpaper', + + # The font size ('10pt', '11pt' or '12pt'). + #'pointsize': '10pt', + + # Additional stuff for the LaTeX preamble. + #'preamble': '', + 'preamble': + r''' + \hypersetup{unicode=true} + \usepackage{CJKutf8} + \DeclareUnicodeCharacter{00A0}{\nobreakspace} + \DeclareUnicodeCharacter{2203}{\ensuremath{\exists}} + \DeclareUnicodeCharacter{2200}{\ensuremath{\forall}} + \DeclareUnicodeCharacter{2286}{\ensuremath{\subseteq}} + \DeclareUnicodeCharacter{2713}{x} + \DeclareUnicodeCharacter{27FA}{\ensuremath{\Longleftrightarrow}} + \DeclareUnicodeCharacter{221A}{\ensuremath{\sqrt{}}} + \DeclareUnicodeCharacter{221B}{\ensuremath{\sqrt[3]{}}} + \DeclareUnicodeCharacter{2295}{\ensuremath{\oplus}} + \DeclareUnicodeCharacter{2297}{\ensuremath{\otimes}} + \begin{CJK}{UTF8}{gbsn} + \AtEndDocument{\end{CJK}} + ''', +} diff --git a/docs/index.rst b/docs/index.rst index e7d6c144112fca11f836b6890c68b2e4c2010832..5009dde813c18dfb97c9066a7dfb9abecf22657a 100644 --- a/docs/index.rst +++ b/docs/index.rst @@ -46,7 +46,7 @@ Abstractions :maxdepth: 1 :caption: Installation - installation.rst + installation.rst .. toctree:: :maxdepth: 1 @@ -58,9 +58,10 @@ Abstractions :maxdepth: 1 :caption: Tutorial - getting_started.rst - new_alg.rst - save_param.rst + tutorial/getting_started.rst + tutorial/new_alg.rst + tutorial/save_param.rst + tutorial/tensorboard.rst .. toctree:: :maxdepth: 2 @@ -83,3 +84,11 @@ Abstractions model.rst algorithm.rst agent.rst + +.. toctree:: + :maxdepth: 2 + :caption: EvoKit + + EvoKit/overview.rst + EvoKit/minimal_example.rst + EvoKit/online_example.rst diff --git a/docs/tutorial/add_histogram.jpg b/docs/tutorial/add_histogram.jpg new file mode 100644 index 0000000000000000000000000000000000000000..1c33b0e3ad81f3ca0878c2623f6c4a6a80de19b0 Binary files /dev/null and b/docs/tutorial/add_histogram.jpg differ diff --git a/docs/tutorial/add_scalar.jpg b/docs/tutorial/add_scalar.jpg new file mode 100644 index 0000000000000000000000000000000000000000..27cb4a270150c00baf37332d79ed821c3bc901ba Binary files /dev/null and b/docs/tutorial/add_scalar.jpg differ diff --git a/docs/getting_started.rst b/docs/tutorial/getting_started.rst similarity index 98% rename from docs/getting_started.rst rename to docs/tutorial/getting_started.rst index a70a438ba7952a54d199d0fee345c0ee4e87b398..f406c47407d8b98d7bb26f99e8a54b64e11423c8 100644 --- a/docs/getting_started.rst +++ b/docs/tutorial/getting_started.rst @@ -178,9 +178,9 @@ Then we use this agent to interact with the environment, and run around 1000 epi Summary ----------- -.. image:: ../examples/QuickStart/performance.gif +.. image:: ../../examples/QuickStart/performance.gif :width: 300px -.. image:: ./images/quickstart.png +.. image:: ../images/quickstart.png :width: 300px In this tutorial, we have shown how to build an agent step-by-step to solve the `Cartpole` problem. 
diff --git a/docs/new_alg.rst b/docs/tutorial/new_alg.rst similarity index 98% rename from docs/new_alg.rst rename to docs/tutorial/new_alg.rst index 973c062b88cf5ad7f59e94161d4d019c72fbf717..1acf09796c3ed10ba6135ec367902e6f1d985d47 100644 --- a/docs/new_alg.rst +++ b/docs/tutorial/new_alg.rst @@ -59,7 +59,6 @@ Within class ``DQN(Algorithm)``, we define the following methods: Args: model (parl.Model): model defining forward network of Q function - hyperparas (dict): (deprecated) dict of hyper parameters. act_dim (int): dimension of the action space gamma (float): discounted factor for reward computation. lr (float): learning rate. diff --git a/docs/save_param.rst b/docs/tutorial/save_param.rst similarity index 95% rename from docs/save_param.rst rename to docs/tutorial/save_param.rst index 3824eb9d3fe23c47f375877a75c6c88aab06c0b4..82e411ab2010ef3f9b4dcca0fd0c23f319eac7b7 100644 --- a/docs/save_param.rst +++ b/docs/tutorial/save_param.rst @@ -22,5 +22,5 @@ Here is a demonstration of usage: agent.restore('./model.ckpt') # restore the parameters from ./model.ckpt to another_agent - another_agent = AtariAgent() + another_agent = AtariAgent() another_agent.restore('./model.ckpt') diff --git a/docs/tutorial/tensorboard.rst b/docs/tutorial/tensorboard.rst new file mode 100644 index 0000000000000000000000000000000000000000..8952a5e00b624e1c02b74c451da0d168ee6a4817 --- /dev/null +++ b/docs/tutorial/tensorboard.rst @@ -0,0 +1,55 @@ +summary +=============== + +Visualize the results with tensorboard. + +add_scalar +------------- + +Common used arguments: + +* summary.add_scalar(tag, scalar_value, global_step=None) + * tag *(string)* – Data identifier + * scalar_value *(float or string/blobname)* – Value to save + * global_step *(int)* – Global step value to record + +Example: + +.. code-block:: python + + from parl.utils import summary + + x = range(100) + for i in x: + summary.add_scalar('y=2x', i * 2, i) + +Expected result: + + .. image:: add_scalar.jpg + :scale: 50 % + +add_histogram +---------------- + +Common used arguments: + +* summary.add_scalar(tag, scalar_value, global_step=None) + * tag *(string)* – Data identifier + * values *(torch.Tensor, numpy.array, or string/blobname)* – Values to build histogram + * global_step *(int)* – Global step value to record + +Example: + +.. code-block:: python + + from parl.utils import summary + import numpy as np + + for i in range(10): + x = np.random.random(1000) + summary.add_histogram('distribution centers', x + i, i) + +Expected result: + + .. image:: add_histogram.jpg + :scale: 50 % diff --git a/evo_kit/CMakeLists.txt b/evo_kit/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..a9672c11aa0ea54ee6de4f6c6d60e92c18d47e60 --- /dev/null +++ b/evo_kit/CMakeLists.txt @@ -0,0 +1,89 @@ +cmake_minimum_required (VERSION 2.6) +project (EvoKit) + +########## options ########## +option(WITH_PADDLE "Compile EvoKit with PaddleLite framework." OFF) +option(WITH_TORCH "Compile EvoKit with Torch framework." 
OFF) + +message("WITH_PADDLE: "${WITH_PADDLE}) +message("WITH_TORCH: "${WITH_TORCH}) + +if (NOT (WITH_PADDLE OR WITH_TORCH)) + message("ERROR: You should choose at least one framework to compile EvoKit.") + return() +elseif(WITH_PADDLE AND WITH_TORCH) + message("ERROR: You cannot choose more than one framework to compile EvoKit.") + return() +endif() + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +find_package(OpenMP) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +endif() + + +file(GLOB src "core/src/*.cc" "core/proto/evo_kit/*.cc") +include_directories("core/include") +include_directories("core/proto") +include_directories("benchmark") + +########## PaddleLite config ########## +if (WITH_PADDLE) + add_definitions(-g -O3 -pthread) + + include_directories("paddle/include") + include_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/include" + "${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/include") + link_directories("${PROJECT_SOURCE_DIR}/inference_lite_lib/cxx/lib" + "${PROJECT_SOURCE_DIR}/inference_lite_lib/third_party/mklml/lib") + + file(GLOB framework_src "paddle/src/*.cc") + set(TARGET EvoKit_paddle) +########## Torch config ########## +elseif (WITH_TORCH) + # list(APPEND CMAKE_PREFIX_PATH "./libtorch") + # find_package(Torch REQUIRED ON) # TODO: not necessary for now + + include_directories("torch/include") + + file(GLOB framework_src "torch/src/*.cc") + set(TARGET EvoKit_torch) +else () + message("ERROR: You should choose at least one framework to compile EvoKit.") +endif() + + +add_library(${TARGET} STATIC ${src} ${framework_src}) +target_link_libraries(${TARGET} gflags protobuf pthread glog) + + +# ########## PaddleLite libraries ########## +# if (WITH_PADDLE) +# target_link_libraries(${TARGET} -lpaddle_full_api_shared) +# target_link_libraries(${TARGET} -lmklml_intel) +# target_link_libraries(${TARGET} -ldl) +# ########## Torch libraries ########## +# elseif (WITH_TORCH) +# target_link_libraries(${TARGET} "${TORCH_LIBRARIES}") +# endif() + +file(GLOB include "core/include/evo_kit/*.h") +file(GLOB proto_include "core/proto/evo_kit/*.h") +file(GLOB torch_include "torch/include/evo_kit/*.h") +file(GLOB paddle_include "paddle/include/evo_kit/*.h") +file(GLOB benchmark_include "benchmark/*.h") +file(GLOB findcmake "cmake/Torch/*.cmake") + +set(CMAKE_INSTALL_PREFIX "${PROJECT_SOURCE_DIR}/libevokit") +install(TARGETS ${TARGET} ARCHIVE DESTINATION "lib") +install(FILES ${include} ${proto_include} DESTINATION "include/evo_kit") +install(FILES ${torch_include} DESTINATION "torch/evo_kit") +install(FILES ${paddle_include} DESTINATION "paddle/evo_kit") +install(FILES ${benchmark_include} DESTINATION "include") +install(FILES ${findcmake} DESTINATION "cmake/Torch") diff --git a/evo_kit/DeepES.gif b/evo_kit/DeepES.gif new file mode 100644 index 0000000000000000000000000000000000000000..7240118f3fce55b587690450e0c9cafc2f0694db Binary files /dev/null and b/evo_kit/DeepES.gif differ diff --git a/evo_kit/README.md b/evo_kit/README.md new file mode 100644 index 0000000000000000000000000000000000000000..ecac85379c048f22e444b9286d8e5225a7e7daa8 --- /dev/null +++ b/evo_kit/README.md @@ -0,0 +1,41 @@ +# EvoKit +EvoKit 是一个集合了多种进化算法、兼容多种类预测框架的进化算法库,主打快速上线验证 。 +

+![PARL](DeepES.gif)

+ +## 使用示范 +```c++ +//实例化一个预测,根据配置文件加载模型,采样方式(Gaussian\CMA sampling..)、更新方式(SGD\Adam)等 +auto agent = ESAgent(config); + +for (int i = 0; i < 10; ++i) { + auto sampling_agnet = agent->clone(); // clone出一个sampling agent + SamplingInfo info; + sampling_agent->add_noise(info); // 参数扰动,同时保存随机种子到info中 + int reward = evaluate(env, sampling_agent); //评估参数 + noisy_info.push_back(info); // 记录随机噪声对应种子 + noisy_rewards.push_back(reward); // 记录评估结果 +} +//根据评估结果、随机种子更新参数,然后重复以上过程,直到收敛。 +agent->update(noisy_info, noisy_rewards); +``` + +## 一键运行demo列表 +- **PaddleLite**: sh ./scripts/build.sh paddle +- **Torch**: sh ./scripts/build.sh torch +- **裸写网络**: + +## 相关依赖: +- Protobuf2 +- OpenMP +- [glog](https://github.com/gflags/gflags/blob/master/INSTALL.md) +- [gflag](https://github.com/google/glog) + +## 额外依赖: + +### 使用PaddleLite +下载PaddleLite的X86预编译库,或者编译PaddleLite源码,得到inference_lite_lib文件夹,放在当前目录中。(可参考:[PaddleLite使用X86预测部署](https://paddle-lite.readthedocs.io/zh/latest/demo_guides/x86.html)) + +### 使用torch +下载[libtorch](https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip)或者编译torch源码,得到libtorch文件夹,放在当前目录中。 diff --git a/evo_kit/benchmark/cartpole.h b/evo_kit/benchmark/cartpole.h new file mode 100644 index 0000000000000000000000000000000000000000..f289715aeac29cb76d5148a5ae8b4adc5233243c --- /dev/null +++ b/evo_kit/benchmark/cartpole.h @@ -0,0 +1,98 @@ +// Third party code +// This code is copied or modified from openai/gym's cartpole.py +#include +#include +#include +#include + +const double kPi = 3.1415926535898; + +class CartPole { +public: + double gravity = 9.8; + double masscart = 1.0; + double masspole = 0.1; + double total_mass = (masspole + masscart); + double length = 0.5; // actually half the pole's length; + double polemass_length = (masspole * length); + double force_mag = 10.0; + double tau = 0.02; // seconds between state updates; + + // Angle at which to fail the episode + double theta_threshold_radians = 12 * 2 * kPi / 360; + double x_threshold = 2.4; + int steps_beyond_done = -1; + + std::vector state = {0, 0, 0, 0}; + double reward; + bool done; + int step_ = 0; + + const float* getState() { + return state.data(); + } + + double getReward() { + return reward; + } + + double isDone() { + return done; + } + + void reset() { + std::random_device rd; + std::default_random_engine generator(rd()); + std::uniform_real_distribution distribution(-0.05, 0.05); + for (int i = 0; i < 4; ++i) { + state[i] = distribution(generator); + } + + steps_beyond_done = -1; + step_ = 0; + } + + CartPole() { + reset(); + } + + void step(int action) { + float x = state[0]; + float x_dot = state[1]; + float theta = state[2]; + float theta_dot = state[3]; + + auto force = (action == 1) ? force_mag : -force_mag; + auto costheta = std::cos(theta); + auto sintheta = std::sin(theta); + auto temp = (force + polemass_length * theta_dot * theta_dot * sintheta) / + total_mass; + auto thetaacc = (gravity * sintheta - costheta * temp) / + (length * (4.0 / 3.0 - masspole * costheta * costheta / total_mass)); + auto xacc = temp - polemass_length * thetaacc * costheta / total_mass; + + x = x + tau * x_dot; + x_dot = x_dot + tau * xacc; + theta = theta + tau * theta_dot; + theta_dot = theta_dot + tau * thetaacc; + + state = {x, x_dot, theta, theta_dot}; + + done = x < -x_threshold || x > x_threshold || + theta < -theta_threshold_radians || theta > theta_threshold_radians || + step_ > 200; + + if (!done) { + reward = 1.0; + } else if (steps_beyond_done == -1) { + // Pole just fell! 
+ steps_beyond_done = 0; + reward = 0; + } else { + if (steps_beyond_done == 0) { + assert(false); // Can't do this + } + } + step_++; + } +}; diff --git a/evo_kit/cmake/Torch/EvoKitConfig.cmake b/evo_kit/cmake/Torch/EvoKitConfig.cmake new file mode 100644 index 0000000000000000000000000000000000000000..9f1c954430aec05a38d03c26a6b406343c01ad20 --- /dev/null +++ b/evo_kit/cmake/Torch/EvoKitConfig.cmake @@ -0,0 +1,45 @@ +# FindEvoKit +# ------- +# +# Finds the EvoKit library +# +# This will define the following variables: +# +# EVOKIT_FOUND -- True if the system has the EvoKit library +# EVOKIT_INCLUDE_DIRS -- The include directories for EvoKit +# EVOKIT_LIBRARY -- Libraries to link against +# +# and the following imported targets: +# +# EvoKit + +include(FindPackageHandleStandardArgs) + +if (DEFINED ENV{EVOKIT_INSTALL_PREFIX}) + set(EVOKIT_INSTALL_PREFIX $ENV{EVOKIT_INSTALL_PREFIX}) +else() + # Assume we are in /cmake/Torch/EvoKitConfig.cmake + get_filename_component(CMAKE_CURRENT_LIST_DIR "${CMAKE_CURRENT_LIST_FILE}" PATH) + get_filename_component(EVOKIT_INSTALL_PREFIX "${CMAKE_CURRENT_LIST_DIR}/../../" ABSOLUTE) +endif() + +# Include directories. +if (EXISTS "${EVOKIT_INSTALL_PREFIX}/include") + set(EVOKIT_INCLUDE_DIRS + ${EVOKIT_INSTALL_PREFIX}/include + ${EVOKIT_INSTALL_PREFIX}/torch) +else() + set(EVOKIT_INCLUDE_DIRS + ${EVOKIT_INSTALL_PREFIX}/include + ${EVOKIT_INSTALL_PREFIX}/torch) +endif() + +find_library(EVOKIT_LIBRARY libEvoKit_torch.a PATHS "${EVOKIT_INSTALL_PREFIX}/lib") + +include_directories("${EVOKIT_INSTALL_PREFIX}/torch") +include_directories("${EVOKIT_INSTALL_PREFIX}/include") + +find_package_handle_standard_args(EvoKit DEFAULT_MSG EVOKIT_LIBRARY EVOKIT_INCLUDE_DIRS) +message(STATUS "EVOKIT_FOUND: ${EVOKIT_FOUND}") +message(STATUS "EVOKIT_INCLUDE_DIRS: ${EVOKIT_INCLUDE_DIRS}") +message(STATUS "EVOKIT_LIBRARY: ${EVOKIT_LIBRARY}") diff --git a/evo_kit/core/include/evo_kit/adam_optimizer.h b/evo_kit/core/include/evo_kit/adam_optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..b268b69f61d35e5d6df8eeb56b1869e7bcb828ff --- /dev/null +++ b/evo_kit/core/include/evo_kit/adam_optimizer.h @@ -0,0 +1,53 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_ADAM_OPTIMIZER_H +#define EVO_KIT_ADAM_OPTIMIZER_H + +#include +#include +#include "evo_kit/optimizer.h" + +namespace evo_kit { + +/*@brief AdamOptimizer. + * Implements Adam algorithm. + * + *@Args: + * base_lr: learning rate (default: 1e-3). + * beta1: coefficients used for computing running averages of gradient (default: 0.9). + * beta2: coefficients used for computing running averages of gradient's square (default: 0.999). + * epsilon: term added to the denominator to improve numerical stability (default: 1e-8). 
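+ *
+ *@Note:
+ * Update sketch (as implemented by compute_step() plus the base-class
+ * weight step in Optimizer::update(), with t the update counter):
+ *   m = beta1 * m + (1 - beta1) * g
+ *   v = beta2 * v + (1 - beta2) * g * g
+ *   w -= base_lr * (sqrt(1 - beta2^t) / (1 - beta1^t)) * m / (sqrt(v) + epsilon)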
+ */ +class AdamOptimizer: public Optimizer { +public: + AdamOptimizer(float base_lr, float beta1 = 0.9, float beta2 = 0.999, + float epsilon = 1e-8): Optimizer(base_lr), \ + _beta1(beta1), _beta2(beta2), _epsilon(epsilon) {} + ~AdamOptimizer(); + +protected: + void compute_step(float* gradient, int size, std::string param_name); + +private: + float _beta1; + float _beta2; + float _epsilon; + std::unordered_map _momentum; + std::unordered_map _velocity; +}; + +}//namespace + +#endif diff --git a/evo_kit/core/include/evo_kit/cached_gaussian_sampling.h b/evo_kit/core/include/evo_kit/cached_gaussian_sampling.h new file mode 100644 index 0000000000000000000000000000000000000000..c033fb7f23e9d3d91754237cad61e181a823db2d --- /dev/null +++ b/evo_kit/core/include/evo_kit/cached_gaussian_sampling.h @@ -0,0 +1,78 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#ifndef EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H +#define EVO_KIT_CACHED_GAUSSIAN_SAMPLING_H + +#include +#include +#include +#include +#include +#include "sampling_method.h" +#include "utils.h" + +namespace evo_kit { + +class CachedGaussianSampling: public SamplingMethod { + +public: + CachedGaussianSampling(); + + ~CachedGaussianSampling(); + + /*Initialize the sampling algorithm given the config with the protobuf format. + *EvoKit library uses only one configuration file for all sampling algorithms. + A defalut configuration file can be found at: . // TODO: where? + Usally you won't have to modify the configuration items of other algorithms + if you are not using them. + */ + bool load_config(const EvoKitConfig& config); + + /*@brief generate Gaussian noise and the related key. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: generate Gaussian successfully or not. + */ + bool sampling(int* key, float* noise, int64_t size); + + /*@brief reconstruct the Gaussion noise given the key. + * This function is often used for updating the neuron network parameters in the offline environment. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: reconstruct Gaussian successfully or not. + */ + bool resampling(int key, float* noise, int64_t size); + +private: + float _std; + int _cache_size; + float* _noise_cache = nullptr; + + bool _create_noise_cache(); +}; + +} + +#endif diff --git a/evo_kit/core/include/evo_kit/gaussian_sampling.h b/evo_kit/core/include/evo_kit/gaussian_sampling.h new file mode 100644 index 0000000000000000000000000000000000000000..c0fc66f058f2d1b9224d19d5c029cdca1853f638 --- /dev/null +++ b/evo_kit/core/include/evo_kit/gaussian_sampling.h @@ -0,0 +1,73 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. 
+// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. +// +#ifndef EVO_KIT_GAUSSIAN_SAMPLING_H +#define EVO_KIT_GAUSSIAN_SAMPLING_H + +#include +#include +#include +#include +#include "evo_kit/sampling_method.h" +#include "evo_kit/utils.h" + +namespace evo_kit { + +class GaussianSampling: public SamplingMethod { + +public: + GaussianSampling() {} + + ~GaussianSampling() {} + + /*Initialize the sampling algorithm given the config with the protobuf format. + *EvoKit library uses only one configuration file for all sampling algorithms. + A defalut configuration file can be found at: . // TODO: where? + Usally you won't have to modify the configuration items of other algorithms + if you are not using them. + */ + bool load_config(const EvoKitConfig& config); + + /*@brief generate Gaussian noise and the related key. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: generate Gaussian successfully or not. + */ + bool sampling(int* key, float* noise, int64_t size); + + /*@brief reconstruct the Gaussion noise given the key. + * This function is often used for updating the neuron network parameters in the offline environment. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: reconstruct Gaussian successfully or not. + */ + bool resampling(int key, float* noise, int64_t size); + +private: + float _std; +}; + +} + +#endif diff --git a/evo_kit/core/include/evo_kit/optimizer.h b/evo_kit/core/include/evo_kit/optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..5c41bc5d405b00bef71affa0fa6cb82a13afd1b2 --- /dev/null +++ b/evo_kit/core/include/evo_kit/optimizer.h @@ -0,0 +1,79 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_OPTIMIZER_H +#define EVO_KIT_OPTIMIZER_H + +#include +#include + +namespace evo_kit { + +/*@brief Optimizer. Base class for optimizers. + * + *@Args: + * base_lr: learning rate (default: 1e-3). + * + * .. warning: update () is based on the parameter level, + * you need to perform update () on each parameter. + * + * Subclasses are required to implement the following functions: + * 1. 
compute_steps + */ +class Optimizer { +public: + Optimizer() : _base_lr(1e-3), _update_times(0) {} + Optimizer(float base_lr) : _base_lr(base_lr), _update_times(0) {} + virtual ~Optimizer() { + _params_size.clear(); + } + + template + bool update(T weights, float* gradient, int size, std::string param_name = "") { + /*@ Performs a single optimization step (parameter update) at the parameter level. + * + *@Args: + * weights (array): parameter weights. + * gradient (array): gradient for updating weights. + * size: size of gradient. + * param_name: the name corresponding to the weights. + */ + if (_params_size.count(param_name) == 0) { + _params_size[param_name] = size; + } else if (_params_size[param_name] != size) { + LOG(WARNING) << "[Warning] Update times: " << int(_update_times / _params_size.size()) \ + << ". Size of weights[" << param_name << "] is " << _params_size[param_name] << ", not " << size; + return false; + } + + ++_update_times; + compute_step(gradient, size, param_name); + + for (int i = 0; i < size; ++i) { + weights[i] -= _base_lr * gradient[i]; + } + + return true; + } // template function + +protected: + virtual void compute_step(float* graident, int size, std::string param_name = "") = 0; + float _base_lr; + float _update_times; + std::unordered_map _params_size; +}; + + +}//namespace +#endif diff --git a/evo_kit/core/include/evo_kit/optimizer_factory.h b/evo_kit/core/include/evo_kit/optimizer_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..6e3e099110d17efefd8dce9d5090b06fc27c0d21 --- /dev/null +++ b/evo_kit/core/include/evo_kit/optimizer_factory.h @@ -0,0 +1,36 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_OPTIMIZER_FACTORY_H +#define EVO_KIT_OPTIMIZER_FACTORY_H + +#include +#include +#include +#include "evo_kit/adam_optimizer.h" +#include "evo_kit/evo_kit.pb.h" +#include "evo_kit/optimizer.h" +#include "evo_kit/sgd_optimizer.h" + +namespace evo_kit { +/* @brief: create an optimizer according to the configuration" + * @args: + * config: configuration for the optimizer + * + */ +std::shared_ptr create_optimizer(const OptimizerConfig& optimizer_config); + +} // namespace + +#endif diff --git a/evo_kit/core/include/evo_kit/sampling_factory.h b/evo_kit/core/include/evo_kit/sampling_factory.h new file mode 100644 index 0000000000000000000000000000000000000000..e7e859cddcb88784b2d01b9642bcbc1b23e378cb --- /dev/null +++ b/evo_kit/core/include/evo_kit/sampling_factory.h @@ -0,0 +1,36 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_SAMPLING_FACTORY_H +#define EVO_KIT_SAMPLING_FACTORY_H + +#include +#include +#include +#include "evo_kit/cached_gaussian_sampling.h" +#include "evo_kit/evo_kit.pb.h" +#include "evo_kit/gaussian_sampling.h" +#include "evo_kit/sampling_method.h" + +namespace evo_kit { +/* @brief: create an sampling_method according to the configuration" + * @args: + * config: configuration for the EvoKit + * + */ +std::shared_ptr create_sampling_method(const EvoKitConfig& Config); + +} // namespace + +#endif diff --git a/evo_kit/core/include/evo_kit/sampling_method.h b/evo_kit/core/include/evo_kit/sampling_method.h new file mode 100644 index 0000000000000000000000000000000000000000..dc07dfe7cfefff694eef6cf7ca17ee35848eea98 --- /dev/null +++ b/evo_kit/core/include/evo_kit/sampling_method.h @@ -0,0 +1,90 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_SAMPLING_METHOD_H +#define EVO_KIT_SAMPLING_METHOD_H + +#include +#include +#include "evo_kit/evo_kit.pb.h" + +namespace evo_kit { + +/*Base class for sampling algorithms. All algorithms are required to override the following functions: + * + * 1. load_config + * 2. sampling + * 3. resampling + * + * View an demostrative algorithm in gaussian_sampling.h + * */ + +class SamplingMethod { + +public: + + SamplingMethod(): _seed(0) {} + + virtual ~SamplingMethod() {} + + /*Initialize the sampling algorithm given the config with the protobuf format. + *EvoKit library uses only one configuration file for all sampling algorithms. + A defalut configuration file can be found at: . // TODO: where? + Usally you won't have to modify the configuration items of other algorithms + if you are not using them. + */ + virtual bool load_config(const EvoKitConfig& config) = 0; + + /*@brief generate Gaussian noise and the related key. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: generate Gaussian successfully or not. + */ + virtual bool sampling(int* key, float* noise, int64_t size) = 0; + + /*@brief reconstruct the Gaussion noise given the key. + * This function is often used for updating the neuron network parameters in the offline environment. + * + *@Args: + * key: a unique key associated with the sampled noise. + * noise: a pointer pointed to the memory that stores the noise + * size: the number of float to be sampled. + * + *@return: + * success: reconstruct Gaussian successfully or not. 
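+ *
+ *@Note:
+ * Typical flow: sampling() runs on the online side and the returned key is
+ * logged (e.g. inside a SamplingInfo message); resampling() is then called
+ * offline with the same key to reproduce the identical noise before update().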
+ */ + virtual bool resampling(int key, float* noise, int64_t size) = 0; + + bool set_seed(int seed) { + _seed = seed; + srand(_seed); + return true; + } + + int get_seed() { + return _seed; + } + +protected: + int _seed; + +}; + +} +#endif diff --git a/evo_kit/core/include/evo_kit/sgd_optimizer.h b/evo_kit/core/include/evo_kit/sgd_optimizer.h new file mode 100644 index 0000000000000000000000000000000000000000..cd0d68803775df66d1bc90c748fe9801e17176c9 --- /dev/null +++ b/evo_kit/core/include/evo_kit/sgd_optimizer.h @@ -0,0 +1,46 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_SGD_OPTIMIZER_H +#define EVO_KIT_SGD_OPTIMIZER_H + +#include +#include +#include "evo_kit/optimizer.h" + +namespace evo_kit { + +/*@brief SGDOptimizer. + * Implements stochastic gradient descent (optionally with momentum). + * + *@Args: + * base_lr: learning rate (default: 1e-3). + * momentum: momentum factor (default: 0.9). + */ +class SGDOptimizer: public Optimizer { +public: + SGDOptimizer(float base_lr, float momentum = 0.9): Optimizer(base_lr), _momentum(momentum) {} + ~SGDOptimizer(); + +protected: + void compute_step(float* gradient, int size, std::string param_name); + +private: + float _momentum; + std::unordered_map _velocity; +}; + +} // namespace + +#endif diff --git a/evo_kit/core/include/evo_kit/utils.h b/evo_kit/core/include/evo_kit/utils.h new file mode 100644 index 0000000000000000000000000000000000000000..fd704fd384de70683445d65d5609f97b9979907a --- /dev/null +++ b/evo_kit/core/include/evo_kit/utils.h @@ -0,0 +1,97 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_UTILS_H +#define EVO_KIT_UTILS_H + +#include +#include +#include +#include +#include +#include "evo_kit/evo_kit.pb.h" + +namespace evo_kit { + +/*Return ranks that is normliazed to [-0.5, 0.5] with the rewards as input. + Args: + reward: an array of rewards +*/ +bool compute_centered_ranks(std::vector& reward); + +std::string read_file(const std::string& filename); + +/* Load a protobuf-based configuration from the file. + * Args: + * config_file: file path. + * proto_config: protobuff message for configuration. 
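+ * Usage sketch (assuming a prototxt such as demo/cartpole_config.prototxt):
+ *    EvoKitConfig config;
+ *    bool success = load_proto_conf("./cartpole_config.prototxt", config);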
+ * return + */ +template +bool load_proto_conf(const std::string& config_file, T& proto_config) { + bool success = true; + std::ifstream fin(config_file); + + if (!fin || fin.fail()) { + LOG(ERROR) << "open prototxt config failed: " << config_file; + success = false; + } else { + fin.seekg(0, std::ios::end); + size_t file_size = fin.tellg(); + fin.seekg(0, std::ios::beg); + + char* file_content_buffer = new char[file_size]; + fin.read(file_content_buffer, file_size); + + std::string proto_str(file_content_buffer, file_size); + + if (!google::protobuf::TextFormat::ParseFromString(proto_str, &proto_config)) { + LOG(ERROR) << "Failed to load config: " << config_file; + success = false; + } + + delete[] file_content_buffer; + fin.close(); + } + + return success; +} + +template +bool save_proto_conf(const std::string& config_file, T& proto_config) { + bool success = true; + std::ofstream ofs(config_file, std::ofstream::out); + + if (!ofs || ofs.fail()) { + LOG(ERROR) << "open prototxt config failed: " << config_file; + success = false; + } else { + std::string config_str; + success = google::protobuf::TextFormat::PrintToString(proto_config, &config_str); + + if (!success) { + return success; + } + + ofs << config_str; + } + + return success; +} + +std::vector list_all_model_dirs(std::string path); + +} + +#endif diff --git a/evo_kit/core/proto/evo_kit/evo_kit.proto b/evo_kit/core/proto/evo_kit/evo_kit.proto new file mode 100644 index 0000000000000000000000000000000000000000..fc4f68d9247e63b1d98b35ebd338052ffb7eb9a6 --- /dev/null +++ b/evo_kit/core/proto/evo_kit/evo_kit.proto @@ -0,0 +1,57 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +syntax = "proto2"; + +package evo_kit; + +message EvoKitConfig { + //sampling configuration + optional int32 seed = 1 [default = 18]; + optional int32 buffer_size = 2 [default = 100000]; + optional GaussianSamplingConfig gaussian_sampling = 3; + // Optimizer Configuration + optional OptimizerConfig optimizer = 4; + // AsyncESAgent Configuration + optional AsyncESConfig async_es = 5; +} + +message GaussianSamplingConfig { + optional float std = 1 [default = 1.0]; + optional bool cached = 2 [default = false]; + optional int32 cache_size = 3 [default = 100000]; +} + +message OptimizerConfig{ + optional string type = 1 [default = "SGD"]; + optional float base_lr = 2 [default = 1e-3]; // The base learning rate. + optional float momentum = 3 [default = 0.9]; // The momentum value for SGD. 
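+  // Note: momentum is only used when type = "SGD"; the Adam fields below
+  // are only used when type = "Adam" (see optimizer_factory.cc).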
+ + // ------------Adam Optimizer--------- + optional float beta1 = 4 [default = 0.9]; + optional float beta2 = 5 [default = 0.999]; + optional float epsilon = 6 [default = 1e-8]; +} + +message SamplingInfo{ + repeated int32 key = 1; + optional int32 model_iter_id = 2; +} + +message AsyncESConfig{ + optional string model_warehouse = 1 [default = "./model_warehouse"]; + repeated string model_md5 = 2; + optional int32 max_to_keep = 3 [default = 5]; + optional int32 model_iter_id = 4 [default = 0]; +} diff --git a/evo_kit/core/src/adam_optimizer.cc b/evo_kit/core/src/adam_optimizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..44f36e4d1d3e01ae2cceeba16d95d7aaa24a2c09 --- /dev/null +++ b/evo_kit/core/src/adam_optimizer.cc @@ -0,0 +1,55 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/adam_optimizer.h" + +namespace evo_kit { + +AdamOptimizer::~AdamOptimizer() { + for (auto iter = _momentum.begin(); iter != _momentum.end(); iter++) { + delete[] iter->second; + } + + for (auto iter = _velocity.begin(); iter != _velocity.end(); iter++) { + delete[] iter->second; + } + + _momentum.clear(); + _velocity.clear(); +} + +void AdamOptimizer::compute_step(float* gradient, int size, std::string param_name = "") { + if (_momentum.count(param_name) == 0) { + _momentum[param_name] = new float [size]; + memset(_momentum[param_name], 0, size * sizeof(float)); + } + + if (_velocity.count(param_name) == 0) { + _velocity[param_name] = new float [size]; + memset(_velocity[param_name], 0, size * sizeof(float)); + } + + int true_update_times = int(_update_times / _velocity.size()); + float alpha = std::sqrt(1 - std::pow(_beta2, _update_times)) / (1 - std::pow(_beta1, + _update_times)); + + for (int i = 0; i < size; ++i) { + _momentum[param_name][i] = _beta1 * _momentum[param_name][i] + (1 - _beta1) * gradient[i]; + _velocity[param_name][i] = _beta2 * _velocity[param_name][i] + (1 - _beta2) * gradient[i] * + gradient[i]; + gradient[i] = alpha * _momentum[param_name][i] / (std::sqrt(_velocity[param_name][i]) + _epsilon); + } +} + +}//namespace diff --git a/evo_kit/core/src/cached_gaussian_sampling.cc b/evo_kit/core/src/cached_gaussian_sampling.cc new file mode 100644 index 0000000000000000000000000000000000000000..844eca20e2935c4b5e7ac39d5fa07df1c2b13913 --- /dev/null +++ b/evo_kit/core/src/cached_gaussian_sampling.cc @@ -0,0 +1,121 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/cached_gaussian_sampling.h" + +namespace evo_kit { + +CachedGaussianSampling::CachedGaussianSampling() {} + +CachedGaussianSampling::~CachedGaussianSampling() { + delete[] _noise_cache; +} + +bool CachedGaussianSampling::load_config(const EvoKitConfig& config) { + bool success = true; + _std = config.gaussian_sampling().std(); + success = set_seed(config.seed()); + CHECK(success) << "[EvoKit] Fail to set seed while load config."; + _cache_size = config.gaussian_sampling().cache_size(); + _noise_cache = new float [_cache_size]; + memset(_noise_cache, 0, _cache_size * sizeof(float)); + success = _create_noise_cache(); + CHECK(success) << "[EvoKit] Fail to create noise_cache while load config."; + return success; +} + +bool CachedGaussianSampling::sampling(int* key, float* noise, int64_t size) { + bool success = true; + + if (_noise_cache == nullptr) { + LOG(ERROR) << "[EvoKit] Please use load_config() first."; + success = false; + return success; + } + + if (noise == nullptr) { + LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr."; + success = false; + return success; + } + + if ((size >= _cache_size) || (size < 0)) { + LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size << + "), cache_size: " << _cache_size; + success = false; + return success; + } + + int rand_key = rand(); + std::default_random_engine generator(rand_key); + std::uniform_int_distribution uniform(0, _cache_size - size); + int index = uniform(generator); + *key = index; + + for (int64_t i = 0; i < size; ++i) { + *(noise + i) = *(_noise_cache + index + i); + } + + return success; +} + +bool CachedGaussianSampling::resampling(int key, float* noise, int64_t size) { + bool success = true; + + if (_noise_cache == nullptr) { + LOG(ERROR) << "[EvoKit] Please use load_config() first."; + success = false; + return success; + } + + if (noise == nullptr) { + LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr."; + success = false; + return success; + } + + if ((size >= _cache_size) || (size < 0)) { + LOG(ERROR) << "[EvoKit] Input size " << size << " is out of bounds [0, " << _cache_size << + "), cache_size: " << _cache_size; + success = false; + return success; + } + + if ((key > _cache_size - size) || (key < 0)) { + LOG(ERROR) << "[EvoKit] Resampling key " << key << " is out of bounds [0, " + << _cache_size - size << + "], cache_size: " << _cache_size << ", size: " << size; + success = false; + return success; + } + + for (int64_t i = 0; i < size; ++i) { + *(noise + i) = *(_noise_cache + key + i); + } + + return success; +} + +bool CachedGaussianSampling::_create_noise_cache() { + std::default_random_engine generator(_seed); + std::normal_distribution norm; + + for (int64_t i = 0; i < _cache_size; ++i) { + *(_noise_cache + i) = norm(generator) * _std; + } + + return true; +} + +} diff --git a/evo_kit/core/src/gaussian_sampling.cc b/evo_kit/core/src/gaussian_sampling.cc new file mode 100644 index 0000000000000000000000000000000000000000..776c2c4da940fafd23e073dd97002876ddfc8673 --- /dev/null +++ b/evo_kit/core/src/gaussian_sampling.cc @@ -0,0 +1,65 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/gaussian_sampling.h" + +namespace evo_kit { + +bool GaussianSampling::load_config(const EvoKitConfig& config) { + bool success = true; + _std = config.gaussian_sampling().std(); + success = set_seed(config.seed()); + return success; +} + +bool GaussianSampling::sampling(int* key, float* noise, int64_t size) { + bool success = true; + + if (noise == nullptr) { + LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr."; + success = false; + return success; + } + + int rand_key = rand(); + *key = rand_key; + std::default_random_engine generator(rand_key); + std::normal_distribution norm; + + for (int64_t i = 0; i < size; ++i) { + *(noise + i) = norm(generator) * _std; + } + + return success; +} + +bool GaussianSampling::resampling(int key, float* noise, int64_t size) { + bool success = true; + + if (noise == nullptr) { + LOG(ERROR) << "[EvoKit] Input noise array cannot be nullptr."; + success = false; + } else { + std::default_random_engine generator(key); + std::normal_distribution norm; + + for (int64_t i = 0; i < size; ++i) { + *(noise + i) = norm(generator) * _std; + } + } + + return success; +} + +} diff --git a/evo_kit/core/src/optimizer_factory.cc b/evo_kit/core/src/optimizer_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..6137d623fc1b023cc8d8edc8c988aced66a482c0 --- /dev/null +++ b/evo_kit/core/src/optimizer_factory.cc @@ -0,0 +1,39 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/optimizer_factory.h" + +namespace evo_kit { + +std::shared_ptr create_optimizer(const OptimizerConfig& optimizer_config) { + std::shared_ptr optimizer; + std::string opt_type = optimizer_config.type(); + std::transform(opt_type.begin(), opt_type.end(), opt_type.begin(), ::tolower); + + if (opt_type == "sgd") { + optimizer = std::make_shared(optimizer_config.base_lr(), \ + optimizer_config.momentum()); + } else if (opt_type == "adam") { + optimizer = std::make_shared(optimizer_config.base_lr(), \ + optimizer_config.beta1(), \ + optimizer_config.beta2(), \ + optimizer_config.epsilon()); + } else { + LOG(ERROR) << "type of OptimizerConfig must be SGD or Adam."; // NotImplementedError + } + + return optimizer; +} + +}//namespace diff --git a/evo_kit/core/src/sampling_factory.cc b/evo_kit/core/src/sampling_factory.cc new file mode 100644 index 0000000000000000000000000000000000000000..8a0b8109a61a6ecaa80d82b8a8042c89574ea5a6 --- /dev/null +++ b/evo_kit/core/src/sampling_factory.cc @@ -0,0 +1,41 @@ +// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/sampling_factory.h" + +namespace evo_kit { + + +std::shared_ptr create_sampling_method(const EvoKitConfig& config) { + std::shared_ptr sampling_method; + bool cached = config.gaussian_sampling().cached(); + + if (cached) { + sampling_method = std::make_shared(); + } else { + sampling_method = std::make_shared(); + } + + bool success = sampling_method->load_config(config); + + if (success) { + return sampling_method; + } else { + LOG(ERROR) << "[EvoKit] Fail to create sampling_method"; + return nullptr; + } + +} + +}//namespace diff --git a/evo_kit/core/src/sgd_optimizer.cc b/evo_kit/core/src/sgd_optimizer.cc new file mode 100644 index 0000000000000000000000000000000000000000..0b3174bffa3d7b3f3b353b18aab8eb428ba70437 --- /dev/null +++ b/evo_kit/core/src/sgd_optimizer.cc @@ -0,0 +1,40 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/sgd_optimizer.h" + +namespace evo_kit { + +SGDOptimizer::~SGDOptimizer() { + for (auto iter = _velocity.begin(); iter != _velocity.end(); iter++) { + delete[] iter->second; + } + + _velocity.clear(); +} + +void SGDOptimizer::compute_step(float* gradient, int size, std::string param_name = "") { + if (_velocity.count(param_name) == 0) { + _velocity[param_name] = new float [size]; + memset(_velocity[param_name], 0, size * sizeof(float)); + } + + for (int i = 0; i < size; ++i) { + _velocity[param_name][i] = _momentum * _velocity[param_name][i] + (1 - _momentum) * gradient[i]; + gradient[i] = _velocity[param_name][i]; + } +} + + +}//namespace diff --git a/evo_kit/core/src/utils.cc b/evo_kit/core/src/utils.cc new file mode 100644 index 0000000000000000000000000000000000000000..e47b7d097f0f164a83fb96f6ae538e5a5f2370ea --- /dev/null +++ b/evo_kit/core/src/utils.cc @@ -0,0 +1,81 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/utils.h" +#include + +namespace evo_kit { + +bool compute_centered_ranks(std::vector& reward) { + std::vector> reward_index; + float gap = 1.0 / (reward.size() - 1); + float normlized_rank = -0.5; + int id = 0; + + for (auto& rew : reward) { + reward_index.push_back(std::make_pair(rew, id)); + ++id; + } + + std::sort(reward_index.begin(), reward_index.end()); + + for (int i = 0; i < reward.size(); ++i) { + id = reward_index[i].second; + reward[id] = normlized_rank; + normlized_rank += gap; + } + + return true; +} + +std::vector list_all_model_dirs(std::string path) { + std::vector model_dirs; + DIR* dpdf; + struct dirent* epdf; + dpdf = opendir(path.data()); + + if (dpdf != NULL) { + while (epdf = readdir(dpdf)) { + std::string dir(epdf->d_name); + + if (dir.find("model_iter_id") != std::string::npos) { + model_dirs.push_back(path + "/" + dir); + } + } + } + + closedir(dpdf); + return model_dirs; +} + +std::string read_file(const std::string& filename) { + std::ifstream ifile(filename.c_str()); + + if (!ifile.is_open()) { + LOG(ERROR) << "Open file: [" << filename << "] failed."; + return ""; + } + + std::ostringstream buf; + char ch = '\n'; + + while (buf && ifile.get(ch)) { + buf.put(ch); + } + + ifile.close(); + return buf.str(); +} + +}//namespace diff --git a/evo_kit/demo/cartpole_config.prototxt b/evo_kit/demo/cartpole_config.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..2707cb60171a47675f1f5a0625de487ad04904f5 --- /dev/null +++ b/evo_kit/demo/cartpole_config.prototxt @@ -0,0 +1,17 @@ +seed: 1024 +gaussian_sampling { + std: 0.5 + cached: true + cache_size: 100000 +} +optimizer { + type: "Adam" + base_lr: 0.05 + momentum: 0.9 + beta1: 0.9 + beta2: 0.999 + epsilon: 1e-08 +} +async_es { + model_iter_id: 0 +} diff --git a/evo_kit/demo/paddle/cartpole_async_solver.cc b/evo_kit/demo/paddle/cartpole_async_solver.cc new file mode 100644 index 0000000000000000000000000000000000000000..22d2507de2ea7f6684e8d835f78f88efd8fc5eb2 --- /dev/null +++ b/evo_kit/demo/paddle/cartpole_async_solver.cc @@ -0,0 +1,134 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "evo_kit/async_es_agent.h" +#include "cartpole.h" +#include "paddle_api.h" + +using namespace evo_kit; +using namespace paddle::lite_api; + +const int ITER = 10; + +// Use PaddlePredictor of CartPole model to predict the action. 
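+// The observation is copied into a 1x4 input tensor, the network returns
+// probabilities for the two discrete CartPole actions, and the caller
+// picks the greedy action with arg_max().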
+std::vector forward(std::shared_ptr predictor, const float* obs) { + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize({1, 4}); + input_tensor->CopyFromCpu(obs); + + predictor->Run(); + + std::vector probs(2, 0.0); + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + output_tensor->CopyToCpu(probs.data()); + return probs; +} + +int arg_max(const std::vector& vec) { + return static_cast(std::distance(vec.begin(), std::max_element(vec.begin(), vec.end()))); +} + + +float evaluate(CartPole& env, std::shared_ptr agent) { + float total_reward = 0.0; + env.reset(); + const float* obs = env.getState(); + + std::shared_ptr paddle_predictor; + paddle_predictor = agent->get_predictor(); + + while (true) { + std::vector probs = forward(paddle_predictor, obs); + int act = arg_max(probs); + env.step(act); + float reward = env.getReward(); + bool done = env.isDone(); + total_reward += reward; + + if (done) { + break; + } + + obs = env.getState(); + } + + return total_reward; +} + + +int main(int argc, char* argv[]) { + std::vector envs; + + for (int i = 0; i < ITER; ++i) { + envs.push_back(CartPole()); + } + + std::shared_ptr agent = + std::make_shared("./demo/paddle/cartpole_init_model", + "./demo/cartpole_config.prototxt"); + + // Clone agents to sample (explore). + std::vector< std::shared_ptr > sampling_agents; + + for (int i = 0; i < ITER; ++i) { + sampling_agents.push_back(agent->clone()); + } + + std::vector noisy_info; + std::vector last_noisy_info; + std::vector noisy_rewards(ITER, 0.0f); + std::vector last_noisy_rewards; + noisy_info.resize(ITER); + + omp_set_num_threads(10); + + for (int epoch = 0; epoch < 100; ++epoch) { + last_noisy_info.clear(); + last_noisy_rewards.clear(); + + if (epoch != 0) { + for (int i = 0; i < ITER; ++i) { + last_noisy_info.push_back(noisy_info[i]); + last_noisy_rewards.push_back(noisy_rewards[i]); + } + } + + #pragma omp parallel for schedule(dynamic, 1) + + for (int i = 0; i < ITER; ++i) { + std::shared_ptr sampling_agent = sampling_agents[i]; + SamplingInfo info; + bool success = sampling_agent->add_noise(info); + float reward = evaluate(envs[i], sampling_agent); + + noisy_info[i] = info; + noisy_rewards[i] = reward; + } + + for (int i = 0; i < ITER; ++i) { + last_noisy_info.push_back(noisy_info[i]); + last_noisy_rewards.push_back(noisy_rewards[i]); + } + + // NOTE: all parameters of sampling_agents will be updated + bool success = agent->update(last_noisy_info, last_noisy_rewards); + + int reward = evaluate(envs[0], agent); + LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; + } +} diff --git a/evo_kit/demo/paddle/cartpole_init_model.zip b/evo_kit/demo/paddle/cartpole_init_model.zip new file mode 100644 index 0000000000000000000000000000000000000000..16a7720959786471f8f500e7aa031615d53a1928 Binary files /dev/null and b/evo_kit/demo/paddle/cartpole_init_model.zip differ diff --git a/evo_kit/demo/paddle/cartpole_solver_parallel.cc b/evo_kit/demo/paddle/cartpole_solver_parallel.cc new file mode 100644 index 0000000000000000000000000000000000000000..33aa89990f23c744f494b9d9d75002103a0bfbcc --- /dev/null +++ b/evo_kit/demo/paddle/cartpole_solver_parallel.cc @@ -0,0 +1,115 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. 
+// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include +#include +#include +#include "cartpole.h" +#include "evo_kit/es_agent.h" +#include "paddle_api.h" + +using namespace evo_kit; +using namespace paddle::lite_api; + +const int ITER = 10; + +// Use PaddlePredictor of CartPole model to predict the action. +std::vector forward(std::shared_ptr predictor, const float* obs) { + std::unique_ptr input_tensor(std::move(predictor->GetInput(0))); + input_tensor->Resize({1, 4}); + input_tensor->CopyFromCpu(obs); + + predictor->Run(); + + std::vector probs(2, 0.0); + std::unique_ptr output_tensor( + std::move(predictor->GetOutput(0))); + output_tensor->CopyToCpu(probs.data()); + return probs; +} + +int arg_max(const std::vector& vec) { + return static_cast(std::distance(vec.begin(), std::max_element(vec.begin(), vec.end()))); +} + + +float evaluate(CartPole& env, std::shared_ptr agent) { + float total_reward = 0.0; + env.reset(); + const float* obs = env.getState(); + + std::shared_ptr paddle_predictor; + paddle_predictor = agent->get_predictor(); + + while (true) { + std::vector probs = forward(paddle_predictor, obs); + int act = arg_max(probs); + env.step(act); + float reward = env.getReward(); + bool done = env.isDone(); + total_reward += reward; + + if (done) { + break; + } + + obs = env.getState(); + } + + return total_reward; +} + + +int main(int argc, char* argv[]) { + std::vector envs; + + for (int i = 0; i < ITER; ++i) { + envs.push_back(CartPole()); + } + + std::shared_ptr agent = std::make_shared("./demo/paddle/cartpole_init_model", + "./demo/cartpole_config.prototxt"); + + // Clone agents to sample (explore). + std::vector< std::shared_ptr > sampling_agents; + + for (int i = 0; i < ITER; ++i) { + sampling_agents.push_back(agent->clone()); + } + + std::vector noisy_keys; + std::vector noisy_rewards(ITER, 0.0f); + noisy_keys.resize(ITER); + + omp_set_num_threads(10); + + for (int epoch = 0; epoch < 100; ++epoch) { + #pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < ITER; ++i) { + std::shared_ptr sampling_agent = sampling_agents[i]; + SamplingInfo key; + bool success = sampling_agent->add_noise(key); + float reward = evaluate(envs[i], sampling_agent); + + noisy_keys[i] = key; + noisy_rewards[i] = reward; + } + + // NOTE: all parameters of sampling_agents will be updated + bool success = agent->update(noisy_keys, noisy_rewards); + + int reward = evaluate(envs[0], agent); + LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; + } +} diff --git a/evo_kit/demo/paddle/gen_cartpole_init_model.py b/evo_kit/demo/paddle/gen_cartpole_init_model.py new file mode 100644 index 0000000000000000000000000000000000000000..62228b4f0cf953ffa3c1d11ae7bfd949c3e93925 --- /dev/null +++ b/evo_kit/demo/paddle/gen_cartpole_init_model.py @@ -0,0 +1,41 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from paddle import fluid + + +def net(obs, act_dim): + hid1_size = act_dim * 10 + hid1 = fluid.layers.fc(obs, size=hid1_size) + prob = fluid.layers.fc(hid1, size=act_dim, act='softmax') + return prob + + +if __name__ == '__main__': + obs_dim = 4 + act_dim = 2 + + obs = fluid.layers.data(name="obs", shape=[obs_dim], dtype='float32') + + prob = net(obs, act_dim) + + exe = fluid.Executor(fluid.CPUPlace()) + exe.run(fluid.default_startup_program()) + fluid.io.save_inference_model( + dirname='cartpole_init_model', + feeded_var_names=['obs'], + target_vars=[prob], + params_filename='params', + model_filename='model', + executor=exe) diff --git a/evo_kit/demo/torch/CMakeLists.txt b/evo_kit/demo/torch/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..9ece945581c57a4c0c05fa38d007b00b7266392e --- /dev/null +++ b/evo_kit/demo/torch/CMakeLists.txt @@ -0,0 +1,32 @@ +cmake_minimum_required (VERSION 2.6) +project (EvoKit_demo) +set(TARGET parallel_main) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +find_package(OpenMP) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +endif() + +list(APPEND CMAKE_PREFIX_PATH "./libtorch") +find_package(Torch REQUIRED ON) +set(demo "${PROJECT_SOURCE_DIR}/cartpole_solver_parallel.cc") + + +########## main ########## +add_executable(${TARGET} ${demo} ${framework_src}) +target_link_libraries(${TARGET} gflags protobuf pthread glog) + +########## Torch libraries ########## +target_link_libraries(${TARGET} "${TORCH_LIBRARIES}") + + +########## EvoKit libraries ########## +list(APPEND CMAKE_PREFIX_PATH "./libevokit/cmake/Torch") +find_package(EvoKit) +target_link_libraries(${TARGET} "${EVOKIT_LIBRARY}") diff --git a/evo_kit/demo/torch/cartpole_solver_parallel.cc b/evo_kit/demo/torch/cartpole_solver_parallel.cc new file mode 100644 index 0000000000000000000000000000000000000000..6c8f4c821c4b92e69b4755a1126296853a731102 --- /dev/null +++ b/evo_kit/demo/torch/cartpole_solver_parallel.cc @@ -0,0 +1,85 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
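The script above saves the initial CartPole policy as two files, "model" and "params", inside cartpole_init_model/. A minimal sketch of loading that directory with PaddleLite's CxxConfig, in the same way the Paddle agent later in this diff builds its predictor; the x86/host places mirror the agent code, and load_cartpole_model is an illustrative helper name, not part of the library:

#include <memory>
#include <string>
#include "paddle_api.h"

using namespace paddle::lite_api;

// Build a PaddleLite predictor from the files written by gen_cartpole_init_model.py
// (model_filename='model', params_filename='params').
std::shared_ptr<PaddlePredictor> load_cartpole_model(const std::string& model_dir) {
    CxxConfig config;
    config.set_model_file(model_dir + "/model");
    config.set_param_file(model_dir + "/params");
    config.set_valid_places({
        Place{TARGET(kX86), PRECISION(kFloat)},
        Place{TARGET(kHost), PRECISION(kFloat)}
    });
    return CreatePaddlePredictor<CxxConfig>(config);
}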
+ +#include +#include +#include +#include +#include +#include "evo_kit/gaussian_sampling.h" +#include "evo_kit/es_agent.h" +#include "cartpole.h" +#include "model.h" + +using namespace evo_kit; +const int ITER = 10; + +float evaluate(CartPole& env, std::shared_ptr> agent) { + float total_reward = 0.0; + env.reset(); + const float* obs = env.getState(); + while (true) { + torch::Tensor obs_tensor = torch::tensor({obs[0], obs[1], obs[2], obs[3]}); + torch::Tensor action = agent->predict(obs_tensor); + int act = std::get<1>(action.max(-1)).item(); + env.step(act); + float reward = env.getReward(); + auto done = env.isDone(); + total_reward += reward; + if (done) break; + obs = env.getState(); + } + return total_reward; +} + +int main(int argc, char* argv[]) { + //google::InitGoogleLogging(argv[0]); + std::vector envs; + for (int i = 0; i < ITER; ++i) { + envs.push_back(CartPole()); + } + + auto model = std::make_shared(4, 2); + std::shared_ptr> agent = std::make_shared>(model, + "./cartpole_config.prototxt"); + + // Clone agents to sample (explore). + std::vector>> sampling_agents; + for (int i = 0; i < ITER; ++i) { + sampling_agents.push_back(agent->clone()); + } + + std::vector noisy_info; + std::vector noisy_rewards(ITER, 0.0f); + noisy_info.resize(ITER); + + for (int epoch = 0; epoch < 100; ++epoch) { +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < ITER; ++i) { + auto sampling_agent = sampling_agents[i]; + SamplingInfo info; + bool success = sampling_agent->add_noise(info); + float reward = evaluate(envs[i], sampling_agent); + noisy_info[i] = info; + noisy_rewards[i] = reward; + } + + // Will also update parameters of sampling_agents + bool success = agent->update(noisy_info, noisy_rewards); + + // Use original agent to evalute (without noise). + int reward = evaluate(envs[0], agent); + LOG(INFO) << "Epoch:" << epoch << " Reward: " << reward; + } +} diff --git a/evo_kit/demo/torch/model.h b/evo_kit/demo/torch/model.h new file mode 100644 index 0000000000000000000000000000000000000000..27373ceffd66bffd9d8a047a2e4fc5fe3a14005a --- /dev/null +++ b/evo_kit/demo/torch/model.h @@ -0,0 +1,61 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
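In the torch demo above, choosing an action is one forward pass through the (possibly noisy) sampling model followed by an argmax over the action probabilities. A small sketch of that step with the template arguments written out; the agent is assumed to be an evo_kit::ESAgent<Model> as in the demo, and select_action is an illustrative name:

#include <memory>
#include <torch/torch.h>

// Pick a discrete CartPole action from a 4-dim observation.
template <class AgentT>
int select_action(const float* obs, std::shared_ptr<AgentT> agent) {
    torch::Tensor obs_tensor = torch::tensor({obs[0], obs[1], obs[2], obs[3]});
    torch::Tensor probs = agent->predict(obs_tensor);  // shape [1, act_dim]
    // max(-1) returns (values, indices); the action is the index of the largest probability.
    return std::get<1>(probs.max(-1)).item<int>();
}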
+ +#ifndef _MODEL_H +#define _MODEL_H + +#include + +struct Model : public torch::nn::Module{ + + Model() = delete; + + Model(const int obs_dim, const int act_dim) { + + _obs_dim = obs_dim; + _act_dim = act_dim; + int hid1_size = act_dim * 10; + fc1 = register_module("fc1", torch::nn::Linear(obs_dim, hid1_size)); + fc2 = register_module("fc2", torch::nn::Linear(hid1_size, act_dim)); + } + + torch::Tensor forward(torch::Tensor x) { + x = x.reshape({-1, _obs_dim}); + x = torch::tanh(fc1->forward(x)); + x = torch::softmax(fc2->forward(x), 1); + return x; + } + + std::shared_ptr clone() { + std::shared_ptr model = std::make_shared(_obs_dim, _act_dim); + std::vector parameters1 = parameters(); + std::vector parameters2 = model->parameters(); + for (int i = 0; i < parameters1.size(); ++i) { + torch::Tensor src = parameters1[i].view({-1}); + torch::Tensor des = parameters2[i].view({-1}); + auto src_a = src.accessor(); + auto des_a = des.accessor(); + for (int j = 0; j < src.size(0); ++j) { + des_a[j] = src_a[j]; + } + } + return model; + } + + int _act_dim; + int _obs_dim; + torch::nn::Linear fc1{nullptr}, fc2{nullptr}; +}; + +#endif diff --git a/evo_kit/paddle/include/evo_kit/async_es_agent.h b/evo_kit/paddle/include/evo_kit/async_es_agent.h new file mode 100644 index 0000000000000000000000000000000000000000..a8558820bb86f7d4a6f084aea456e2c9a79ed762 --- /dev/null +++ b/evo_kit/paddle/include/evo_kit/async_es_agent.h @@ -0,0 +1,101 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_ASYNC_ES_AGENT_H +#define EVO_KIT_ASYNC_ES_AGENT_H + +#include +#include +#include "evo_kit/es_agent.h" + +namespace evo_kit { +/* EvoKit agent with PaddleLite as backend. This agent supports asynchronous update. + * Users mainly focus on the following functions: + * 1. clone: clone an agent for multi-thread evaluation + * 2. add_noise: add noise into parameters. + * 3. update: update parameters given data collected during evaluation. + */ +class AsyncESAgent: public ESAgent { +public: + AsyncESAgent() {} + + ~AsyncESAgent(); + + /** + * @args: + * predictor: predictor created by users for prediction. + * config_path: the path of configuration file. + * Note that AsyncESAgent will update the configuration file after calling the update function. + * Please use the up-to-date configuration. + */ + AsyncESAgent( + const std::string& model_dir, + const std::string& config_path); + + /** + * @brief: Clone an agent for sampling. + */ + std::shared_ptr clone(); + + /** + * @brief: update parameters given data collected during evaluation. + * @args: + * noisy_info: sampling information returned by add_noise function. + * noisy_reward: evaluation rewards. + */ + bool update( + std::vector& noisy_info, + std::vector& noisy_rewards); + +private: + std::unordered_map> _previous_predictors; + std::unordered_map _param_delta; + std::string _config_path; + + /** + * @brief: parse model_iter_id given a string of model directory. 
+ * @return: an integer indicating the model_iter_id + */ + int _parse_model_iter_id(const std::string&); + + /** + * @brief: compute the distance between current parameter and previous models. + */ + bool _compute_model_diff(); + + /** + * @brief: remove expired models to avoid overuse of disk space. + * @args: + * max_to_keep: the maximum number of models to keep locally. + */ + bool _remove_expired_model(int max_to_keep); + + /** + * @brief: save up-to-date parameters to the disk. + */ + bool _save(); + + /** + * @brief: load all models in the model warehouse. + */ + bool _load(); + + /** + * @brief: load a model given the model directory. + */ + std::shared_ptr _load_previous_model(std::string model_dir); +}; + +} // namespace +#endif diff --git a/evo_kit/paddle/include/evo_kit/es_agent.h b/evo_kit/paddle/include/evo_kit/es_agent.h new file mode 100644 index 0000000000000000000000000000000000000000..9a256712a3d99be12ff4a9f409298602192ec21e --- /dev/null +++ b/evo_kit/paddle/include/evo_kit/es_agent.h @@ -0,0 +1,103 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef EVO_KIT_DEEPES_PADDLE_ES_AGENT_H_ +#define EVO_KIT_DEEPES_PADDLE_ES_AGENT_H_ + +#include +#include "evo_kit/evo_kit.pb.h" +#include "evo_kit/optimizer_factory.h" +#include "evo_kit/sampling_factory.h" +#include "evo_kit/utils.h" +#include "paddle_api.h" + +namespace evo_kit { + +typedef paddle::lite_api::PaddlePredictor PaddlePredictor; +typedef paddle::lite_api::CxxConfig CxxConfig; +typedef paddle::lite_api::Tensor Tensor; + +int64_t ShapeProduction(const paddle::lite_api::shape_t& shape); + +/** + * @brief EvoKit agent with PaddleLite as backend. + * Users mainly focus on the following functions: + * 1. clone: clone an agent for multi-thread evaluation + * 2. add_noise: add noise into parameters. + * 3. update: update parameters given data collected during evaluation. + * + */ +class ESAgent { +public: + ESAgent() {} + + ~ESAgent(); + + ESAgent(const std::string& model_dir, const std::string& config_path); + + /** + * @breif Clone a sampling agent + * + * Only cloned ESAgent can call `add_noise` function. + * Each cloned ESAgent will have a copy of original parameters. + * (support sampling in multi-thread way) + */ + std::shared_ptr clone(); + + /** + * @brief Update parameters of predictor based on ES algorithm. + * + * Only not cloned ESAgent can call `update` function. + * Parameters of cloned agents will also be updated. + */ + bool update( + std::vector& noisy_info, + std::vector& noisy_rewards); + + // copied parameters = original parameters + noise + bool add_noise(SamplingInfo& sampling_info); + + /** + * @brief Get paddle predict + * + * if _is_sampling_agent is true, will return predictor with added noise; + * if _is_sampling_agent is false, will return predictor without added noise. 
+ */ + std::shared_ptr get_predictor(); + + // get param size of model + int64_t param_size() { + return _param_size; + } + +protected: + int64_t _calculate_param_size(); + + std::shared_ptr _predictor; + std::shared_ptr _sampling_predictor; + std::shared_ptr _sampling_method; + std::shared_ptr _optimizer; + std::shared_ptr _config; + std::shared_ptr _cxx_config; + std::vector _param_names; + // malloc memory of noise and neg_gradients in advance. + float* _noise; + float* _neg_gradients; + int64_t _param_size; + bool _is_sampling_agent; +}; + +} // namespace + +#endif diff --git a/evo_kit/paddle/src/async_es_agent.cc b/evo_kit/paddle/src/async_es_agent.cc new file mode 100644 index 0000000000000000000000000000000000000000..0bff6e42907f6f83f53ea147051d34d3b4851141 --- /dev/null +++ b/evo_kit/paddle/src/async_es_agent.cc @@ -0,0 +1,290 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "evo_kit/async_es_agent.h" + +namespace evo_kit { + +AsyncESAgent::AsyncESAgent( + const std::string& model_dir, + const std::string& config_path): ESAgent(model_dir, config_path) { + _config_path = config_path; +} +AsyncESAgent::~AsyncESAgent() { + for (const auto kv : _param_delta) { + float* delta = kv.second; + delete[] delta; + } +} + +bool AsyncESAgent::_save() { + using namespace paddle::lite_api; + bool success = true; + + if (_is_sampling_agent) { + LOG(ERROR) << + "[EvoKit] Cloned AsyncESAgent cannot call `save`.Please use original AsyncESAgent."; + success = false; + return success; + } + + int model_iter_id = _config->async_es().model_iter_id() + 1; + //current time + time_t rawtime; + struct tm* timeinfo; + char buffer[80]; + + time(&rawtime); + timeinfo = localtime(&rawtime); + + std::string model_name = "model_iter_id-" + std::to_string(model_iter_id); + std::string model_path = _config->async_es().model_warehouse() + "/" + model_name; + LOG(INFO) << "[save]model_path: " << model_path; + _predictor->SaveOptimizedModel(model_path, LiteModelType::kProtobuf); + // save config + auto async_es = _config->mutable_async_es(); + async_es->set_model_iter_id(model_iter_id); + success = save_proto_conf(_config_path, *_config); + + if (!success) { + LOG(ERROR) << "[]unable to save config for AsyncESAgent"; + success = false; + return success; + } + + int max_to_keep = _config->async_es().max_to_keep(); + success = _remove_expired_model(max_to_keep); + return success; +} + +bool AsyncESAgent::_remove_expired_model(int max_to_keep) { + bool success = true; + std::string model_path = _config->async_es().model_warehouse(); + std::vector model_dirs = list_all_model_dirs(model_path); + int model_iter_id = _config->async_es().model_iter_id() + 1; + + for (const auto& dir : model_dirs) { + int dir_model_iter_id = _parse_model_iter_id(dir); + + if (model_iter_id - dir_model_iter_id >= max_to_keep) { + std::string rm_command = std::string("rm -rf ") + dir; + int ret = system(rm_command.c_str()); + + if (ret == 0) { + LOG(INFO) << 
"[EvoKit] remove expired Model: " << dir; + } else { + LOG(ERROR) << "[EvoKit] fail to remove expired Model: " << dir; + success = false; + return success; + } + } + } + + return success; +} + +bool AsyncESAgent::_compute_model_diff() { + bool success = true; + + for (const auto& kv : _previous_predictors) { + int model_iter_id = kv.first; + std::shared_ptr old_predictor = kv.second; + float* diff = new float[_param_size]; + memset(diff, 0, _param_size * sizeof(float)); + int offset = 0; + + for (const std::string& param_name : _param_names) { + auto des_tensor = old_predictor->GetTensor(param_name); + auto src_tensor = _predictor->GetTensor(param_name); + const float* des_data = des_tensor->data(); + const float* src_data = src_tensor->data(); + int64_t tensor_size = ShapeProduction(src_tensor->shape()); + + for (int i = 0; i < tensor_size; ++i) { + diff[i + offset] = des_data[i] - src_data[i]; + } + + offset += tensor_size; + } + + _param_delta[model_iter_id] = diff; + } + + return success; +} + +bool AsyncESAgent::_load() { + bool success = true; + std::string model_path = _config->async_es().model_warehouse(); + std::vector model_dirs = list_all_model_dirs(model_path); + + if (model_dirs.size() == 0) { + int model_iter_id = _config->async_es().model_iter_id(); + success = model_iter_id == 0 ? true : false; + + if (!success) { + LOG(WARNING) << "[EvoKit] current_model_iter_id is nonzero, but no model is \ + found at the dir: " << model_path; + } + + return success; + } + + for (auto& dir : model_dirs) { + int model_iter_id = _parse_model_iter_id(dir); + + if (model_iter_id == -1) { + LOG(WARNING) << "[EvoKit] fail to parse model_iter_id: " << dir; + success = false; + return success; + } + + std::shared_ptr predictor = _load_previous_model(dir); + + if (predictor == nullptr) { + success = false; + LOG(WARNING) << "[EvoKit] fail to load model: " << dir; + return success; + } + + _previous_predictors[model_iter_id] = predictor; + } + + success = _compute_model_diff(); + return success; +} + +std::shared_ptr AsyncESAgent::_load_previous_model(std::string model_dir) { + using namespace paddle::lite_api; + // 1. Create CxxConfig + CxxConfig config; + config.set_model_file(model_dir + "/model"); + config.set_param_file(model_dir + "/params"); + config.set_valid_places({ + Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)} + }); + + // 2. Create PaddlePredictor by CxxConfig + std::shared_ptr predictor = CreatePaddlePredictor(config); + return predictor; +} + +std::shared_ptr AsyncESAgent::clone() { + + std::shared_ptr new_agent = std::make_shared(); + + float* noise = new float [_param_size]; + + new_agent->_predictor = _predictor; + new_agent->_sampling_predictor = paddle::lite_api::CreatePaddlePredictor(*_cxx_config); + new_agent->_is_sampling_agent = true; + new_agent->_sampling_method = _sampling_method; + new_agent->_param_names = _param_names; + new_agent->_param_size = _param_size; + new_agent->_config = _config; + new_agent->_noise = noise; + + return new_agent; +} + +bool AsyncESAgent::update( + std::vector& noisy_info, + std::vector& noisy_rewards) { + + CHECK(!_is_sampling_agent) << "[EvoKit] Cloned ESAgent cannot call update function. 
\ + Please use original ESAgent."; + + bool success = _load(); + CHECK(success) << "[EvoKit] fail to load previous models."; + + int current_model_iter_id = _config->async_es().model_iter_id(); + + // validate model_iter_id for each sample before the update + for (int i = 0; i < noisy_info.size(); ++i) { + int model_iter_id = noisy_info[i].model_iter_id(); + + if (model_iter_id != current_model_iter_id + && _previous_predictors.count(model_iter_id) == 0) { + LOG(WARNING) << "[EvoKit] The sample with model_dir_id: " << model_iter_id \ + << " cannot match any local model"; + success = false; + return success; + } + } + + compute_centered_ranks(noisy_rewards); + memset(_neg_gradients, 0, _param_size * sizeof(float)); + + for (int i = 0; i < noisy_info.size(); ++i) { + int key = noisy_info[i].key(0); + float reward = noisy_rewards[i]; + int model_iter_id = noisy_info[i].model_iter_id(); + bool success = _sampling_method->resampling(key, _noise, _param_size); + CHECK(success) << "[EvoKit] resampling error occurs at sample: " << i; + float* delta = _param_delta[model_iter_id]; + + // compute neg_gradients + if (model_iter_id == current_model_iter_id) { + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] += _noise[j] * reward; + } + } else { + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] += (_noise[j] + delta[j]) * reward; + } + } + } + + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] /= -1.0 * noisy_info.size(); + } + + //update + int64_t counter = 0; + + for (std::string param_name : _param_names) { + std::unique_ptr tensor = _predictor->GetMutableTensor(param_name); + float* tensor_data = tensor->mutable_data(); + int64_t tensor_size = ShapeProduction(tensor->shape()); + _optimizer->update(tensor_data, _neg_gradients + counter, tensor_size, param_name); + counter += tensor_size; + } + + success = _save(); + CHECK(success) << "[EvoKit] fail to save model."; + return true; +} + +int AsyncESAgent::_parse_model_iter_id(const std::string& model_path) { + int model_iter_id = -1; + int pow = 1; + + for (int i = model_path.size() - 1; i >= 0; --i) { + if (model_path[i] >= '0' && model_path[i] <= '9') { + if (model_iter_id == -1) { + model_iter_id = 0; + } + } else { + break; + } + + model_iter_id += pow * (model_path[i] - '0'); + pow *= 10; + } + + return model_iter_id; +} + +}//namespace diff --git a/evo_kit/paddle/src/es_agent.cc b/evo_kit/paddle/src/es_agent.cc new file mode 100644 index 0000000000000000000000000000000000000000..d8f3ebd37299224791f1380f284849195383f65b --- /dev/null +++ b/evo_kit/paddle/src/es_agent.cc @@ -0,0 +1,185 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
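The _save/_load bookkeeping above relies on directory names of the form "model_iter_id-N"; _parse_model_iter_id recovers N by reading the trailing digits of the path and returns -1 when there are none. A standalone sketch of that parsing logic, with illustrative function name and sample paths:

#include <iostream>
#include <string>

// Parse the trailing integer of a model directory name such as
// ".../model_iter_id-12"; return -1 when the name does not end in digits.
int parse_trailing_id(const std::string& model_path) {
    int id = -1;
    int pow10 = 1;
    for (int i = static_cast<int>(model_path.size()) - 1; i >= 0; --i) {
        char c = model_path[i];
        if (c < '0' || c > '9') {
            break;
        }
        if (id == -1) {
            id = 0;
        }
        id += pow10 * (c - '0');
        pow10 *= 10;
    }
    return id;
}

int main() {
    std::cout << parse_trailing_id("./model_warehouse/model_iter_id-12") << "\n";  // 12
    std::cout << parse_trailing_id("./model_warehouse/no_id") << "\n";             // -1
    return 0;
}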
+ +#include "evo_kit/es_agent.h" +#include + +namespace evo_kit { + +int64_t ShapeProduction(const paddle::lite_api::shape_t& shape) { + int64_t res = 1; + + for (auto i : shape) { + res *= i; + } + + return res; +} + +ESAgent::~ESAgent() { + delete[] _noise; + + if (!_is_sampling_agent) { + delete[] _neg_gradients; + } +} + +ESAgent::ESAgent(const std::string& model_dir, const std::string& config_path) { + using namespace paddle::lite_api; + // 1. Create CxxConfig + _cxx_config = std::make_shared(); + std::string model_path = model_dir + "/model"; + std::string param_path = model_dir + "/param"; + std::string model_buffer = read_file(model_path); + std::string param_buffer = read_file(param_path); + _cxx_config->set_model_buffer(model_buffer.c_str(), model_buffer.size(), + param_buffer.c_str(), param_buffer.size()); + _cxx_config->set_valid_places({ + Place{TARGET(kX86), PRECISION(kFloat)}, + Place{TARGET(kHost), PRECISION(kFloat)} + }); + + _predictor = CreatePaddlePredictor(*_cxx_config); + + _is_sampling_agent = false; + // Original agent can't be used to sample, so keep it same with _predictor for evaluating. + _sampling_predictor = _predictor; + + _config = std::make_shared(); + load_proto_conf(config_path, *_config); + + _sampling_method = create_sampling_method(*_config); + + _optimizer = create_optimizer(_config->optimizer()); + + _param_names = _predictor->GetParamNames(); + _param_size = _calculate_param_size(); + + _noise = new float [_param_size]; + _neg_gradients = new float [_param_size]; +} + +std::shared_ptr ESAgent::clone() { + if (_is_sampling_agent) { + LOG(ERROR) << "[EvoKit] only original ESAgent can call `clone` function."; + return nullptr; + } + + std::shared_ptr new_agent = std::make_shared(); + + float* noise = new float [_param_size]; + + new_agent->_sampling_predictor = paddle::lite_api::CreatePaddlePredictor(*_cxx_config); + new_agent->_predictor = _predictor; + new_agent->_cxx_config = _cxx_config; + new_agent->_is_sampling_agent = true; + new_agent->_sampling_method = _sampling_method; + new_agent->_param_names = _param_names; + new_agent->_config = _config; + new_agent->_param_size = _param_size; + new_agent->_noise = noise; + + return new_agent; +} + +bool ESAgent::update( + std::vector& noisy_info, + std::vector& noisy_rewards) { + if (_is_sampling_agent) { + LOG(ERROR) << "[EvoKit] Cloned ESAgent cannot call update function, please use original ESAgent."; + return false; + } + + compute_centered_ranks(noisy_rewards); + + memset(_neg_gradients, 0, _param_size * sizeof(float)); + + for (int i = 0; i < noisy_info.size(); ++i) { + int key = noisy_info[i].key(0); + float reward = noisy_rewards[i]; + bool success = _sampling_method->resampling(key, _noise, _param_size); + CHECK(success) << "[EvoKit] resampling error occurs at sample: " << i; + + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] += _noise[j] * reward; + } + } + + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] /= -1.0 * noisy_info.size(); + } + + //update + int64_t counter = 0; + + for (std::string param_name : _param_names) { + std::unique_ptr tensor = _predictor->GetMutableTensor(param_name); + float* tensor_data = tensor->mutable_data(); + int64_t tensor_size = ShapeProduction(tensor->shape()); + _optimizer->update(tensor_data, _neg_gradients + counter, tensor_size, param_name); + counter += tensor_size; + } + + return true; +} + +bool ESAgent::add_noise(SamplingInfo& sampling_info) { + bool success = true; + + if (!_is_sampling_agent) { + LOG(ERROR) << + 
"[EvoKit] Original ESAgent cannot call add_noise function, please use cloned ESAgent."; + success = false; + return success; + } + + int key = 0; + success = _sampling_method->sampling(&key, _noise, _param_size); + CHECK(success) << "[EvoKit] sampling error occurs while add_noise."; + int model_iter_id = _config->async_es().model_iter_id(); + sampling_info.add_key(key); + sampling_info.set_model_iter_id(model_iter_id); + int64_t counter = 0; + + for (std::string param_name : _param_names) { + std::unique_ptr sample_tensor = _sampling_predictor->GetMutableTensor(param_name); + std::unique_ptr tensor = _predictor->GetTensor(param_name); + int64_t tensor_size = ShapeProduction(tensor->shape()); + + for (int64_t j = 0; j < tensor_size; ++j) { + sample_tensor->mutable_data()[j] = tensor->data()[j] + _noise[counter + j]; + } + + counter += tensor_size; + } + + return success; +} + +std::shared_ptr ESAgent::get_predictor() { + return _sampling_predictor; +} + +int64_t ESAgent::_calculate_param_size() { + int64_t param_size = 0; + + for (std::string param_name : _param_names) { + std::unique_ptr tensor = _predictor->GetTensor(param_name); + param_size += ShapeProduction(tensor->shape()); + } + + return param_size; +} + +}//namespace diff --git a/evo_kit/scripts/build_torch_demo.sh b/evo_kit/scripts/build_torch_demo.sh new file mode 100644 index 0000000000000000000000000000000000000000..b2f4df4444012c49bade049a7b30c9ebf637cafb --- /dev/null +++ b/evo_kit/scripts/build_torch_demo.sh @@ -0,0 +1,39 @@ +#!/bin/bash + +cd demo/torch + +#---------------libtorch-------------# +if [ ! -d "./libtorch" ];then + echo "Cannot find the torch library: ./libtorch" + echo "Downloading Torch library" + wget -q https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip + unzip -q libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip + rm -rf libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip + echo "Torch library Downloaded" +fi + + +#---------------libevokit-------------# +cp -r ../../libevokit ./ +if [ ! -d "./libevokit" ];then + echo "Cannot find the EvoKit library: ./libevokit" + echo "Please put the EvoKit libraray to current folder according the instruction in README" # TODO: readme + exit 1 +fi + +# proto +cp ../cartpole_config.prototxt ./ + +#----------------build---------------# +rm -rf build +mkdir build +cd build +cmake ../ +make -j10 +cd - + +#-----------------run----------------# +./build/parallel_main + + +cd ../.. diff --git a/evo_kit/scripts/lib_install.sh b/evo_kit/scripts/lib_install.sh new file mode 100644 index 0000000000000000000000000000000000000000..eb4cc5df7a901618c91b7be8a898d419d607278b --- /dev/null +++ b/evo_kit/scripts/lib_install.sh @@ -0,0 +1,43 @@ +#!/bin/bash + +if [ $# != 1 ]; then + echo "You must choose one framework (paddle/torch) to compile EvoKit." + exit 0 +fi + +if [ $1 = "paddle" ]; then + #---------------paddlelite-------------# + if [ ! -d "./inference_lite_lib" ];then + echo "Cannot find the PaddleLite library: ./inference_lite_lib" + echo "Please put the PaddleLite libraray to current folder according the instruction in README" + exit 1 + fi + + # Initialization model + if [ ! -d ./demo/paddle/cartpole_init_model ]; then + unzip ./demo/paddle/cartpole_init_model.zip -d ./demo/paddle/ + fi + + FLAGS=" -DWITH_PADDLE=ON" +elif [ $1 = "torch" ]; then + FLAGS=" -DWITH_TORCH=ON" +else + echo "Invalid arguments. [paddle/torch]" + exit 0 +fi + + +#----------------protobuf-------------# +cd core/proto/ +protoc evo_kit/evo_kit.proto --cpp_out . 
+cd - + +#----------------build---------------# +echo ${FLAGS} +rm -rf build +mkdir build +cd build +cmake ../ ${FLAGS} +make -j10 +make install +cd - diff --git a/evo_kit/test/CMakeLists.txt b/evo_kit/test/CMakeLists.txt new file mode 100644 index 0000000000000000000000000000000000000000..979e5c59afd5e74b2907054a8398fc7d27fbc6e6 --- /dev/null +++ b/evo_kit/test/CMakeLists.txt @@ -0,0 +1,33 @@ +cmake_minimum_required (VERSION 2.6) +project (EvoKit_demo) +set(TARGET unit_test_main) + +set(CMAKE_CXX_STANDARD 11) +set(CMAKE_CXX_STANDARD_REQUIRED ON) +set(CMAKE_CXX_EXTENSIONS OFF) + +find_package(GTest REQUIRED) +find_package(OpenMP) +if (OPENMP_FOUND) + set (CMAKE_C_FLAGS "${CMAKE_C_FLAGS} ${OpenMP_C_FLAGS}") + set (CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} ${OpenMP_CXX_FLAGS}") + set (CMAKE_EXE_LINKER_FLAGS "${CMAKE_EXE_LINKER_FLAGS} ${OpenMP_EXE_LINKER_FLAGS}") +endif() + +# Torch lib +list(APPEND CMAKE_PREFIX_PATH "../libtorch") +find_package(Torch REQUIRED ON) + +# include and source +include_directories("${PROJECT_SOURCE_DIR}/include") +file(GLOB test_src "${PROJECT_SOURCE_DIR}/src/*.cc") + +# make +add_executable(${TARGET} "unit_test.cc" ${core_src} ${agent_src} ${test_src}) +target_link_libraries(${TARGET} gflags protobuf pthread glog gtest "${TORCH_LIBRARIES}") + + +########## EvoKit libraries ########## +list(APPEND CMAKE_PREFIX_PATH "${PROJECT_SOURCE_DIR}/libevokit/cmake/Torch") +find_package(EvoKit) +target_link_libraries(${TARGET} "${EVOKIT_LIBRARY}") diff --git a/evo_kit/test/include/torch_demo_model.h b/evo_kit/test/include/torch_demo_model.h new file mode 100644 index 0000000000000000000000000000000000000000..cf9d3400ea4358fe109ff6da3f9bec395920336f --- /dev/null +++ b/evo_kit/test/include/torch_demo_model.h @@ -0,0 +1,65 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
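Both demo models in this diff (model.h earlier, and the test model that follows) implement clone() by copying every parameter element by element through flattened views. A minimal sketch of that copy with the accessor template arguments spelled out, assuming float parameters and identical architectures; copy_parameters is an illustrative helper, not part of EvoKit:

#include <memory>
#include <vector>
#include <torch/torch.h>

// Copy all parameters of `src` into `dst` element by element.
// Both modules are assumed to have the same architecture and float parameters.
void copy_parameters(torch::nn::Module& src, torch::nn::Module& dst) {
    std::vector<torch::Tensor> src_params = src.parameters();
    std::vector<torch::Tensor> dst_params = dst.parameters();
    for (size_t i = 0; i < src_params.size(); ++i) {
        torch::Tensor s = src_params[i].view({-1});
        torch::Tensor d = dst_params[i].view({-1});
        auto s_a = s.accessor<float, 1>();
        auto d_a = d.accessor<float, 1>();
        for (int64_t j = 0; j < s.size(0); ++j) {
            d_a[j] = s_a[j];
        }
    }
}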
+ +#ifndef _TORCH_DEMO_MODEL_H +#define _TORCH_DEMO_MODEL_H + +#include + +struct Model : public torch::nn::Module{ + + Model() = delete; + + Model(const int obs_dim, const int act_dim, const int h1_size, const int h2_size) { + _obs_dim = obs_dim; + _act_dim = act_dim; + _h1_size = h1_size; + _h2_size = h2_size; + fc1 = register_module("fc1", torch::nn::Linear(obs_dim, h1_size)); + fc2 = register_module("fc2", torch::nn::Linear(h1_size, h2_size)); + fc3 = register_module("fc3", torch::nn::Linear(h2_size, act_dim)); + } + + torch::Tensor forward(torch::Tensor x) { + x = x.reshape({-1, _obs_dim}); + x = torch::tanh(fc1->forward(x)); + x = torch::tanh(fc2->forward(x)); + x = torch::tanh(fc3->forward(x)); + return x; + } + + std::shared_ptr clone() { + std::shared_ptr model = std::make_shared(_obs_dim, _act_dim, _h1_size, _h2_size); + std::vector parameters1 = parameters(); + std::vector parameters2 = model->parameters(); + for (int i = 0; i < parameters1.size(); ++i) { + torch::Tensor src = parameters1[i].view({-1}); + torch::Tensor des = parameters2[i].view({-1}); + auto src_a = src.accessor(); + auto des_a = des.accessor(); + for (int j = 0; j < src.size(0); ++j) { + des_a[j] = src_a[j]; + } + } + return model; + } + + int _act_dim; + int _obs_dim; + int _h1_size; + int _h2_size; + torch::nn::Linear fc1{nullptr}, fc2{nullptr}, fc3{nullptr}; +}; + +#endif diff --git a/evo_kit/test/prototxt/torch_sin_cached_config.prototxt b/evo_kit/test/prototxt/torch_sin_cached_config.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..6fe80b1e07396b0909cb087f1a9b0c20724a0fc4 --- /dev/null +++ b/evo_kit/test/prototxt/torch_sin_cached_config.prototxt @@ -0,0 +1,16 @@ +seed : 1024 + +gaussian_sampling { + std: 0.005 + cached: true + cache_size : 100000 +} + +optimizer { + type: "Adam", + base_lr: 0.005, + momentum: 0.9, + beta1: 0.9, + beta2: 0.999, + epsilon: 1e-8, +} diff --git a/evo_kit/test/prototxt/torch_sin_config.prototxt b/evo_kit/test/prototxt/torch_sin_config.prototxt new file mode 100644 index 0000000000000000000000000000000000000000..3704d64e6b6c7f7976422e33c2f5892b7ca4efc5 --- /dev/null +++ b/evo_kit/test/prototxt/torch_sin_config.prototxt @@ -0,0 +1,15 @@ +seed : 1024 + +gaussian_sampling { + std: 0.005 + cached: false +} + +optimizer { + type: "Adam", + base_lr: 0.005, + momentum: 0.9, + beta1: 0.9, + beta2: 0.999, + epsilon: 1e-8, +} diff --git a/evo_kit/test/run_test.sh b/evo_kit/test/run_test.sh new file mode 100644 index 0000000000000000000000000000000000000000..b39cbc9db8c32c4827aa03a101b45a8011dde7ae --- /dev/null +++ b/evo_kit/test/run_test.sh @@ -0,0 +1,41 @@ +#!/bin/bash +export LD_LIBRARY_PATH=/usr/local/lib:$LD_LIBRARY_PATH + +#---------------libtorch-------------# +if [ ! -d "./libtorch" ];then +echo "Cannot find the torch library: ../libtorch" + echo "Downloading Torch library" + wget -q https://download.pytorch.org/libtorch/cpu/libtorch-cxx11-abi-shared-with-deps-1.4.0%2Bcpu.zip + unzip -q libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip + rm -rf libtorch-cxx11-abi-shared-with-deps-1.4.0+cpu.zip + echo "Torch library Downloaded" +fi + +#----------------protobuf-------------# +cd core/proto/ +protoc evo_kit/evo_kit.proto --cpp_out . +cd - + +#----------------build---------------# +sh scripts/lib_install.sh torch + +#----------------build test---------------# +cd test + +cp -r ../libevokit ./ +if [ ! 
-d "./libevokit" ];then + echo "Cannot find the EvoKit library: ./libevokit" + echo "Please put the EvoKit libraray to current folder according the instruction in README" # TODO: readme + exit 1 +fi + +rm -rf build +mkdir build +cd build +cmake ../ +make -j10 + +#-----------------run----------------# +./unit_test_main + +cd .. diff --git a/evo_kit/test/src/optimizers_test.cc b/evo_kit/test/src/optimizers_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..1c561e3085bdf5f9102ba29115e7e8fabbf8ed75 --- /dev/null +++ b/evo_kit/test/src/optimizers_test.cc @@ -0,0 +1,59 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include +#include "evo_kit/optimizer_factory.h" +#include + +namespace evo_kit { + +TEST(SGDOptimizersTest, Method_update) { + std::shared_ptr config = std::make_shared(); + auto optimizer_config = config->mutable_optimizer(); + optimizer_config->set_base_lr(1.0); + optimizer_config->set_type("sgd"); + std::shared_ptr optimizer = create_optimizer(config->optimizer()); + float sgd_wei[10] = { 0.0 , 0.0 , 0.04216444, 0.0511456 , 0.04231584, 0.01089015, 0.06569759, 0.00127421,-0.00092832, 0.01128081}; + float sgd_grad[10] = {-0.11992419,-0.0 , 0.07681337,-0.06616384, 0.00249889, 0.01158612,-0.3067452 , 0.36048946,-0.15820622,-0.20014143}; + float sgd_new[10] = { 0.01199242, 0.0 , 0.0344831 , 0.05776198, 0.04206595, 0.00973154, 0.09637211,-0.03477474, 0.014892306, 0.03129495}; + + EXPECT_TRUE(optimizer->update(sgd_wei, sgd_grad, 10, "fc1")); + for (int i = 0; i < 10; ++i) { + EXPECT_FLOAT_EQ(sgd_new[i], sgd_wei[i]) << " i: " << i ; + } + EXPECT_TRUE(optimizer->update(sgd_wei, sgd_grad, 10, "fc1")); + EXPECT_FALSE(optimizer->update(sgd_wei, sgd_grad, 9, "fc1")); +} + +TEST(AdamOptimizersTest, Method_update) { + std::shared_ptr config = std::make_shared(); + auto optimizer_config = config->mutable_optimizer(); + optimizer_config->set_base_lr(1.0); + optimizer_config->set_type("adam"); + std::shared_ptr optimizer = create_optimizer(config->optimizer()); + float adam_wei[10] = { 0.0 , 0.0 , 0.04216444, 0.0511456 , 0.04231584, 0.01089015, 0.06569759, 0.00127421,-0.00092832, 0.01128081}; + float adam_grad[10] = {-0.11992419,-0.0 , 0.07681337,-0.06616384, 0.00249889, 0.01158612,-0.3067452 , 0.36048946,-0.15820622,-0.20014143}; + float adam_new[10] = { 0.99999736, 0. 
,-0.95783144, 1.05114082,-0.95755763,-0.98908256, 1.06569656,-0.99872491, 0.99906968, 1.01127923}; + + EXPECT_TRUE(optimizer->update(adam_wei, adam_grad, 10, "fc1")); + for (int i = 0; i < 10; ++i) { + EXPECT_FLOAT_EQ(adam_new[i], adam_wei[i]) << " i: " << i ; + } + EXPECT_TRUE(optimizer->update(adam_wei, adam_grad, 10, "fc1")); + EXPECT_FALSE(optimizer->update(adam_wei, adam_grad, 9, "fc1")); +} + +} // namespace + diff --git a/evo_kit/test/src/sampling_test.cc b/evo_kit/test/src/sampling_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..e707a63354836f3e70b42d819bab8b0fc3f79e70 --- /dev/null +++ b/evo_kit/test/src/sampling_test.cc @@ -0,0 +1,116 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include +#include "evo_kit/sampling_method.h" +#include "evo_kit/gaussian_sampling.h" +#include "evo_kit/cached_gaussian_sampling.h" +#include + +namespace evo_kit { + +class SamplingTest : public ::testing::Test { + protected: + void init_sampling_method(bool cached) { + config = std::make_shared(); + config->set_seed(1024); + auto sampling_config = config->mutable_gaussian_sampling(); + sampling_config->set_std(1.0); + sampling_config->set_cached(cached); + sampling_config->set_cache_size(cache_size); + if (cached) { + sampler = std::make_shared(); + } else { + sampler = std::make_shared(); + } + } + + std::shared_ptr sampler; + std::shared_ptr config; + float array[3] = {1.0, 2.0, 3.0}; + int cache_size = 100; // default cache_size 100 + int key = 0; +}; + + +TEST_F(SamplingTest, GaussianSampling_load_config) { + init_sampling_method(false); + EXPECT_TRUE(sampler->load_config(*config)); +} + +TEST_F(SamplingTest, GaussianSampling_sampling) { + init_sampling_method(false); + sampler->load_config(*config); + + EXPECT_FALSE(sampler->sampling(&key, nullptr, 0)); + EXPECT_TRUE(sampler->sampling(&key, array, 3)); +} + +TEST_F(SamplingTest, GaussianSampling_resampling) { + init_sampling_method(false); + sampler->load_config(*config); + + EXPECT_FALSE(sampler->resampling(0, nullptr, 0)); + EXPECT_TRUE(sampler->resampling(0, array, 3)); +} + + +TEST_F(SamplingTest, CachedGaussianSampling_load_config) { + init_sampling_method(true); + EXPECT_TRUE(sampler->load_config(*config)); +} + +TEST_F(SamplingTest, CachedGaussianSampling_sampling) { + init_sampling_method(true); + EXPECT_FALSE(sampler->sampling(&key, array, 0)); + + sampler->load_config(*config); + + EXPECT_FALSE(sampler->sampling(&key, nullptr, 0)); + EXPECT_FALSE(sampler->sampling(&key, array, -1)); + EXPECT_FALSE(sampler->sampling(&key, array, cache_size)); + + EXPECT_TRUE(sampler->sampling(&key, array, 0)); + EXPECT_TRUE(sampler->sampling(&key, array, 3)); +} + +TEST_F(SamplingTest, CachedGaussianSampling_resampling) { + init_sampling_method(true); + EXPECT_FALSE(sampler->resampling(0, array, 0)); + + sampler->load_config(*config); + + EXPECT_FALSE(sampler->resampling(0, nullptr, 0)); + 
EXPECT_FALSE(sampler->resampling(0, array, -1)); + EXPECT_FALSE(sampler->resampling(0, array, cache_size)); + + EXPECT_TRUE(sampler->resampling(0, array, 0)); + EXPECT_TRUE(sampler->resampling(0, array, 1)); + EXPECT_TRUE(sampler->resampling(0, array, 2)); + + EXPECT_FALSE(sampler->resampling(-1, array, 3)); + EXPECT_TRUE(sampler->resampling(0, array, 3)); + EXPECT_TRUE(sampler->resampling(1, array, 3)); + EXPECT_TRUE(sampler->resampling(2, array, 3)); + EXPECT_TRUE(sampler->resampling(cache_size-3, array, 3)); + EXPECT_FALSE(sampler->resampling(cache_size-2, array, 3)); + EXPECT_FALSE(sampler->resampling(cache_size-1, array, 3)); + EXPECT_FALSE(sampler->resampling(cache_size, array, 3)); + EXPECT_FALSE(sampler->resampling(cache_size-3, array, cache_size-1)); +} + + +} // namespace + diff --git a/evo_kit/test/src/torch_agent_test.cc b/evo_kit/test/src/torch_agent_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..080b85391d720a6b500517a6f27976f76d2258b6 --- /dev/null +++ b/evo_kit/test/src/torch_agent_test.cc @@ -0,0 +1,157 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include +#include +#include + +#include "evo_kit/gaussian_sampling.h" +#include "evo_kit/es_agent.h" +#include "torch_demo_model.h" + +#include +#include +#include +#include + +namespace evo_kit { + + +// The fixture for testing class Foo. 
+class TorchDemoTest : public ::testing::Test { +protected: + float evaluate(std::vector& x_list, std::vector& y_list, int size, std::shared_ptr> agent) { + float total_loss = 0.0; + for (int i = 0; i < size; ++i) { + torch::Tensor x_input = torch::tensor(x_list[i], torch::dtype(torch::kFloat32)); + torch::Tensor predict_y = agent->predict(x_input); + auto pred_y = predict_y.accessor(); + float loss = pow((pred_y[0][0] - y_list[i]), 2); + total_loss += loss; + } + return -total_loss / float(size); + } + + float train_loss() { + return -1.0 * evaluate(x_list, y_list, train_data_size, agent); + } + + float test_loss() { + return -1.0 * evaluate(test_x_list, test_y_list, test_data_size, agent); + } + + float train_test_gap() { + float train_lo = train_loss(); + float test_lo = test_loss(); + if ( train_lo > test_lo) { + return train_lo - test_lo; + } else { + return test_lo - train_lo; + } + } + + void init_agent(const int in_dim, const int out_dim, const int h1_size, const int h2_size) { + std::shared_ptr model = std::make_shared(in_dim, out_dim, h1_size, h2_size); + agent = std::make_shared>(model, "../prototxt/torch_sin_config.prototxt"); + } + + void train_agent(std::string config_path) { + std::default_random_engine generator(0); // fix seed + std::uniform_real_distribution uniform(-3.0, 9.0); + std::normal_distribution norm; + for (int i = 0; i < train_data_size; ++i) { + float x_i = uniform(generator); // generate data between [-3, 9] + float y_i = sin(x_i) + norm(generator) * 0.05; // label noise std 0.05 + x_list.push_back(x_i); + y_list.push_back(y_i); + } + for (int i= 0; i < test_data_size; ++i) { + float x_i = uniform(generator); + float y_i = sin(x_i); + test_x_list.push_back(x_i); + test_y_list.push_back(y_i); + } + + std::shared_ptr model = std::make_shared(1, 1, 10, 5); + agent = std::make_shared>(model, config_path); + + // Clone agents to sample (explore). 
+ std::vector>> sampling_agents; + for (int i = 0; i < iter; ++i) { + sampling_agents.push_back(agent->clone()); + } + + std::vector noisy_keys; + std::vector noisy_rewards(iter, 0.0f); + noisy_keys.resize(iter); + + LOG(INFO) << "start training..."; + for (int epoch = 0; epoch < 1001; ++epoch) { +#pragma omp parallel for schedule(dynamic, 1) + for (int i = 0; i < iter; ++i) { + auto sampling_agent = sampling_agents[i]; + SamplingInfo key; + bool success = sampling_agent->add_noise(key); + float reward = evaluate(x_list, y_list, train_data_size, sampling_agent); + noisy_keys[i] = key; + noisy_rewards[i] = reward; + } + bool success = agent->update(noisy_keys, noisy_rewards); + + if (epoch % 100 == 0) { + float reward = evaluate(test_x_list, test_y_list, test_data_size, agent); + float train_reward = evaluate(x_list, y_list, train_data_size, agent); + LOG(INFO) << "Epoch:" << epoch << " Loss: " << -reward << ", Train loss" << -train_reward; + } + } + } + + // Class members declared here can be used by all tests in the test suite + int train_data_size = 300; + int test_data_size = 100; + int iter = 10; + std::vector x_list; + std::vector y_list; + std::vector test_x_list; + std::vector test_y_list; + std::shared_ptr> agent; +}; + +TEST_F(TorchDemoTest, TrainingEffectUseNormalSampling) { + train_agent("../prototxt/torch_sin_config.prototxt"); + EXPECT_LT(train_loss(), 0.05); + EXPECT_LT(test_loss(), 0.05); + EXPECT_LT(train_test_gap(), 0.03); +} + +TEST_F(TorchDemoTest, TrainingEffectTestUseTableSampling) { + train_agent("../prototxt/torch_sin_cached_config.prototxt"); + EXPECT_LT(train_loss(), 0.05); + EXPECT_LT(test_loss(), 0.05); + EXPECT_LT(train_test_gap(), 0.03); +} + +TEST_F(TorchDemoTest,ParamSizeTest) { + init_agent(1, 1, 10, 5); + EXPECT_EQ(agent->param_size(), 81); + init_agent(2, 3, 10, 5); + EXPECT_EQ(agent->param_size(), 103); + init_agent(1, 1, 1, 1); + EXPECT_EQ(agent->param_size(), 6); + init_agent(100, 2, 256, 64); + EXPECT_EQ(agent->param_size(), 42434); +} + +} // namespace diff --git a/evo_kit/test/src/utils_test.cc b/evo_kit/test/src/utils_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..a0c8d2c963a698475831a641c3eefc8abcc3693a --- /dev/null +++ b/evo_kit/test/src/utils_test.cc @@ -0,0 +1,30 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" +#include +#include "evo_kit/utils.h" + +namespace evo_kit { + +// Tests that the Utils::compute_centered_rank() method. +TEST(UtilsTest, Method_compute_centered_ranks) { + float a[5] = {9.0, 8.0, 7.0, 6.0, 5.0}; + std::vector reward_vec(a, a+5); + EXPECT_EQ(compute_centered_ranks(reward_vec), true); +} + + +} // namespace + diff --git a/evo_kit/test/unit_test.cc b/evo_kit/test/unit_test.cc new file mode 100644 index 0000000000000000000000000000000000000000..3bbc21f4cdfb8e7709173a258f66560a7f7e27a1 --- /dev/null +++ b/evo_kit/test/unit_test.cc @@ -0,0 +1,20 @@ +// Copyright (c) 2020 PaddlePaddle Authors. 
All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "gtest/gtest.h" + +int main(int argc, char **argv) { + ::testing::InitGoogleTest(&argc, argv); + return RUN_ALL_TESTS(); +} diff --git a/evo_kit/torch/include/evo_kit/es_agent.h b/evo_kit/torch/include/evo_kit/es_agent.h new file mode 100644 index 0000000000000000000000000000000000000000..856034f75fc2c025cbb3aed74c5eac4edc888178 --- /dev/null +++ b/evo_kit/torch/include/evo_kit/es_agent.h @@ -0,0 +1,196 @@ +// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#ifndef TORCH_ESAGENT_H +#define TORCH_ESAGENT_H + +#include +#include +#include "evo_kit/optimizer_factory.h" +#include "evo_kit/sampling_factory.h" +#include "evo_kit/utils.h" +#include "evo_kit/evo_kit.pb.h" + +namespace evo_kit{ + +/** + * @brief DeepES agent for Torch. + * + * Our implemtation is flexible to support any model that subclass torch::nn::Module. + * That is, we can instantiate an agent by: es_agent = ESAgent(model); + * After that, users can clone an agent for multi-thread processing, add parametric noise for exploration, + * and update the parameteres, according to the evaluation resutls of noisy parameters. + */ +template +class ESAgent{ +public: + ESAgent() {} + + ~ESAgent() { + delete[] _noise; + if (!_is_sampling_agent) + delete[] _neg_gradients; + } + + ESAgent(std::shared_ptr model, std::string config_path): _model(model) { + _is_sampling_agent = false; + _config = std::make_shared(); + load_proto_conf(config_path, *_config); + _sampling_method = create_sampling_method(*_config); + _optimizer = create_optimizer(_config->optimizer()); + // Origin agent can't be used to sample, so keep it same with _model for evaluating. + _sampling_model = model; + _param_size = _calculate_param_size(); + + _noise = new float [_param_size]; + _neg_gradients = new float [_param_size]; + } + + /** + * @breif Clone a sampling agent + * + * Only cloned ESAgent can call `add_noise` function. + * Each cloned ESAgent will have a copy of original parameters. 
+ * (support sampling in multi-thread way) + */ + std::shared_ptr clone() { + std::shared_ptr new_agent = std::make_shared(); + + new_agent->_model = _model; + std::shared_ptr new_model = _model->clone(); + new_agent->_sampling_model = new_model; + + new_agent->_is_sampling_agent = true; + new_agent->_sampling_method = _sampling_method; + new_agent->_param_size = _param_size; + + float* new_noise = new float [_param_size]; + new_agent->_noise = new_noise; + + return new_agent; + } + + /** + * @brief Use the model to predict. + * + * if _is_sampling_agent is true, will use the sampling model with added noise; + * if _is_sampling_agent is false, will use the original model without added noise. + */ + torch::Tensor predict(const torch::Tensor& x) { + return _sampling_model->forward(x); + } + + /** + * @brief Update parameters of model based on ES algorithm. + * + * Only not cloned ESAgent can call `update` function. + * Parameters of cloned agents will also be updated. + */ + bool update(std::vector& noisy_info, std::vector& noisy_rewards) { + if (_is_sampling_agent) { + LOG(ERROR) << "[DeepES] Cloned ESAgent cannot call update function, please use original ESAgent."; + return false; + } + + compute_centered_ranks(noisy_rewards); + + memset(_neg_gradients, 0, _param_size * sizeof(float)); + for (int i = 0; i < noisy_info.size(); ++i) { + int key = noisy_info[i].key(0); + float reward = noisy_rewards[i]; + bool success = _sampling_method->resampling(key, _noise, _param_size); + CHECK(success) << "[DeepES] resampling error occurs at sample: " << i; + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] += _noise[j] * reward; + } + } + for (int64_t j = 0; j < _param_size; ++j) { + _neg_gradients[j] /= -1.0 * noisy_info.size(); + } + + //update + auto params = _model->named_parameters(); + int64_t counter = 0; + for (auto& param: params) { + torch::Tensor tensor = param.value().view({-1}); + auto tensor_a = tensor.accessor(); + _optimizer->update(tensor_a, _neg_gradients+counter, tensor.size(0), param.key()); + counter += tensor.size(0); + } + + return true; + } + + // copied parameters = original parameters + noise + bool add_noise(SamplingInfo& sampling_info) { + bool success = true; + if (!_is_sampling_agent) { + LOG(ERROR) << "[DeepES] Original ESAgent cannot call add_noise function, please use cloned ESAgent."; + success = false; + return success; + } + + auto sampling_params = _sampling_model->named_parameters(); + auto params = _model->named_parameters(); + int key = 0; + success = _sampling_method->sampling(&key, _noise, _param_size); + CHECK(success) << "[EvoKit] sampling error occurs while add_noise."; + sampling_info.add_key(key); + int64_t counter = 0; + for (auto& param: sampling_params) { + torch::Tensor sampling_tensor = param.value().view({-1}); + std::string param_name = param.key(); + torch::Tensor tensor = params.find(param_name)->view({-1}); + auto sampling_tensor_a = sampling_tensor.accessor(); + auto tensor_a = tensor.accessor(); + for (int64_t j = 0; j < tensor.size(0); ++j) { + sampling_tensor_a[j] = tensor_a[j] + _noise[counter + j]; + } + counter += tensor.size(0); + } + return success; + } + + // get param size of model + int64_t param_size() { + return _param_size; + } + + +private: + int64_t _calculate_param_size() { + _param_size = 0; + auto params = _model->named_parameters(); + for (auto& param: params) { + torch::Tensor tensor = param.value().view({-1}); + _param_size += tensor.size(0); + } + return _param_size; + } + + std::shared_ptr _model; + 
std::shared_ptr _sampling_model; + bool _is_sampling_agent; + std::shared_ptr _sampling_method; + std::shared_ptr _optimizer; + std::shared_ptr _config; + int64_t _param_size; + // malloc memory of noise and neg_gradients in advance. + float* _noise; + float* _neg_gradients; +}; + +} + +#endif /* TORCH_ESAGENT_H */ diff --git a/examples/A2C/README.md b/examples/A2C/README.md index d38a5d153b3ab39c59b851775567d90edcdda4fb..2328a3ee350851370c5c44bbed6b4e2daae27512 100755 --- a/examples/A2C/README.md +++ b/examples/A2C/README.md @@ -20,7 +20,7 @@ Performance of A2C on various envrionments ## How to use ### Dependencies + [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) -+ [parl](https://github.com/PaddlePaddle/PARL) ++ [parl>=1.2.1](https://github.com/PaddlePaddle/PARL) + gym==0.12.1 + atari-py==0.1.7 diff --git a/examples/A2C/atari_agent.py b/examples/A2C/atari_agent.py index 5604f71016538650b0ed0355dd6cd2856f52c60e..94d2125214a9ef52273763cea6cc0213cc34c963 100755 --- a/examples/A2C/atari_agent.py +++ b/examples/A2C/atari_agent.py @@ -71,7 +71,10 @@ class AtariAgent(parl.Agent): lr = layers.data( name='lr', shape=[1], dtype='float32', append_batch_size=False) entropy_coeff = layers.data( - name='entropy_coeff', shape=[], dtype='float32') + name='entropy_coeff', + shape=[1], + dtype='float32', + append_batch_size=False) total_loss, pi_loss, vf_loss, entropy = self.alg.learn( obs, actions, advantages, target_values, lr, entropy_coeff) diff --git a/examples/A2C/train.py b/examples/A2C/train.py index 777a22849afcb3ad5b1e237d7e3d0ae9b39fa871..4050a413f0262ec62f76bbc07062578d6a398d5c 100755 --- a/examples/A2C/train.py +++ b/examples/A2C/train.py @@ -25,7 +25,7 @@ from atari_agent import AtariAgent from collections import defaultdict from parl.env.atari_wrappers import wrap_deepmind -from parl.utils import logger, get_gpu_count, tensorboard +from parl.utils import logger, get_gpu_count, summary from parl.utils.scheduler import PiecewiseScheduler from parl.utils.time_stat import TimeStat from parl.utils.window_stat import WindowStat @@ -55,11 +55,6 @@ class Learner(object): assert get_gpu_count() == 1, 'Only support training in single GPU,\ Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_TO_USE]` .' - else: - cpu_num = os.environ.get('CPU_NUM') - assert cpu_num is not None and cpu_num == '1', 'Only support training in single CPU,\ - Please set environment variable: `export CPU_NUM=1`.' 
- #========== Learner ========== self.total_loss_stat = WindowStat(100) @@ -191,7 +186,7 @@ class Learner(object): min_episode_steps = np.min(np.array(episode_steps).flatten()) metric = { - 'Sample steps': self.sample_total_steps, + 'sample_steps': self.sample_total_steps, 'max_episode_rewards': max_episode_rewards, 'mean_episode_rewards': mean_episode_rewards, 'min_episode_rewards': min_episode_rewards, @@ -210,7 +205,7 @@ class Learner(object): for key, value in metric.items(): if value is not None: - tensorboard.add_scalar(key, value, self.sample_total_steps) + summary.add_scalar(key, value, self.sample_total_steps) logger.info(metric) diff --git a/examples/DDPG/mujoco_agent.py b/examples/DDPG/mujoco_agent.py index 2b2c216e0dfcb6bf675483d13454195b9cc634ed..4a92f3ea01217715a8fae16b8079367d5368f05a 100644 --- a/examples/DDPG/mujoco_agent.py +++ b/examples/DDPG/mujoco_agent.py @@ -55,6 +55,7 @@ class MujocoAgent(parl.Agent): act = self.fluid_executor.run( self.pred_program, feed={'obs': obs}, fetch_list=[self.pred_act])[0] + act = np.squeeze(act) return act def learn(self, obs, act, reward, next_obs, terminal): diff --git a/examples/DDPG/mujoco_model.py b/examples/DDPG/mujoco_model.py index ed59dbbf2e2d3381f24bbc67c7503c681ad87c18..6a812f6e465cd4937bfae3bc2eeabeaaaa8e0d8d 100644 --- a/examples/DDPG/mujoco_model.py +++ b/examples/DDPG/mujoco_model.py @@ -45,7 +45,6 @@ class ActorModel(parl.Model): hid1 = self.fc1(obs) hid2 = self.fc2(hid1) means = self.fc3(hid2) - means = means return means diff --git a/examples/DDPG/train.py b/examples/DDPG/train.py index 1b7e5c1cc9d98f91b024cea7dbdda6e443f3fea9..05b25dc81f3e45985812d526e70c422ada225197 100644 --- a/examples/DDPG/train.py +++ b/examples/DDPG/train.py @@ -21,14 +21,12 @@ from mujoco_agent import MujocoAgent from mujoco_model import MujocoModel from parl.utils import logger, action_mapping, ReplayMemory -MAX_EPISODES = 5000 -TEST_EVERY_EPISODES = 20 ACTOR_LR = 1e-4 CRITIC_LR = 1e-3 GAMMA = 0.99 TAU = 0.001 MEMORY_SIZE = int(1e6) -MIN_LEARN_SIZE = 1e4 +MEMORY_WARMUP_SIZE = 1e4 BATCH_SIZE = 128 REWARD_SCALE = 0.1 ENV_SEED = 1 @@ -37,12 +35,9 @@ ENV_SEED = 1 def run_train_episode(env, agent, rpm): obs = env.reset() total_reward = 0 - steps = 0 while True: - steps += 1 batch_obs = np.expand_dims(obs, axis=0) action = agent.predict(batch_obs.astype('float32')) - action = np.squeeze(action) # Add exploration noise, and clip to [-1.0, 1.0] action = np.clip(np.random.normal(action, 1.0), -1.0, 1.0) @@ -53,7 +48,7 @@ def run_train_episode(env, agent, rpm): rpm.append(obs, action, REWARD_SCALE * reward, next_obs, done) - if rpm.size() > MIN_LEARN_SIZE: + if rpm.size() > MEMORY_WARMUP_SIZE: batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = rpm.sample_batch( BATCH_SIZE) agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, @@ -64,7 +59,7 @@ def run_train_episode(env, agent, rpm): if done: break - return total_reward, steps + return total_reward def run_evaluate_episode(env, agent): @@ -73,7 +68,6 @@ def run_evaluate_episode(env, agent): while True: batch_obs = np.expand_dims(obs, axis=0) action = agent.predict(batch_obs.astype('float32')) - action = np.squeeze(action) action = action_mapping(action, env.action_space.low[0], env.action_space.high[0]) @@ -101,19 +95,19 @@ def main(): rpm = ReplayMemory(MEMORY_SIZE, obs_dim, act_dim) - test_flag = 0 - total_steps = 0 - while total_steps < args.train_total_steps: - train_reward, steps = run_train_episode(env, agent, rpm) - total_steps += steps - logger.info('Steps: {} 
Reward: {}'.format(total_steps, train_reward)) + while rpm.size() < MEMORY_WARMUP_SIZE: + run_train_episode(env, agent, rpm) + + episode = 0 + while episode < args.train_total_episode: + for i in range(50): + train_reward = run_train_episode(env, agent, rpm) + episode += 1 + logger.info('Episode: {} Reward: {}'.format(episode, train_reward)) - if total_steps // args.test_every_steps >= test_flag: - while total_steps // args.test_every_steps >= test_flag: - test_flag += 1 - evaluate_reward = run_evaluate_episode(env, agent) - logger.info('Steps {}, Evaluate reward: {}'.format( - total_steps, evaluate_reward)) + evaluate_reward = run_evaluate_episode(env, agent) + logger.info('Episode {}, Evaluate reward: {}'.format( + episode, evaluate_reward)) if __name__ == '__main__': @@ -121,15 +115,10 @@ if __name__ == '__main__': parser.add_argument( '--env', help='Mujoco environment name', default='HalfCheetah-v2') parser.add_argument( - '--train_total_steps', - type=int, - default=int(1e7), - help='maximum training steps') - parser.add_argument( - '--test_every_steps', + '--train_total_episode', type=int, default=int(1e4), - help='the step interval between two consecutive evaluations') + help='maximum training episodes') args = parser.parse_args() diff --git a/examples/DQN/README.md b/examples/DQN/README.md index 351e44754ad82125eec4e1346fd6301e8c1555b7..2281cee4e5080a0030926a7f81ec5d4cdf7d82ec 100644 --- a/examples/DQN/README.md +++ b/examples/DQN/README.md @@ -1,22 +1,16 @@ ## Reproduce DQN with PARL -Based on PARL, the DQN algorithm of deep reinforcement learning has been reproduced, reaching the same level of indicators as the paper in Atari benchmarks. +Based on PARL, we provide a simple demonstration of DQN. + DQN in [Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html) -### Atari games introduction -Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari games. +### Result -### Benchmark result +Performance of DQN playing CartPole-v0 -Mean episode rewards for 10 million training steps. - -pong - -Performance of DQN on various environments - -

-result +

+result +result

## How to use @@ -25,13 +19,14 @@ Performance of DQN on various environments + [parl](https://github.com/PaddlePaddle/PARL) + gym + tqdm -+ atari-py -+ [ale_python_interface](https://github.com/mgbellemare/Arcade-Learning-Environment) ### Start Training: ``` -# To train a model for Pong game -python train.py --rom ./rom_files/pong.bin +# To train a model for CartPole-v0 game +python train.py ``` -> To train more games, you can install more rom files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms). + +## DQN-Variants + +For DQN variants such as Double DQN and Dueling DQN, please check [here](https://github.com/PaddlePaddle/PARL/tree/develop/examples/DQN_variant) diff --git a/examples/DQN/cartpole.jpg b/examples/DQN/cartpole.jpg new file mode 100644 index 0000000000000000000000000000000000000000..978a074468950a36bd385b3f7cb691efb636829b Binary files /dev/null and b/examples/DQN/cartpole.jpg differ diff --git a/examples/DQN/cartpole_agent.py b/examples/DQN/cartpole_agent.py new file mode 100755 index 0000000000000000000000000000000000000000..d98f2ba7cdbd426754e3103ebd4068d5e9fb9871 --- /dev/null +++ b/examples/DQN/cartpole_agent.py @@ -0,0 +1,93 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import paddle.fluid as fluid +import parl +from parl import layers + + +class CartpoleAgent(parl.Agent): + def __init__(self, + algorithm, + obs_dim, + act_dim, + e_greed=0.1, + e_greed_decrement=0): + assert isinstance(obs_dim, int) + assert isinstance(act_dim, int) + self.obs_dim = obs_dim + self.act_dim = act_dim + super(CartpoleAgent, self).__init__(algorithm) + + self.global_step = 0 + self.update_target_steps = 200 + + self.e_greed = e_greed + self.e_greed_decrement = e_greed_decrement + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + self.value = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + action = layers.data(name='act', shape=[1], dtype='int32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data( + name='next_obs', shape=[self.obs_dim], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + self.cost = self.alg.learn(obs, action, reward, next_obs, terminal) + + def sample(self, obs): + sample = np.random.rand() + if sample < self.e_greed: + act = np.random.randint(self.act_dim) + else: + act = self.predict(obs) + self.e_greed = max(0.01, self.e_greed - self.e_greed_decrement) + return act + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + pred_Q = self.fluid_executor.run( + self.pred_program, + feed={'obs': obs.astype('float32')}, + fetch_list=[self.value])[0] + pred_Q = np.squeeze(pred_Q, axis=0) + act = np.argmax(pred_Q) + return act + + def learn(self, obs, act, reward, next_obs, terminal): + if self.global_step % self.update_target_steps == 0: + self.alg.sync_target() + self.global_step += 1 + + act = np.expand_dims(act, -1) + feed = { + 'obs': obs.astype('float32'), + 'act': act.astype('int32'), + 'reward': reward, + 'next_obs': next_obs.astype('float32'), + 'terminal': terminal, + } + cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.cost])[0] + return cost diff --git a/examples/LiftSim_baseline/rl_benchmark/model.py b/examples/DQN/cartpole_model.py old mode 100644 new mode 100755 similarity index 61% rename from examples/LiftSim_baseline/rl_benchmark/model.py rename to examples/DQN/cartpole_model.py index 3b2364df90565565f5d4e3286b6662c134cb4c08..9218fdfca6555551dc90d025777d45d2acb4b27d --- a/examples/LiftSim_baseline/rl_benchmark/model.py +++ b/examples/DQN/cartpole_model.py @@ -12,24 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. 
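A quick arithmetic sketch of the epsilon-greedy schedule implemented in `CartpoleAgent.sample` above; the constants are the defaults wired up in `train.py` later in this diff (`e_greed=0.1`, `e_greed_decrement=1e-6`), and the helper function here is purely illustrative:
```
# Illustrative only: how epsilon decays across calls to CartpoleAgent.sample,
# assuming the defaults used in train.py (e_greed=0.1, e_greed_decrement=1e-6).
E_GREED = 0.1
E_GREED_DECREMENT = 1e-6

def epsilon_after(num_sample_calls):
    # sample() subtracts e_greed_decrement once per call and floors at 0.01
    return max(0.01, E_GREED - E_GREED_DECREMENT * num_sample_calls)

print(epsilon_after(0))      # 0.1   -> 10% random actions at the start
print(epsilon_after(50000))  # ~0.05
print(epsilon_after(90000))  # ~0.01 -> floor reached; exploration stays at 1%
```
With these defaults the agent keeps at least 1% exploration throughout training, reaching the floor after roughly 90k sampled steps.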
-import os import paddle.fluid as fluid -from parl import layers -import numpy as np import parl +from parl import layers -class RLDispatcherModel(parl.Model): +class CartpoleModel(parl.Model): def __init__(self, act_dim): - self._act_dim = act_dim - self._fc_1 = layers.fc(size=512, act='relu') - self._fc_2 = layers.fc(size=256, act='relu') - self._fc_3 = layers.fc(size=128, act='tanh') - self._output = layers.fc(size=act_dim) + hid1_size = 128 + hid2_size = 128 + self.fc1 = layers.fc(size=hid1_size, act='relu') + self.fc2 = layers.fc(size=hid2_size, act='relu') + self.fc3 = layers.fc(size=act_dim, act=None) def value(self, obs): - _h_1 = self._fc_1(obs) - _h_2 = self._fc_2(_h_1) - _h_3 = self._fc_3(_h_2) - self._pred = self._output(_h_3) - return self._pred + h1 = self.fc1(obs) + h2 = self.fc2(h1) + Q = self.fc3(h2) + return Q diff --git a/examples/DQN/replay_memory.py b/examples/DQN/replay_memory.py old mode 100644 new mode 100755 index ea8c6565155ddacae568e901566f9b390ee3a8b8..c9474a0dce8d3cc9f5d5610cafbd1df5b1a03586 --- a/examples/DQN/replay_memory.py +++ b/examples/DQN/replay_memory.py @@ -12,103 +12,35 @@ # See the License for the specific language governing permissions and # limitations under the License. -import numpy as np -import copy -from collections import deque, namedtuple +# Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +import random +import collections +import numpy as np class ReplayMemory(object): - def __init__(self, max_size, state_shape, context_len): - self.max_size = int(max_size) - self.state_shape = state_shape - self.context_len = int(context_len) - - self.state = np.zeros((self.max_size, ) + state_shape, dtype='uint8') - self.action = np.zeros((self.max_size, ), dtype='int32') - self.reward = np.zeros((self.max_size, ), dtype='float32') - self.isOver = np.zeros((self.max_size, ), dtype='bool') - - self._curr_size = 0 - self._curr_pos = 0 - self._context = deque(maxlen=context_len - 1) + def __init__(self, max_size): + self.buffer = collections.deque(maxlen=max_size) def append(self, exp): - """append a new experience into replay memory - """ - if self._curr_size < self.max_size: - self._assign(self._curr_pos, exp) - self._curr_size += 1 - else: - self._assign(self._curr_pos, exp) - self._curr_pos = (self._curr_pos + 1) % self.max_size - if exp.isOver: - self._context.clear() - else: - self._context.append(exp) - - def recent_state(self): - """ maintain recent state for training""" - lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ - (self._context.maxlen - len(lst)) - states.extend([k.state for k in lst]) - return states + self.buffer.append(exp) - def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state - """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + def sample(self, batch_size): + mini_batch = random.sample(self.buffer, batch_size) + obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [] - # confirm that no frame was generated from last episode - has_last_episode = False - for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] - if self.isOver[to_check_idx]: - has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = 
self.state[state_idx] - break + for experience in mini_batch: + s, a, r, s_p, done = experience + obs_batch.append(s) + action_batch.append(a) + reward_batch.append(r) + next_obs_batch.append(s_p) + done_batch.append(done) - if not has_last_episode: - state = self.state[state_idx] - - real_idx = (idx + self.context_len - 1) % self._curr_size - action = self.action[real_idx] - reward = self.reward[real_idx] - isOver = self.isOver[real_idx] - return state, reward, action, isOver + return np.array(obs_batch).astype('float32'), \ + np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ + np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') def __len__(self): - return self._curr_size - - def size(self): - return self._curr_size - - def _assign(self, pos, exp): - self.state[pos] = exp.state - self.reward[pos] = exp.reward - self.action[pos] = exp.action - self.isOver[pos] = exp.isOver - - def sample_batch(self, batch_size): - """sample a batch from replay memory for training - """ - batch_idx = np.random.randint( - self._curr_size - self.context_len - 1, size=batch_size) - batch_idx = (self._curr_pos + batch_idx) % self._curr_size - batch_exp = [self.sample(i) for i in batch_idx] - return self._process_batch(batch_exp) - - def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') - reward = np.asarray([e[1] for e in batch_exp], dtype='float32') - action = np.asarray([e[2] for e in batch_exp], dtype='int8') - isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return len(self.buffer) diff --git a/examples/DQN/train.py b/examples/DQN/train.py old mode 100644 new mode 100755 index 3149e6b81a34e81aff038a12994e7eb4e91eac22..b634b122eff4abc7177ff830387560c95fb2aa2b --- a/examples/DQN/train.py +++ b/examples/DQN/train.py @@ -12,160 +12,100 @@ # See the License for the specific language governing permissions and # limitations under the License. 
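For reference, a minimal usage sketch of the deque-based `ReplayMemory` added above; the import path and the random CartPole-sized transitions are assumptions made for illustration, not part of this change:
```
import numpy as np
from replay_memory import ReplayMemory  # the deque-based buffer defined above

rpm = ReplayMemory(max_size=200)

# Store (obs, action, reward, next_obs, done) tuples, as run_episode does.
for _ in range(100):
    obs, next_obs = np.random.randn(4), np.random.randn(4)  # CartPole-sized observations
    rpm.append((obs, 0, 1.0, next_obs, False))

# Once len(rpm) passes the warm-up threshold, sample a float32 training batch.
obs_b, act_b, rew_b, next_obs_b, done_b = rpm.sample(batch_size=32)
print(obs_b.shape, obs_b.dtype)  # (32, 4) float32
```
Compared with the array-based, frame-stacking buffer it replaces, this version keeps only what the CartPole demo needs, which is why the training script below stays short.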
-import argparse import gym -import paddle.fluid as fluid import numpy as np -import os import parl -from atari_agent import AtariAgent -from atari_model import AtariModel -from datetime import datetime -from replay_memory import ReplayMemory, Experience -from parl.utils import tensorboard, logger -from tqdm import tqdm -from utils import get_player - -MEMORY_SIZE = 1e6 -MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20 -IMAGE_SIZE = (84, 84) -CONTEXT_LEN = 4 -FRAME_SKIP = 4 -UPDATE_FREQ = 4 -GAMMA = 0.99 -LEARNING_RATE = 3e-4 - - -def run_train_episode(env, agent, rpm): - total_reward = 0 - all_cost = [] - state = env.reset() - steps = 0 - while True: - steps += 1 - context = rpm.recent_state() - context.append(state) - context = np.stack(context, axis=0) - action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - rpm.append(Experience(state, action, reward, isOver)) - # start training - if rpm.size() > MEMORY_WARMUP_SIZE: - if steps % UPDATE_FREQ == 0: - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( - args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) - all_cost.append(float(cost)) - total_reward += reward - state = next_state - if isOver: - break - if all_cost: - logger.info('[Train]total_reward: {}, mean_cost: {}'.format( - total_reward, np.mean(all_cost))) - return total_reward, steps, np.mean(all_cost) +from parl.utils import logger +from cartpole_model import CartpoleModel +from cartpole_agent import CartpoleAgent -def run_evaluate_episode(env, agent): - state = env.reset() +from replay_memory import ReplayMemory + +LEARN_FREQ = 5 # update parameters every 5 steps +MEMORY_SIZE = 20000 # replay memory size +MEMORY_WARMUP_SIZE = 200 # store some experiences in the replay memory in advance +BATCH_SIZE = 32 +LEARNING_RATE = 0.0005 +GAMMA = 0.99 # discount factor of reward + + +def run_episode(agent, env, rpm): total_reward = 0 + obs = env.reset() + step = 0 while True: - action = agent.predict(state) - state, reward, isOver, info = env.step(action) + step += 1 + action = agent.sample(obs) + next_obs, reward, isOver, _ = env.step(action) + rpm.append((obs, action, reward, next_obs, isOver)) + + # train model + if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0): + (batch_obs, batch_action, batch_reward, batch_next_obs, + batch_isOver) = rpm.sample(BATCH_SIZE) + train_loss = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) + total_reward += reward + obs = next_obs if isOver: break return total_reward +def evaluate(agent, env, render=False): + # test part, run 5 episodes and average + eval_reward = [] + for i in range(5): + obs = env.reset() + episode_reward = 0 + isOver = False + while not isOver: + action = agent.predict(obs) + if render: + env.render() + obs, reward, isOver, _ = env.step(action) + episode_reward += reward + eval_reward.append(episode_reward) + return np.mean(eval_reward) + + def main(): - env = get_player( - args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP) - test_env = get_player( - args.rom, - image_size=IMAGE_SIZE, - frame_skip=FRAME_SKIP, - context_len=CONTEXT_LEN) - rpm = ReplayMemory(MEMORY_SIZE, IMAGE_SIZE, CONTEXT_LEN) - act_dim = env.action_space.n - - model = AtariModel(act_dim, args.algo) - if args.algo == 'Double': - algorithm = parl.algorithms.DDQN(model, act_dim=act_dim, gamma=GAMMA) 
- elif args.algo in ['DQN', 'Dueling']: - algorithm = parl.algorithms.DQN(model, act_dim=act_dim, gamma=GAMMA) - agent = AtariAgent( + env = gym.make('CartPole-v0') + action_dim = env.action_space.n + obs_shape = env.observation_space.shape + + rpm = ReplayMemory(MEMORY_SIZE) + + model = CartpoleModel(act_dim=action_dim) + algorithm = parl.algorithms.DQN( + model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE) + agent = CartpoleAgent( algorithm, - act_dim=act_dim, - start_lr=LEARNING_RATE, - total_step=args.train_total_steps, - update_freq=UPDATE_FREQ) - - with tqdm( - total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar: - while rpm.size() < MEMORY_WARMUP_SIZE: - total_reward, steps, _ = run_train_episode(env, agent, rpm) - pbar.update(steps) - - # train - test_flag = 0 - pbar = tqdm(total=args.train_total_steps) - total_steps = 0 - max_reward = None - while total_steps < args.train_total_steps: - # start epoch - total_reward, steps, loss = run_train_episode(env, agent, rpm) - total_steps += steps - pbar.set_description('[train]exploration:{}'.format(agent.exploration)) - tensorboard.add_scalar('dqn/score', total_reward, total_steps) - tensorboard.add_scalar('dqn/loss', loss, - total_steps) # mean of total loss - tensorboard.add_scalar('dqn/exploration', agent.exploration, - total_steps) - pbar.update(steps) - - if total_steps // args.test_every_steps >= test_flag: - while total_steps // args.test_every_steps >= test_flag: - test_flag += 1 - pbar.write("testing") - eval_rewards = [] - for _ in tqdm(range(3), desc='eval agent'): - eval_reward = run_evaluate_episode(test_env, agent) - eval_rewards.append(eval_reward) - logger.info( - "eval_agent done, (steps, eval_reward): ({}, {})".format( - total_steps, np.mean(eval_rewards))) - eval_test = np.mean(eval_rewards) - tensorboard.add_scalar('dqn/eval', eval_test, total_steps) - - pbar.close() + obs_dim=obs_shape[0], + act_dim=action_dim, + e_greed=0.1, # explore + e_greed_decrement=1e-6 + ) # probability of exploring is decreasing during training + + while len(rpm) < MEMORY_WARMUP_SIZE: # warm up replay memory + run_episode(agent, env, rpm) + + max_episode = 2000 + + # start train + episode = 0 + while episode < max_episode: + # train part + for i in range(0, 50): + total_reward = run_episode(agent, env, rpm) + episode += 1 + + eval_reward = evaluate(agent, env) + logger.info('episode:{} test_reward:{}'.format( + episode, eval_reward)) if __name__ == '__main__': - parser = argparse.ArgumentParser() - parser.add_argument( - '--rom', help='path of the rom of the atari game', required=True) - parser.add_argument( - '--batch_size', type=int, default=64, help='batch size for training') - parser.add_argument( - '--algo', - default='DQN', - help= - 'DQN/DDQN/Dueling, represent DQN, double DQN, and dueling DQN respectively', - ) - parser.add_argument( - '--train_total_steps', - type=int, - default=int(1e7), - help='maximum environmental steps of games') - parser.add_argument( - '--test_every_steps', - type=int, - default=100000, - help='the step interval between two consecutive evaluations') - - args = parser.parse_args() main() diff --git a/examples/DQN/.benchmark/merge.png b/examples/DQN_variant/.benchmark/merge.png similarity index 100% rename from examples/DQN/.benchmark/merge.png rename to examples/DQN_variant/.benchmark/merge.png diff --git a/examples/DQN/.benchmark/table.png b/examples/DQN_variant/.benchmark/table.png similarity index 100% rename from examples/DQN/.benchmark/table.png rename to 
examples/DQN_variant/.benchmark/table.png diff --git a/examples/DQN_variant/README.md b/examples/DQN_variant/README.md new file mode 100644 index 0000000000000000000000000000000000000000..351e44754ad82125eec4e1346fd6301e8c1555b7 --- /dev/null +++ b/examples/DQN_variant/README.md @@ -0,0 +1,37 @@ +## Reproduce DQN with PARL +Based on PARL, the DQN algorithm of deep reinforcement learning has been reproduced, reaching the same level of indicators as the paper in Atari benchmarks. + ++ DQN in +[Human-level Control Through Deep Reinforcement Learning](http://www.nature.com/nature/journal/v518/n7540/full/nature14236.html) + +### Atari games introduction +Please see [here](https://gym.openai.com/envs/#atari) to know more about Atari games. + +### Benchmark result + +Mean episode rewards for 10 million training steps. + +pong + +Performance of DQN on various environments + +

+result +

+ +## How to use +### Dependencies: ++ [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) ++ [parl](https://github.com/PaddlePaddle/PARL) ++ gym ++ tqdm ++ atari-py ++ [ale_python_interface](https://github.com/mgbellemare/Arcade-Learning-Environment) + + +### Start Training: +``` +# To train a model for Pong game +python train.py --rom ./rom_files/pong.bin +``` +> To train more games, you can install more rom files from [here](https://github.com/openai/atari-py/tree/master/atari_py/atari_roms). diff --git a/examples/DQN/atari.py b/examples/DQN_variant/atari.py similarity index 100% rename from examples/DQN/atari.py rename to examples/DQN_variant/atari.py diff --git a/examples/DQN/atari_agent.py b/examples/DQN_variant/atari_agent.py similarity index 99% rename from examples/DQN/atari_agent.py rename to examples/DQN_variant/atari_agent.py index 4af4478048bc582f5951446920a1686ed497b3b4..8a33ac4369f4d9f0c55d12c82b6fded63eedbc77 100644 --- a/examples/DQN/atari_agent.py +++ b/examples/DQN_variant/atari_agent.py @@ -106,7 +106,7 @@ class AtariAgent(parl.Agent): 'reward': reward, 'next_obs': next_obs.astype('float32'), 'terminal': terminal, - 'lr': lr + 'lr': np.float32(lr) } cost = self.fluid_executor.run( self.learn_program, feed=feed, fetch_list=[self.cost])[0] diff --git a/examples/DQN/atari_model.py b/examples/DQN_variant/atari_model.py similarity index 100% rename from examples/DQN/atari_model.py rename to examples/DQN_variant/atari_model.py diff --git a/examples/DQN/atari_wrapper.py b/examples/DQN_variant/atari_wrapper.py similarity index 100% rename from examples/DQN/atari_wrapper.py rename to examples/DQN_variant/atari_wrapper.py diff --git a/examples/DQN_variant/replay_memory.py b/examples/DQN_variant/replay_memory.py new file mode 100644 index 0000000000000000000000000000000000000000..25ef0e50a7e53fd68d19b6ac3bffb807b18d6d29 --- /dev/null +++ b/examples/DQN_variant/replay_memory.py @@ -0,0 +1,113 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import numpy as np +import copy +from collections import deque, namedtuple + +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) + + +class ReplayMemory(object): + def __init__(self, max_size, obs_shape, context_len): + self.max_size = int(max_size) + self.obs_shape = obs_shape + self.context_len = int(context_len) + + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') + self.action = np.zeros((self.max_size, ), dtype='int32') + self.reward = np.zeros((self.max_size, ), dtype='float32') + self.isOver = np.zeros((self.max_size, ), dtype='bool') + + self._curr_size = 0 + self._curr_pos = 0 + self._context = deque(maxlen=context_len - 1) + + def append(self, exp): + """append a new experience into replay memory + """ + if self._curr_size < self.max_size: + self._assign(self._curr_pos, exp) + self._curr_size += 1 + else: + self._assign(self._curr_pos, exp) + self._curr_pos = (self._curr_pos + 1) % self.max_size + if exp.isOver: + self._context.clear() + else: + self._context.append(exp) + + def recent_obs(self): + """ maintain recent obs for training""" + lst = list(self._context) + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ + (self._context.maxlen - len(lst)) + obs.extend([k.obs for k in lst]) + return obs + + def sample(self, idx): + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs + """ + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size + + # confirm that no frame was generated from last episode + has_last_episode = False + for k in range(self.context_len - 2, -1, -1): + to_check_idx = obs_idx[k] + if self.isOver[to_check_idx]: + has_last_episode = True + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] + break + + if not has_last_episode: + obs = self.obs[obs_idx] + + real_idx = (idx + self.context_len - 1) % self._curr_size + action = self.action[real_idx] + reward = self.reward[real_idx] + isOver = self.isOver[real_idx] + return obs, reward, action, isOver + + def __len__(self): + return self._curr_size + + def size(self): + return self._curr_size + + def _assign(self, pos, exp): + self.obs[pos] = exp.obs + self.reward[pos] = exp.reward + self.action[pos] = exp.action + self.isOver[pos] = exp.isOver + + def sample_batch(self, batch_size): + """sample a batch from replay memory for training + """ + batch_idx = np.random.randint( + self._curr_size - self.context_len - 1, size=batch_size) + batch_idx = (self._curr_pos + batch_idx) % self._curr_size + batch_exp = [self.sample(i) for i in batch_idx] + return self._process_batch(batch_exp) + + def _process_batch(self, batch_exp): + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') + reward = np.asarray([e[1] for e in batch_exp], dtype='float32') + action = np.asarray([e[2] for e in batch_exp], dtype='int8') + isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') + return [obs, action, reward, isOver] diff --git a/examples/DQN/rom_files/breakout.bin b/examples/DQN_variant/rom_files/breakout.bin similarity index 100% rename from examples/DQN/rom_files/breakout.bin rename to examples/DQN_variant/rom_files/breakout.bin diff --git a/examples/DQN/rom_files/pong.bin b/examples/DQN_variant/rom_files/pong.bin similarity index 100% rename from examples/DQN/rom_files/pong.bin rename to examples/DQN_variant/rom_files/pong.bin diff --git a/examples/DQN_variant/train.py 
b/examples/DQN_variant/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5ca16df135c346a08f87efc6a694e1e289b8192c --- /dev/null +++ b/examples/DQN_variant/train.py @@ -0,0 +1,169 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import argparse +import gym +import paddle.fluid as fluid +import numpy as np +import os +import parl +from atari_agent import AtariAgent +from atari_model import AtariModel +from datetime import datetime +from replay_memory import ReplayMemory, Experience +from parl.utils import summary, logger +from tqdm import tqdm +from utils import get_player + +MEMORY_SIZE = 1e6 +MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20 +IMAGE_SIZE = (84, 84) +CONTEXT_LEN = 4 +FRAME_SKIP = 4 +UPDATE_FREQ = 4 +GAMMA = 0.99 +LEARNING_RATE = 3e-4 + + +def run_train_episode(env, agent, rpm): + total_reward = 0 + all_cost = [] + obs = env.reset() + steps = 0 + while True: + steps += 1 + context = rpm.recent_obs() + context.append(obs) + context = np.stack(context, axis=0) + action = agent.sample(context) + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) + # start training + if rpm.size() > MEMORY_WARMUP_SIZE: + if steps % UPDATE_FREQ == 0: + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( + args.batch_size) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) + all_cost.append(float(cost)) + total_reward += reward + obs = next_obs + if isOver: + break + if all_cost: + logger.info('[Train]total_reward: {}, mean_cost: {}'.format( + total_reward, np.mean(all_cost))) + return total_reward, steps, np.mean(all_cost) + + +def run_evaluate_episode(env, agent): + obs = env.reset() + total_reward = 0 + while True: + action = agent.predict(obs) + obs, reward, isOver, info = env.step(action) + total_reward += reward + if isOver: + break + return total_reward + + +def main(): + env = get_player( + args.rom, image_size=IMAGE_SIZE, train=True, frame_skip=FRAME_SKIP) + test_env = get_player( + args.rom, + image_size=IMAGE_SIZE, + frame_skip=FRAME_SKIP, + context_len=CONTEXT_LEN) + rpm = ReplayMemory(MEMORY_SIZE, IMAGE_SIZE, CONTEXT_LEN) + act_dim = env.action_space.n + + model = AtariModel(act_dim, args.algo) + if args.algo == 'Double': + algorithm = parl.algorithms.DDQN(model, act_dim=act_dim, gamma=GAMMA) + elif args.algo in ['DQN', 'Dueling']: + algorithm = parl.algorithms.DQN(model, act_dim=act_dim, gamma=GAMMA) + agent = AtariAgent( + algorithm, + act_dim=act_dim, + start_lr=LEARNING_RATE, + total_step=args.train_total_steps, + update_freq=UPDATE_FREQ) + + with tqdm( + total=MEMORY_WARMUP_SIZE, desc='[Replay Memory Warm Up]') as pbar: + while rpm.size() < MEMORY_WARMUP_SIZE: + total_reward, steps, _ = run_train_episode(env, agent, rpm) + pbar.update(steps) + + # train + test_flag = 0 + pbar = 
tqdm(total=args.train_total_steps) + total_steps = 0 + max_reward = None + while total_steps < args.train_total_steps: + # start epoch + total_reward, steps, loss = run_train_episode(env, agent, rpm) + total_steps += steps + pbar.set_description('[train]exploration:{}'.format(agent.exploration)) + summary.add_scalar('dqn/score', total_reward, total_steps) + summary.add_scalar('dqn/loss', loss, total_steps) # mean of total loss + summary.add_scalar('dqn/exploration', agent.exploration, total_steps) + pbar.update(steps) + + if total_steps // args.test_every_steps >= test_flag: + while total_steps // args.test_every_steps >= test_flag: + test_flag += 1 + pbar.write("testing") + eval_rewards = [] + for _ in tqdm(range(3), desc='eval agent'): + eval_reward = run_evaluate_episode(test_env, agent) + eval_rewards.append(eval_reward) + logger.info( + "eval_agent done, (steps, eval_reward): ({}, {})".format( + total_steps, np.mean(eval_rewards))) + eval_test = np.mean(eval_rewards) + summary.add_scalar('dqn/eval', eval_test, total_steps) + + pbar.close() + + +if __name__ == '__main__': + parser = argparse.ArgumentParser() + parser.add_argument( + '--rom', help='path of the rom of the atari game', required=True) + parser.add_argument( + '--batch_size', type=int, default=64, help='batch size for training') + parser.add_argument( + '--algo', + default='DQN', + help= + 'DQN/DDQN/Dueling, represent DQN, double DQN, and dueling DQN respectively', + ) + parser.add_argument( + '--train_total_steps', + type=int, + default=int(1e7), + help='maximum environmental steps of games') + parser.add_argument( + '--test_every_steps', + type=int, + default=100000, + help='the step interval between two consecutive evaluations') + + args = parser.parse_args() + main() diff --git a/examples/DQN/utils.py b/examples/DQN_variant/utils.py similarity index 100% rename from examples/DQN/utils.py rename to examples/DQN_variant/utils.py diff --git a/examples/ES/README.md b/examples/ES/README.md index 207ae2dafa68c5f7d2eb30f956355b07c1bd5d61..d868202753fa34c0799c8c58975c958aa1ffe001 100644 --- a/examples/ES/README.md +++ b/examples/ES/README.md @@ -34,7 +34,7 @@ Then we can start the distributed training by running: python train.py ``` -Training result will be saved in `train_log` with training curve that can be visualized in tensorboard data. +Training result will be saved in `train_log` with training curve. 
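Since several training scripts in this change switch from `parl.utils.tensorboard` to `parl.utils.summary`, here is a minimal sketch of the renamed call site; the metric name and values are made up for illustration, but the `add_scalar(tag, value, step)` pattern is the one used throughout this diff:
```
from parl.utils import summary

# Same add_scalar(tag, value, step) interface the training scripts use after the rename.
for step, reward in [(1000, 12.0), (2000, 35.5), (3000, 87.0)]:
    summary.add_scalar('train/episode_reward', reward, step)
```
The README wording above drops the explicit tensorboard mention for the same reason.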
### Reference + [Ray](https://github.com/ray-project/ray) diff --git a/examples/ES/train.py b/examples/ES/train.py index be2c7d703eeba39931312491274f554ee9a76562..eadf26ea6e7d736abe45e7c08d25a5c7ae8dda2e 100644 --- a/examples/ES/train.py +++ b/examples/ES/train.py @@ -23,7 +23,7 @@ from obs_filter import MeanStdFilter from mujoco_agent import MujocoAgent from mujoco_model import MujocoModel from noise import SharedNoiseTable -from parl.utils import logger, tensorboard +from parl.utils import logger, summary from parl.utils.window_stat import WindowStat from six.moves import queue from actor import Actor @@ -202,7 +202,7 @@ class Learner(object): logger.info(metrics) for k, v in metrics.items(): if v is not None: - tensorboard.add_scalar(k, v, self.sample_total_steps) + summary.add_scalar(k, v, self.sample_total_steps) if __name__ == '__main__': diff --git a/examples/GA3C/train.py b/examples/GA3C/train.py index edc7f33344bc484fff640700dfd80bfc35987843..30f3a415b77cfa83d8868606498379b528ad1c31 100755 --- a/examples/GA3C/train.py +++ b/examples/GA3C/train.py @@ -24,7 +24,7 @@ from atari_model import AtariModel from atari_agent import AtariAgent from collections import defaultdict from parl.env.atari_wrappers import wrap_deepmind -from parl.utils import logger, get_gpu_count, tensorboard +from parl.utils import logger, get_gpu_count, summary from parl.utils.scheduler import PiecewiseScheduler from parl.utils.time_stat import TimeStat from parl.utils.window_stat import WindowStat @@ -313,7 +313,7 @@ class Learner(object): for key, value in metric.items(): if value is not None: - tensorboard.add_scalar(key, value, self.sample_total_steps) + summary.add_scalar(key, value, self.sample_total_steps) logger.info(metric) diff --git a/examples/IMPALA/atari_agent.py b/examples/IMPALA/atari_agent.py index 98d4a4c4fd3ea611f60f2d8da850265025541b4b..0746f951f6920a70b0af87430af51879b635ada7 100755 --- a/examples/IMPALA/atari_agent.py +++ b/examples/IMPALA/atari_agent.py @@ -58,7 +58,10 @@ class AtariAgent(parl.Agent): lr = layers.data( name='lr', shape=[1], dtype='float32', append_batch_size=False) entropy_coeff = layers.data( - name='entropy_coeff', shape=[], dtype='float32') + name='entropy_coeff', + shape=[1], + dtype='float32', + append_batch_size=False) self.learn_reader = fluid.layers.create_py_reader_by_data( capacity=32, diff --git a/examples/IMPALA/train.py b/examples/IMPALA/train.py index 8440ee78cec30f5de568ea277769fe1df938ed9f..9f2a3e65a7962d0aed103318c4a1979520004f8f 100755 --- a/examples/IMPALA/train.py +++ b/examples/IMPALA/train.py @@ -22,7 +22,7 @@ import parl from atari_model import AtariModel from atari_agent import AtariAgent from parl.env.atari_wrappers import wrap_deepmind -from parl.utils import logger, tensorboard, get_gpu_count +from parl.utils import logger, summary, get_gpu_count from parl.utils.scheduler import PiecewiseScheduler from parl.utils.time_stat import TimeStat from parl.utils.window_stat import WindowStat @@ -121,7 +121,9 @@ class Learner(object): yield [ obs_np, actions_np, behaviour_logits_np, rewards_np, - dones_np, self.lr, self.entropy_coeff + dones_np, + np.float32(self.lr), + np.array([self.entropy_coeff], dtype='float32') ] def run_learn(self): @@ -219,7 +221,7 @@ class Learner(object): min_episode_steps = np.min(np.array(episode_steps).flatten()) metric = { - 'Sample steps': self.sample_total_steps, + 'sample_steps': self.sample_total_steps, 'max_episode_rewards': max_episode_rewards, 'mean_episode_rewards': mean_episode_rewards, 'min_episode_rewards': 
min_episode_rewards, @@ -242,7 +244,7 @@ class Learner(object): for key, value in metric.items(): if value is not None: - tensorboard.add_scalar(key, value, self.sample_total_steps) + summary.add_scalar(key, value, self.sample_total_steps) logger.info(metric) diff --git a/examples/LiftSim_baseline/README.md b/examples/LiftSim_baseline/README.md deleted file mode 100644 index bfc903402d2665fb00e518ae1df77a1b8c88dae5..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/README.md +++ /dev/null @@ -1,28 +0,0 @@ -# LiftSim基线 - -## 简介 - -基于PARL库实现Deep Q-network算法,应用于[RLSchool][rlschool]库中的电梯调度模拟环境[LiftSim][liftsim]。 - -## 依赖库 - -- paddlepaddle >= 1.5.1 -- parl >= 1.1.2 -- rlschool >= 0.0.1 - -Windows版本仅支持Python3.5及以上版本。 - -## 运行 - -```python -python demo.py -``` - -## Benchmark - - - -Accumulated Reward:每3600 steps内reward的总和,可体现电梯调度在单位时间(模拟环境0.5小时)内的效率。 - -[rlschool]: https://github.com/PaddlePaddle/RLSchool -[liftsim]: https://github.com/PaddlePaddle/RLSchool/tree/master/rlschool/liftsim diff --git a/examples/LiftSim_baseline/demo.py b/examples/LiftSim_baseline/demo.py deleted file mode 100644 index cecbf6c1a34d9060dac90abf6e4d648aa0f9a870..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/demo.py +++ /dev/null @@ -1,48 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -from rlschool import LiftSim -from wrapper import Wrapper, ActionWrapper, ObservationWrapper -from rl_benchmark.dispatcher import RL_dispatcher -import sys -import argparse - - -# run main program with args -def run_main(args): - - parser = argparse.ArgumentParser(description='demo configuration') - parser.add_argument( - '--iterations', - type=int, - default=100000000, - help='total number of iterations') - args = parser.parse_args(args) - print('iterations:', args.iterations) - - mansion_env = LiftSim() - # mansion_env.seed(1988) - - mansion_env = Wrapper(mansion_env) - mansion_env = ActionWrapper(mansion_env) - mansion_env = ObservationWrapper(mansion_env) - - dispatcher = RL_dispatcher(mansion_env, args.iterations) - dispatcher.run_episode() - - return 0 - - -if __name__ == "__main__": - run_main(sys.argv[1:]) diff --git a/examples/LiftSim_baseline/rl_10.png b/examples/LiftSim_baseline/rl_10.png deleted file mode 100644 index b8f9eef1d10c0a617d8dd462f1d66e5d26484622..0000000000000000000000000000000000000000 Binary files a/examples/LiftSim_baseline/rl_10.png and /dev/null differ diff --git a/examples/LiftSim_baseline/rl_benchmark/__init__.py b/examples/LiftSim_baseline/rl_benchmark/__init__.py deleted file mode 100644 index eca2dce114b069bf9b455d77ce670d73b5047fd2..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/rl_benchmark/__init__.py +++ /dev/null @@ -1,13 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. 
-# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. diff --git a/examples/LiftSim_baseline/rl_benchmark/agent.py b/examples/LiftSim_baseline/rl_benchmark/agent.py deleted file mode 100644 index 846bcf318090916141a4216abb3a889d2548d2ff..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/rl_benchmark/agent.py +++ /dev/null @@ -1,111 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import os -import numpy as np -import numpy.random as random -import paddle.fluid as fluid -from parl import layers -from parl import Agent -from parl.utils import get_gpu_count, machine_info - - -class ElevatorAgent(Agent): - def __init__(self, algorithm, obs_dim, action_dim): - self._action_dim = action_dim - self._obs_dim = obs_dim - self._update_target_steps = 1000 - - self._global_step = 0 - self.exploration_ratio = 0.9 - self.exploration_decre = 1e-7 - self.exploration_min = 0.1 - super(ElevatorAgent, self).__init__(algorithm) - - use_cuda = machine_info.is_gpu_available() - if self.gpu_id >= 0: - assert get_gpu_count() == 1, 'Only support training in single GPU,\ - Please set environment variable: `export CUDA_VISIBLE_DEVICES=[GPU_ID_YOU_WANT_TO_USE]` .' 
- - else: - os.environ['CPU_NUM'] = str(1) - - exec_strategy = fluid.ExecutionStrategy() - exec_strategy.num_threads = 1 - exec_strategy.num_iteration_per_drop_scope = 10 - build_strategy = fluid.BuildStrategy() - build_strategy.remove_unnecessary_lock = False - - self.learn_pe = fluid.ParallelExecutor( - use_cuda=use_cuda, - main_program=self.learn_program, - build_strategy=build_strategy, - exec_strategy=exec_strategy, - ) - - def build_program(self): - self.pred_program = fluid.Program() - self.learn_program = fluid.Program() - - with fluid.program_guard(self.pred_program): - obs = layers.data( - name='obs', shape=[self._obs_dim], dtype='float32') - self._value = self.alg.define_predict(obs) - - with fluid.program_guard(self.learn_program): - obs = layers.data( - name='obs', shape=[self._obs_dim], dtype='float32') - action = layers.data(name='act', shape=[1], dtype='int32') - reward = layers.data(name='reward', shape=[], dtype='float32') - next_obs = layers.data( - name='next_obs', shape=[self._obs_dim], dtype='float32') - terminal = layers.data(name='terminal', shape=[], dtype='bool') - self._cost = self.alg.define_learn(obs, action, reward, next_obs, - terminal) - - def sample(self, obs): - if self.exploration_ratio > self.exploration_min: - self.exploration_ratio -= self.exploration_decre - q_values = self.predict(obs) - - ret_actions = list() - for i in range(len(q_values)): # number of elevators - if (random.random() < self.exploration_ratio): - action = random.randint(0, self._action_dim) - else: - action = np.argmax(q_values[i]) - ret_actions.append(int(action)) - return ret_actions - - def predict(self, obs): - pred_Q = self.fluid_executor.run( - self.pred_program, - feed={'obs': obs.astype('float32')}, - fetch_list=[self._value]) - return pred_Q[0] - - def learn(self, obs, act, reward, next_obs, terminal): - self._global_step += 1 - if self._global_step % self._update_target_steps == 0: - self.alg.sync_target(self.gpu_id) - - feed = { - 'obs': obs.astype('float32'), - 'act': act.astype('int32'), - 'reward': reward, - 'next_obs': next_obs.astype('float32'), - 'terminal': terminal - } - cost = self.learn_pe.run(feed=feed, fetch_list=[self._cost.name])[0] - return cost diff --git a/examples/LiftSim_baseline/rl_benchmark/dispatcher.py b/examples/LiftSim_baseline/rl_benchmark/dispatcher.py deleted file mode 100644 index a2561ee6d3d9f2c6b8f39c886ef4a1f2f01fb1ea..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/rl_benchmark/dispatcher.py +++ /dev/null @@ -1,105 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import parl -import numpy as np -import numpy.random as random - -from copy import deepcopy -from collections import deque - -from rlschool import EPSILON, HUGE -from rl_benchmark.model import RLDispatcherModel -from rl_benchmark.agent import ElevatorAgent -from parl.algorithms import DQN -from parl.utils import ReplayMemory - -MEMORY_SIZE = 1000000 -BATCH_SIZE = 64 - - -class RL_dispatcher(): - """ - An RL benchmark for elevator system - """ - - def __init__(self, env, max_episode): - self.env = env - - self._obs_dim = env.observation_space - self._act_dim = env.action_space - self._global_step = 0 - self.max_episode = max_episode - self._rpm = ReplayMemory(MEMORY_SIZE, self._obs_dim, 1) - self._model = RLDispatcherModel(self._act_dim) - hyperparas = { - 'action_dim': self._act_dim, - 'lr': 5.0e-4, - 'gamma': 0.998 - } - - self._algorithm = DQN(self._model, hyperparas) - self._agent = ElevatorAgent(self._algorithm, self._obs_dim, - self._act_dim) - self._warm_up_size = 2000 - self._statistic_freq = 1000 - self._loss_queue = deque() - - def run_episode(self): - self.env.reset() - acc_reward = 0.0 - - while self._global_step < self.max_episode: - # self.env.render() - state = self.env.state - action = self._agent.sample(state) - state_, reward, done, info = self.env.step(action) - output_info = self.learn_step(state, action, reward) - acc_reward += reward - if (isinstance(output_info, dict) and len(output_info) > 0): - self.env.log_notice("%s", output_info) - if (self._global_step % 3600 == 0): - self.env.log_notice( - "Accumulated Reward: %f, Mansion Status: %s", acc_reward, - self.env.statistics) - acc_reward = 0.0 - - self._agent.save('./model.ckpt') - - def learn_step(self, state, action, r): - self._global_step += 1 - if (self._global_step > self._warm_up_size): - for i in range(self.env.elevator_num): - self._rpm.append(self._last_observation_array[i], - self._last_action[i], self._last_reward, - deepcopy(state[i]), False) - self._last_observation_array = deepcopy(state) - self._last_action = deepcopy(action) - self._last_reward = r - - ret_dict = {} - if self._rpm.size() > self._warm_up_size: - batch_obs, batch_action, batch_reward, batch_next_obs, batch_terminal = \ - self._rpm.sample_batch(BATCH_SIZE) - cost = self._agent.learn(batch_obs, batch_action, batch_reward, - batch_next_obs, batch_terminal) - self._loss_queue.appendleft(cost) - if (len(self._loss_queue) > self._statistic_freq): - self._loss_queue.pop() - if (self._global_step % self._statistic_freq == 0): - ret_dict["Temporal Difference Error(Average)"] = \ - float(sum(self._loss_queue)) / float(len(self._loss_queue)) - - return ret_dict diff --git a/examples/LiftSim_baseline/wrapper.py b/examples/LiftSim_baseline/wrapper.py deleted file mode 100644 index 55d525deaeecb76df4f2ba9183ed5ea6c119e5d8..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/wrapper.py +++ /dev/null @@ -1,87 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-# See the License for the specific language governing permissions and -# limitations under the License. - -# wrapper part modified from -# https://github.com/openai/gym/blob/master/gym/core.py - -from rlschool import LiftSim -from wrapper_utils import obs_dim, act_dim, mansion_state_preprocessing -from wrapper_utils import action_idx_to_action - - -class Wrapper(LiftSim): - def __init__(self, env): - self.env = env - self._mansion = env._mansion - self.mansion_attr = self._mansion.attribute - self.elevator_num = self.mansion_attr.ElevatorNumber - self.observation_space = obs_dim(self.mansion_attr) - self.action_space = act_dim(self.mansion_attr) - self.viewer = env.viewer - - def __getattr__(self, name): - if name.startswith('_'): - raise AttributeError( - "attempted to get missing private attribute '{}'".format(name)) - return getattr(self.env, name) - - def seed(self, seed=None): - return self.env.seed(seed) - - def step(self, action): - return self.env.step(action) - - def reset(self): - return self.env.reset() - - def render(self): - return self.env.render() - - def close(self): - return self.env.close() - - -class RewardWrapper(Wrapper): - pass - - -class ActionWrapper(Wrapper): - def reset(self): - return self.env.reset() - - def step(self, action): - act = [] - for a in action: - act.extend(self.action(a, self.action_space)) - return self.env.step(act) - - def action(self, action, action_space): - return action_idx_to_action(action, action_space) - - -class ObservationWrapper(Wrapper): - def reset(self): - self.env.reset() - return self.observation(self._mansion.state) - - def step(self, action): - observation, reward, done, info = self.env.step(action) - return (self.observation(observation), reward, done, info) - - def observation(self, observation): - return mansion_state_preprocessing(observation) - - @property - def state(self): - return self.observation(self._mansion.state) diff --git a/examples/LiftSim_baseline/wrapper_utils.py b/examples/LiftSim_baseline/wrapper_utils.py deleted file mode 100644 index 45afcefbf9ebdbacc2841bd54b1756a1213be5bf..0000000000000000000000000000000000000000 --- a/examples/LiftSim_baseline/wrapper_utils.py +++ /dev/null @@ -1,241 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -import sys -import random -import numpy as np -from rlschool import ElevatorState, ElevatorAction -from rlschool import MansionAttribute, MansionState -from rlschool import EPSILON, HUGE -from rlschool import MansionConfig -from rlschool import MansionManager - - -def discretize(value, n_dim, min_val, max_val): - """ - discretize a value into a vector of n_dim dimension 1-hot representation - with the value below min_val being [1, 0, 0, ..., 0] - and the value above max_val being [0, 0, ..., 0, 1] - Args: - value: the value that needs to be discretized into 1-hot format - n_dim: number of dimensions - min_val: minimal value in the result - man_val: maximum value in the result - Returns: - the discretized vector - """ - assert n_dim > 0 - if (n_dim == 1): - return [1] - delta = (max_val - min_val) / float(n_dim - 1) - active_pos = int((value - min_val) / delta + 0.5) - active_pos = min(n_dim - 1, active_pos) - active_pos = max(0, active_pos) - ret_array = [0 for i in range(n_dim)] - ret_array[active_pos] = 1.0 - return ret_array - - -def linear_discretize(value, n_dim, min_val, max_val): - """ - discretize a value into a vector of n_dim dimensional representation - with the value below min_val being [1, 0, 0, ..., 0] - and the value above max_val being [0, 0, ..., 0, 1] - e.g. if n_dim = 2, min_val = 1.0, max_val = 2.0 - if value = 1.5 returns [0.5, 0.5], if value = 1.8 returns [0.2, 0.8] - Args: - value: the value that needs to be discretized - n_dim: number of dimensions - min_val: minimal value in the result - man_val: maximum value in the result - Returns: - the discretized vector - """ - assert n_dim > 0 - if (n_dim == 1): - return [1] - delta = (max_val - min_val) / float(n_dim - 1) - active_pos = int((value - min_val) / delta + 0.5) - active_pos = min(n_dim - 2, active_pos) - active_pos = max(0, active_pos) - anchor_pt = active_pos * delta + min_val - if (anchor_pt > value and anchor_pt > min_val + 0.5 * delta): - anchor_pt -= delta - active_pos -= 1 - weight = (value - anchor_pt) / delta - weight = min(1.0, max(0.0, weight)) - ret_array = [0 for i in range(n_dim)] - ret_array[active_pos] = 1.0 - weight - ret_array[active_pos + 1] = weight - return ret_array - - -def ele_state_preprocessing(ele_state): - """Process elevator state, make it usable for network - Args: - ele_state: ElevatorState, nametuple, defined in rlschool/liftsim/environment/mansion/utils.py - Returns: - ele_feature: list of elevator state - """ - ele_feature = [] - - # add floor information - ele_feature.extend( - linear_discretize(ele_state.Floor, ele_state.MaximumFloor, 1.0, - ele_state.MaximumFloor)) - - # add velocity information - ele_feature.extend( - linear_discretize(ele_state.Velocity, 21, -ele_state.MaximumSpeed, - ele_state.MaximumSpeed)) - - # add door information - ele_feature.append(ele_state.DoorState) - ele_feature.append(float(ele_state.DoorIsOpening)) - ele_feature.append(float(ele_state.DoorIsClosing)) - - # add direction information - ele_feature.extend(discretize(ele_state.Direction, 3, -1, 1)) - - # add load weight information - ele_feature.extend( - linear_discretize(ele_state.LoadWeight / ele_state.MaximumLoad, 5, 0.0, - 1.0)) - - # add other information - target_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor)] - for target_floor in ele_state.ReservedTargetFloors: - target_floor_binaries[target_floor - 1] = 1.0 - ele_feature.extend(target_floor_binaries) - - dispatch_floor_binaries = [0.0 for i in range(ele_state.MaximumFloor + 1)] - 
dispatch_floor_binaries[ele_state.CurrentDispatchTarget] = 1.0 - ele_feature.extend(dispatch_floor_binaries) - ele_feature.append(ele_state.DispatchTargetDirection) - - return ele_feature - - -def obs_dim(mansion_attr): - """Calculate the observation dimension - Args: - mansion_attr: MansionAttribute, attribute of mansion_manager - Returns: - observation dimension - """ - assert isinstance(mansion_attr, MansionAttribute) - ele_dim = mansion_attr.NumberOfFloor * 3 + 34 - obs_dim = (ele_dim + 1) * mansion_attr.ElevatorNumber + \ - mansion_attr.NumberOfFloor * 2 - return obs_dim - - -def act_dim(mansion_attr): - """Calculate the action dimension, which is number of floor times 2 plus 2. - The additional two are for special cases: the elevator stops at once if the new dispatch_target is 0, - the original dispatch_target does not change if dispatch_target is -1. See implementation in - method action_idx_to_action below. - Args: - mansion_attr: MansionAttribute, attribute of mansion_manager - Returns: - action dimension - """ - assert isinstance(mansion_attr, MansionAttribute) - return mansion_attr.NumberOfFloor * 2 + 2 - - -def mansion_state_preprocessing(mansion_state): - """Process mansion_state to make it usable for networks, convert it into a numpy array - Args: - mansion_state: namedtuple of mansion state, - defined in rlschool/liftsim/environment/mansion/utils.py - Returns: - the converted numpy array - """ - ele_features = list() - for ele_state in mansion_state.ElevatorStates: - ele_features.append(ele_state_preprocessing(ele_state)) - max_floor = ele_state.MaximumFloor - - target_floor_binaries_up = [0.0 for i in range(max_floor)] - target_floor_binaries_down = [0.0 for i in range(max_floor)] - for floor in mansion_state.RequiringUpwardFloors: - target_floor_binaries_up[floor - 1] = 1.0 - for floor in mansion_state.RequiringDownwardFloors: - target_floor_binaries_down[floor - 1] = 1.0 - target_floor_binaries = target_floor_binaries_up + target_floor_binaries_down - - idx = 0 - man_features = list() - for idx in range(len(mansion_state.ElevatorStates)): - elevator_id_vec = discretize(idx + 1, - len(mansion_state.ElevatorStates), 1, - len(mansion_state.ElevatorStates)) - idx_array = list(range(len(mansion_state.ElevatorStates))) - idx_array.remove(idx) - # random.shuffle(idx_array) - man_features.append(ele_features[idx]) - for left_idx in idx_array: - man_features[idx] = man_features[idx] + ele_features[left_idx] - man_features[idx] = man_features[idx] + \ - elevator_id_vec + target_floor_binaries - return np.asarray(man_features, dtype='float32') - - -def action_idx_to_action(action_idx, act_dim): - """Convert action_inx to action - Args: - action_idx: the index needed to be converted - act_dim: action dimension - Returns: - the converted namedtuple - """ - assert isinstance(action_idx, int) - assert isinstance(act_dim, int) - realdim = act_dim - 2 - if (action_idx == realdim): - return ElevatorAction(0, 1) - elif (action_idx == realdim + 1): - return ElevatorAction(-1, 1) - action = action_idx - if (action_idx < realdim / 2): - direction = 1 - action += 1 - else: - direction = -1 - action -= int(realdim / 2) - action += 1 - return [action, direction] - - -def action_to_action_idx(action, act_dim): - """Convert action to number according to act_dim. 
- Args: - action: namedtuple defined in rlschool/liftsim/environment/mansion/utils.py - act_dim: action dimension - Returns: - action_idx: the result index - """ - assert isinstance(action, ElevatorAction) - assert isinstance(act_dim, int) - realdim = act_dim - 2 - if (action.TargetFloor == 0): - return realdim - elif (action.TargetFloor < 0): - return realdim + 1 - action_idx = 0 - if (action.DirectionIndicator < 0): - action_idx += int(realdim / 2) - action_idx += action.TargetFloor - 1 - return action_idx diff --git a/examples/MADDPG/README.md b/examples/MADDPG/README.md index 55d191474b62f5099d91da51bd80443abd6b87d8..0bf3a599e76a3ecc127385baa0f4d81e47e3662b 100644 --- a/examples/MADDPG/README.md +++ b/examples/MADDPG/README.md @@ -98,7 +98,7 @@ simple_world_comm
+ [paddlepaddle>=1.6.1](https://github.com/PaddlePaddle/Paddle) + [parl](https://github.com/PaddlePaddle/PARL) + [multiagent-particle-envs](https://github.com/openai/multiagent-particle-envs) -+ gym ++ gym==0.10.5 ### Start Training: ``` diff --git a/examples/MADDPG/train.py b/examples/MADDPG/train.py index d0e20dcdb4fd35638432fb1666b76b30c2a388d8..8454a73ee209707c65340897ce9b090d482c6751 100644 --- a/examples/MADDPG/train.py +++ b/examples/MADDPG/train.py @@ -20,7 +20,7 @@ from simple_model import MAModel from simple_agent import MAAgent import parl from parl.env.multiagent_simple_env import MAenv -from parl.utils import logger, tensorboard +from parl.utils import logger, summary def run_episode(env, agents): @@ -62,8 +62,8 @@ def run_episode(env, agents): # learn policy for i, agent in enumerate(agents): critic_loss = agent.learn(agents) - tensorboard.add_scalar('critic_loss_%d' % i, critic_loss, - agent.global_train_step) + summary.add_scalar('critic_loss_%d' % i, critic_loss, + agent.global_train_step) return total_reward, agents_reward, steps @@ -155,12 +155,12 @@ def train_agent(): format(total_steps, total_episodes, mean_episode_reward, use_time)) t_start = time.time() - tensorboard.add_scalar('mean_episode_reward/episode', - mean_episode_reward, total_episodes) - tensorboard.add_scalar('mean_episode_reward/steps', - mean_episode_reward, total_steps) - tensorboard.add_scalar('use_time/1000episode', use_time, - total_episodes) + summary.add_scalar('mean_episode_reward/episode', + mean_episode_reward, total_episodes) + summary.add_scalar('mean_episode_reward/steps', + mean_episode_reward, total_steps) + summary.add_scalar('use_time/1000episode', use_time, + total_episodes) # save model if not args.restore: diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py index e784dae00f9ffdc5528a4c7dafda2916e5d4c456..e3a8066d79128ed9e969bb7d4c1c8cce3bee3775 100755 --- a/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py +++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/evaluate.py @@ -22,7 +22,7 @@ import numpy as np from actor import Actor from opensim_model import OpenSimModel from opensim_agent import OpenSimAgent -from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count +from parl.utils import logger, ReplayMemory, summary, get_gpu_count from parl.utils.window_stat import WindowStat from parl.remote.client import get_global_client from parl.utils import machine_info diff --git a/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py b/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py index b37fb369a15c8a28a3911dbb9a864cf28d1da8b7..cf14f1e0306c69c8f134cf6c81c279ac982b52d0 100755 --- a/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py +++ b/examples/NeurIPS2019-Learn-to-Move-Challenge/train.py @@ -22,7 +22,7 @@ import numpy as np from actor import Actor from opensim_model import OpenSimModel from opensim_agent import OpenSimAgent -from parl.utils import logger, ReplayMemory, tensorboard, get_gpu_count +from parl.utils import logger, ReplayMemory, summary, get_gpu_count from parl.utils.window_stat import WindowStat from parl.remote.client import get_global_client from parl.utils import machine_info @@ -97,7 +97,7 @@ class Learner(object): # add lock between training and predicting self.model_lock = threading.Lock() - # add lock when appending data to rpm or writing scalars to tensorboard + # add lock when appending data to rpm or writing scalars to summary self.memory_lock = threading.Lock() 
self.ready_actor_queue = queue.Queue() @@ -246,24 +246,24 @@ class Learner(object): episode_env_reward) if self.env_reward_stat.count > 500: - tensorboard.add_scalar('recent_env_reward', - self.env_reward_stat.mean, - self.total_steps) - tensorboard.add_scalar('recent_shaping_reward', - self.shaping_reward_stat.mean, - self.total_steps) - if self.critic_loss_stat.count > 500: - tensorboard.add_scalar('recent_critic_loss', - self.critic_loss_stat.mean, - self.total_steps) - tensorboard.add_scalar('episode_length', n, self.total_steps) - tensorboard.add_scalar('max_env_reward', self.max_env_reward, + summary.add_scalar('recent_env_reward', + self.env_reward_stat.mean, self.total_steps) - tensorboard.add_scalar('ready_actor_num', - self.ready_actor_queue.qsize(), + summary.add_scalar('recent_shaping_reward', + self.shaping_reward_stat.mean, self.total_steps) - tensorboard.add_scalar('episode_time', episode_time, + if self.critic_loss_stat.count > 500: + summary.add_scalar('recent_critic_loss', + self.critic_loss_stat.mean, self.total_steps) + summary.add_scalar('episode_length', n, self.total_steps) + summary.add_scalar('max_env_reward', self.max_env_reward, + self.total_steps) + summary.add_scalar('ready_actor_num', + self.ready_actor_queue.qsize(), + self.total_steps) + summary.add_scalar('episode_time', episode_time, + self.total_steps) self.noiselevel = self.noiselevel * NOISE_DECAY diff --git a/examples/SAC/train.py b/examples/SAC/train.py index a88260245880a39738f931573dd0b183487722df..3e2b7140e9ab5694c38bd86ded04a5e977da9d3a 100644 --- a/examples/SAC/train.py +++ b/examples/SAC/train.py @@ -21,7 +21,7 @@ import time import parl from mujoco_agent import MujocoAgent from mujoco_model import ActorModel, CriticModel -from parl.utils import logger, tensorboard, action_mapping, ReplayMemory +from parl.utils import logger, summary, action_mapping, ReplayMemory ACTOR_LR = 1e-3 CRITIC_LR = 1e-3 @@ -111,8 +111,7 @@ def main(): train_reward, steps = run_train_episode(env, agent, rpm) total_steps += steps logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) - tensorboard.add_scalar('train/episode_reward', train_reward, - total_steps) + summary.add_scalar('train/episode_reward', train_reward, total_steps) if total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag: @@ -120,8 +119,8 @@ def main(): evaluate_reward = run_evaluate_episode(env, agent) logger.info('Steps {}, Evaluate reward: {}'.format( total_steps, evaluate_reward)) - tensorboard.add_scalar('eval/episode_reward', evaluate_reward, - total_steps) + summary.add_scalar('eval/episode_reward', evaluate_reward, + total_steps) if __name__ == '__main__': diff --git a/examples/TD3/train.py b/examples/TD3/train.py index 4cb74d9c01ab73dcb8cb20385b36262cb7c4aeba..8115a41ba1129e00dda1f2a7ca1b0ad3b9d64c71 100644 --- a/examples/TD3/train.py +++ b/examples/TD3/train.py @@ -19,7 +19,7 @@ import time import parl from mujoco_agent import MujocoAgent from mujoco_model import MujocoModel -from parl.utils import logger, tensorboard, action_mapping, ReplayMemory +from parl.utils import logger, summary, action_mapping, ReplayMemory MAX_EPISODES = 5000 ACTOR_LR = 3e-4 @@ -117,8 +117,7 @@ def main(): train_reward, steps = run_train_episode(env, agent, rpm) total_steps += steps logger.info('Steps: {} Reward: {}'.format(total_steps, train_reward)) - tensorboard.add_scalar('train/episode_reward', train_reward, - total_steps) + summary.add_scalar('train/episode_reward', train_reward, total_steps) if 
total_steps // args.test_every_steps >= test_flag: while total_steps // args.test_every_steps >= test_flag: @@ -126,8 +125,8 @@ def main(): evaluate_reward = run_evaluate_episode(env, agent) logger.info('Steps {}, Evaluate reward: {}'.format( total_steps, evaluate_reward)) - tensorboard.add_scalar('eval/episode_reward', evaluate_reward, - total_steps) + summary.add_scalar('eval/episode_reward', evaluate_reward, + total_steps) if __name__ == '__main__': diff --git a/examples/offline-Q-learning/atari.py b/examples/offline-Q-learning/atari.py index 11909eba8307ef781337b20ca2fe200ed967cc45..e0e1b3cc097be221483d0a8712951b9d38f5da54 120000 --- a/examples/offline-Q-learning/atari.py +++ b/examples/offline-Q-learning/atari.py @@ -1 +1 @@ -../DQN/atari.py \ No newline at end of file +../DQN_variant/atari.py \ No newline at end of file diff --git a/examples/offline-Q-learning/atari_wrapper.py b/examples/offline-Q-learning/atari_wrapper.py index e58186a870b13dc7fff25c52cbdd1d009a18f4ac..2904fb39b7934d104209d0085ca814d5c132fe90 120000 --- a/examples/offline-Q-learning/atari_wrapper.py +++ b/examples/offline-Q-learning/atari_wrapper.py @@ -1 +1 @@ -../DQN/atari_wrapper.py \ No newline at end of file +../DQN_variant/atari_wrapper.py \ No newline at end of file diff --git a/examples/offline-Q-learning/dqn.py b/examples/offline-Q-learning/dqn.py index feedf7d21797c052ab716412ceb9cb7c2db78350..d761d2f75f27b3d26e1de046b86400e35aebcbf1 100644 --- a/examples/offline-Q-learning/dqn.py +++ b/examples/offline-Q-learning/dqn.py @@ -19,23 +19,16 @@ import copy import paddle.fluid as fluid from parl.core.fluid.algorithm import Algorithm from parl.core.fluid import layers -from parl.utils.deprecation import deprecated __all__ = ['DQN'] class DQN(Algorithm): - def __init__(self, - model, - hyperparas=None, - act_dim=None, - gamma=None, - lr=None): + def __init__(self, model, act_dim=None, gamma=None, lr=None): """ DQN algorithm Args: model (parl.Model): model defining forward network of Q function - hyperparas (dict): (deprecated) dict of hyper parameters. act_dim (int): dimension of the action space gamma (float): discounted factor for reward computation. lr (float): learning rate. 
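With the deprecated `hyperparas` dict gone, this offline-Q-learning copy of `DQN` is configured only through explicit keyword arguments (all three are now asserted). A minimal construction sketch, assuming it is run from `examples/offline-Q-learning/`; the `QModel` class here is a placeholder invented for illustration, not part of this patch:

```python
import parl
from parl import layers
from dqn import DQN   # the refactored class above


class QModel(parl.Model):          # placeholder Q-network, illustration only
    def __init__(self, act_dim):
        self.fc1 = layers.fc(size=128, act='relu')
        self.fc2 = layers.fc(size=act_dim, act=None)

    def value(self, obs):
        return self.fc2(self.fc1(obs))


# act_dim, gamma and lr are now mandatory keyword arguments.
algorithm = DQN(QModel(act_dim=4), act_dim=4, gamma=0.99, lr=1e-3)
```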
@@ -43,20 +36,12 @@ class DQN(Algorithm): self.model = model self.target_model = copy.deepcopy(model) - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.act_dim = hyperparas['action_dim'] - self.gamma = hyperparas['gamma'] - else: - assert isinstance(act_dim, int) - assert isinstance(gamma, float) - assert isinstance(lr, float) - self.act_dim = act_dim - self.gamma = gamma - self.lr = lr + assert isinstance(act_dim, int) + assert isinstance(gamma, float) + assert isinstance(lr, float) + self.act_dim = act_dim + self.gamma = gamma + self.lr = lr def predict(self, obs): """ use value model self.model to predict the action value @@ -100,12 +85,7 @@ class DQN(Algorithm): cost = layers.reduce_mean(cost) return cost - def sync_target(self, gpu_id=None): + def sync_target(self): """ sync weights of self.model to self.target_model """ - if gpu_id is not None: - warnings.warn( - "the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) self.model.sync_weights_to(self.target_model) diff --git a/examples/offline-Q-learning/parallel_run.py b/examples/offline-Q-learning/parallel_run.py index 3416f8cd6708d75ce0884584a43b66c674d8c699..d7da430e83de46be82a935bc01ce35ca6bd83c6e 100644 --- a/examples/offline-Q-learning/parallel_run.py +++ b/examples/offline-Q-learning/parallel_run.py @@ -22,7 +22,7 @@ from tqdm import tqdm import parl import paddle.fluid as fluid from parl.utils import get_gpu_count -from parl.utils import tensorboard, logger +from parl.utils import summary, logger from dqn import DQN # slight changes from parl.algorithms.DQN from atari_agent import AtariAgent @@ -45,21 +45,21 @@ gpu_num = get_gpu_count() def run_train_step(agent, rpm): for step in range(args.train_total_steps): # use the first 80% data to train - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_batch( args.batch_size * gpu_num) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - cost = agent.learn(batch_state, batch_action, batch_reward, - batch_next_state, batch_isOver) + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + cost = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, batch_isOver) if step % 100 == 0: # use the last 20% data to evaluate - batch_all_state, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch( + batch_all_obs, batch_action, batch_reward, batch_isOver = rpm.sample_test_batch( args.batch_size) - batch_state = batch_all_state[:, :CONTEXT_LEN, :, :] - batch_next_state = batch_all_state[:, 1:, :, :] - eval_cost = agent.supervised_eval(batch_state, batch_action, - batch_reward, batch_next_state, + batch_obs = batch_all_obs[:, :CONTEXT_LEN, :, :] + batch_next_obs = batch_all_obs[:, 1:, :, :] + eval_cost = agent.supervised_eval(batch_obs, batch_action, + batch_reward, batch_next_obs, batch_isOver) logger.info( "train step {}, train costs are {}, eval cost is {}.".format( @@ -67,17 +67,17 @@ def run_train_step(agent, rpm): def collect_exp(env, rpm, agent): - state = env.reset() + obs = env.reset() # collect data to fulfill replay memory for i in tqdm(range(MEMORY_SIZE)): - 
context = rpm.recent_state() - context.append(state) + context = rpm.recent_obs() + context.append(obs) context = np.stack(context, axis=0) action = agent.sample(context) - next_state, reward, isOver, _ = env.step(action) - rpm.append(Experience(state, action, reward, isOver)) - state = next_state + next_obs, reward, isOver, _ = env.step(action) + rpm.append(Experience(obs, action, reward, isOver)) + obs = next_obs def main(): diff --git a/examples/offline-Q-learning/replay_memory.py b/examples/offline-Q-learning/replay_memory.py index 2296ea906ee47a53f697777b6885dad6365460e8..94a43c25d32ac9c9107dfa90a33d1280a5bebd16 100644 --- a/examples/offline-Q-learning/replay_memory.py +++ b/examples/offline-Q-learning/replay_memory.py @@ -18,18 +18,18 @@ import os from collections import deque, namedtuple from parl.utils import logger -Experience = namedtuple('Experience', ['state', 'action', 'reward', 'isOver']) +Experience = namedtuple('Experience', ['obs', 'action', 'reward', 'isOver']) class ReplayMemory(object): def __init__(self, max_size, - state_shape, + obs_shape, context_len, load_file=False, file_path=None): self.max_size = int(max_size) - self.state_shape = state_shape + self.obs_shape = obs_shape self.context_len = int(context_len) self.file_path = file_path @@ -38,8 +38,7 @@ class ReplayMemory(object): self.load_memory() logger.info("memory size is {}".format(self._curr_size)) else: - self.state = np.zeros( - (self.max_size, ) + state_shape, dtype='uint8') + self.obs = np.zeros((self.max_size, ) + obs_shape, dtype='uint8') self.action = np.zeros((self.max_size, ), dtype='int32') self.reward = np.zeros((self.max_size, ), dtype='float32') self.isOver = np.zeros((self.max_size, ), dtype='bool') @@ -62,42 +61,41 @@ class ReplayMemory(object): else: self._context.append(exp) - def recent_state(self): - """ maintain recent state for training""" + def recent_obs(self): + """ maintain recent obs for training""" lst = list(self._context) - states = [np.zeros(self.state_shape, dtype='uint8')] * \ + obs = [np.zeros(self.obs_shape, dtype='uint8')] * \ (self._context.maxlen - len(lst)) - states.extend([k.state for k in lst]) - return states + obs.extend([k.obs for k in lst]) + return obs def sample(self, idx): - """ return state, action, reward, isOver, - note that some frames in state may be generated from last episode, - they should be removed from state + """ return obs, action, reward, isOver, + note that some frames in obs may be generated from last episode, + they should be removed from obs """ - state = np.zeros( - (self.context_len + 1, ) + self.state_shape, dtype=np.uint8) - state_idx = np.arange(idx, - idx + self.context_len + 1) % self._curr_size + obs = np.zeros( + (self.context_len + 1, ) + self.obs_shape, dtype=np.uint8) + obs_idx = np.arange(idx, idx + self.context_len + 1) % self._curr_size # confirm that no frame was generated from last episode has_last_episode = False for k in range(self.context_len - 2, -1, -1): - to_check_idx = state_idx[k] + to_check_idx = obs_idx[k] if self.isOver[to_check_idx]: has_last_episode = True - state_idx = state_idx[k + 1:] - state[k + 1:] = self.state[state_idx] + obs_idx = obs_idx[k + 1:] + obs[k + 1:] = self.obs[obs_idx] break if not has_last_episode: - state = self.state[state_idx] + obs = self.obs[obs_idx] real_idx = (idx + self.context_len - 1) % self._curr_size action = self.action[real_idx] reward = self.reward[real_idx] isOver = self.isOver[real_idx] - return state, reward, action, isOver + return obs, reward, action, isOver def 
__len__(self): return self._curr_size @@ -106,7 +104,7 @@ class ReplayMemory(object): return self._curr_size def _assign(self, pos, exp): - self.state[pos] = exp.state + self.obs[pos] = exp.obs self.reward[pos] = exp.reward self.action[pos] = exp.action self.isOver[pos] = exp.isOver @@ -129,15 +127,15 @@ class ReplayMemory(object): return self._process_batch(batch_exp) def _process_batch(self, batch_exp): - state = np.asarray([e[0] for e in batch_exp], dtype='uint8') + obs = np.asarray([e[0] for e in batch_exp], dtype='uint8') reward = np.asarray([e[1] for e in batch_exp], dtype='float32') action = np.asarray([e[2] for e in batch_exp], dtype='int8') isOver = np.asarray([e[3] for e in batch_exp], dtype='bool') - return [state, action, reward, isOver] + return [obs, action, reward, isOver] def save_memory(self): save_data = [ - self.state, self.reward, self.action, self.isOver, self._curr_size, + self.obs, self.reward, self.action, self.isOver, self._curr_size, self._curr_pos, self._context ] np.savez(self.file_path, *save_data) @@ -145,7 +143,7 @@ class ReplayMemory(object): def load_memory(self): container = np.load(self.file_path, allow_pickle=True) [ - self.state, self.reward, self.action, self.isOver, self._curr_size, + self.obs, self.reward, self.action, self.isOver, self._curr_size, self._curr_pos, self._context ] = [container[key] for key in container] self._curr_size = self._curr_size.astype(int) diff --git a/examples/offline-Q-learning/rom_files b/examples/offline-Q-learning/rom_files index 966a8940cbb2d928de9f816d41efada9aa3c9b6e..c1c50b9a99991f7f5dd34d7f243e999a636ba926 120000 --- a/examples/offline-Q-learning/rom_files +++ b/examples/offline-Q-learning/rom_files @@ -1 +1 @@ -../DQN/rom_files/ \ No newline at end of file +../DQN_variant/rom_files \ No newline at end of file diff --git a/examples/offline-Q-learning/utils.py b/examples/offline-Q-learning/utils.py index 721338d52451903eb1599e2396c9699a410a188d..04c590ec46f98b6cfa6d1ec833112730900fb840 120000 --- a/examples/offline-Q-learning/utils.py +++ b/examples/offline-Q-learning/utils.py @@ -1 +1 @@ -../DQN/utils.py \ No newline at end of file +../DQN_variant/utils.py \ No newline at end of file diff --git a/examples/others/deepes.py b/examples/others/deepes.py new file mode 100644 index 0000000000000000000000000000000000000000..07be65326274a27892f3e0eff8067081e101b11f --- /dev/null +++ b/examples/others/deepes.py @@ -0,0 +1,100 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
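The new `examples/others/deepes.py` below trains a linear CartPole policy with a simple evolution-strategies-style update: each iteration it evaluates several Gaussian perturbations of the flattened weights, min-max normalizes the episode returns, and nudges the weights along the reward-weighted average of the noise. A toy numpy sketch of that estimate (the sizes, seed and returns are made up purely for illustration):

```python
import numpy as np

np.random.seed(0)
noises = np.random.randn(3, 2)        # one Gaussian perturbation per rollout (3 rollouts, 2 weights)
returns = np.array([1.0, 0.0, 0.5])   # episode returns of the perturbed policies
normalized = (returns - returns.min()) / (returns.max() - returns.min()) - 0.5
gradient = np.dot(normalized, noises) / normalized.size   # reward-weighted average of the noise
theta = np.zeros(2)
theta += 0.1 * gradient               # gradient-ascent step on the flat weight vector
print(theta)
```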
+import gym +import numpy as np + + +class CartpoleAgent(object): + def __init__(self, obs_dim, act_dim, learning_rate): + self.learning_rate = learning_rate + # init weights + self.w = np.random.random((act_dim, obs_dim)) * 0.1 + self.b = np.zeros(act_dim) + self.weights_total_size = self.w.size + self.b.size + + def predict(self, obs): + out = np.dot(self.w, obs) + self.b + action = np.argmax(out) + return action + + def learn(self, rewards, noises): + gradient = np.dot( + np.asarray(rewards, dtype=np.float32), + np.asarray(noises, dtype=np.float32)) + gradient /= rewards.size + + flat_weights = self.get_flat_weights() + # Compute the new weights. + new_weights = flat_weights + self.learning_rate * gradient + self.set_flat_weights(new_weights) + + def set_flat_weights(self, flat_weights): + self.w = flat_weights[:self.w.size].reshape(self.w.shape) + self.b = flat_weights[self.w.size:] + + def get_flat_weights(self): + flat_weights = np.concatenate(([self.w.ravel(), self.b]), axis=0) + return flat_weights + + +def evaluate(env, agent): + ep_reward = 0 + obs = env.reset() + while True: + action = agent.predict(obs) + obs, reward, done, _ = env.step(action) + ep_reward += reward + if done: + break + return ep_reward + + +def reward_normalize(reward): + reward = np.asarray(reward) + max_r = np.max(reward) + min_r = np.min(reward) + if max_r == min_r: + reward = np.zeros(reward.shape) + else: + reward = (reward - min_r) / (max_r - min_r) + reward -= 0.5 + return reward + + +if __name__ == '__main__': + env = gym.make('CartPole-v0') + agent = CartpoleAgent(obs_dim=4, act_dim=2, learning_rate=0.1) + + for epcho in range(100): + rewards = [] + noises = [] + lastest_flat_weights = agent.get_flat_weights() + + for episode in range(10): + noise = np.random.randn(agent.weights_total_size) + perturbation = noise * 0.05 + + agent.set_flat_weights(lastest_flat_weights + perturbation) + ep_reward = evaluate(env, agent) + + noises.append(noise) + rewards.append(ep_reward) + + normalized_rewards = reward_normalize(rewards) + agent.set_flat_weights(lastest_flat_weights) + agent.learn(normalized_rewards, noises) + # evaluate + if (epcho % 10) == 0: + ep_reward = evaluate(env, agent) + print('Epcho {}, Test reward {}'.format(epcho, ep_reward)) diff --git a/examples/tutorials/README.md b/examples/tutorials/README.md new file mode 100644 index 0000000000000000000000000000000000000000..676fbc5f2d9cc2a909dd9152b2ab0cc8549e9b82 --- /dev/null +++ b/examples/tutorials/README.md @@ -0,0 +1,37 @@ +## 《PARL强化学习入门实践》课程示例 + +针对强化学习初学者,PARL提供了[入门课程](https://aistudio.baidu.com/aistudio/course/introduce/1335),展示最基础的5个强化学习算法代码示例。 + +## 课程大纲 ++ 一、强化学习(RL)初印象 + + RL概述、入门路线 + + 实践:环境搭建([lesson1](lesson1/gridworld.py) 的代码提供了格子环境世界的渲染封装) ++ 二、基于表格型方法求解RL + + MDP、状态价值、Q表格 + + 实践: [Sarsa](lesson2/sarsa)、[Q-learning](lesson2/q_learning) ++ 三、基于神经网络方法求解RL + + 函数逼近方法 + + 实践:[DQN](lesson3/dqn) ++ 四、基于策略梯度求解RL + + 策略近似、策略梯度 + + 实践:[Policy Gradient](lesson4/policy_gradient) ++ 五、连续动作空间上求解RL + + 实战:[DDPG](lesson5/ddpg) + + + +## 使用说明 + +### 安装依赖 + ++ [paddlepaddle==1.6.3](https://github.com/PaddlePaddle/Paddle) ++ [parl==1.3.1](https://github.com/PaddlePaddle/PARL) ++ gym + + +### 运行示例 + +进入每个示例对应的代码文件夹中,运行 +``` +python train.py +``` diff --git a/examples/tutorials/lesson1/gridworld.py b/examples/tutorials/lesson1/gridworld.py new file mode 100644 index 0000000000000000000000000000000000000000..7af6e6aebadf04941e3ee744af35244dbedb31ad --- /dev/null +++ b/examples/tutorials/lesson1/gridworld.py @@ -0,0 +1,195 @@ +# Copyright (c) 2020 
PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# -*- coding: utf-8 -*- + +import gym +import turtle +import numpy as np + +# turtle tutorial : https://docs.python.org/3.3/library/turtle.html + + +def GridWorld(gridmap=None, is_slippery=False): + if gridmap is None: + gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG'] + env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False) + env = FrozenLakeWapper(env) + return env + + +class FrozenLakeWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.max_y = env.desc.shape[0] + self.max_x = env.desc.shape[1] + self.t = None + self.unit = 50 + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for _ in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for i in range(self.desc.shape[0]): + for j in range(self.desc.shape[1]): + x = j + y = self.max_y - 1 - i + if self.desc[i][j] == b'S': # Start + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'F': # Frozen ice + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'G': # Goal + self.draw_box(x, y, 'yellow') + elif self.desc[i][j] == b'H': # Hole + self.draw_box(x, y, 'black') + else: + self.draw_box(x, y, 'white') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +class CliffWalkingWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.t = None + self.unit = 50 + self.max_x = 12 + self.max_y = 4 + + def draw_x_line(self, y, x0, x1, color='gray'): + assert x1 > x0 + self.t.color(color) + self.t.setheading(0) + self.t.up() + self.t.goto(x0, y) + self.t.down() + self.t.forward(x1 - x0) + + def draw_y_line(self, x, y0, y1, color='gray'): + assert y1 > y0 + self.t.color(color) + self.t.setheading(90) + self.t.up() + self.t.goto(x, y0) + self.t.down() + self.t.forward(y1 - y0) + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for i in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + 
self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for _ in range(2): + self.t.forward(self.max_x * self.unit) + self.t.left(90) + self.t.forward(self.max_y * self.unit) + self.t.left(90) + for i in range(1, self.max_y): + self.draw_x_line( + y=i * self.unit, x0=0, x1=self.max_x * self.unit) + for i in range(1, self.max_x): + self.draw_y_line( + x=i * self.unit, y0=0, y1=self.max_y * self.unit) + + for i in range(1, self.max_x - 1): + self.draw_box(i, 0, 'black') + self.draw_box(self.max_x - 1, 0, 'yellow') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +if __name__ == '__main__': + # 环境1:FrozenLake, 可以配置冰面是否是滑的 + # 0 left, 1 down, 2 right, 3 up + env = gym.make("FrozenLake-v0", is_slippery=False) + env = FrozenLakeWapper(env) + + # 环境2:CliffWalking, 悬崖环境 + # env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + # env = CliffWalkingWapper(env) + + # 环境3:自定义格子世界,可以配置地图, S为出发点Start, F为平地Floor, H为洞Hole, G为出口目标Goal + # gridmap = [ + # 'SFFF', + # 'FHFF', + # 'FFFF', + # 'HFGF' ] + # env = GridWorld(gridmap) + + env.reset() + for step in range(10): + action = np.random.randint(0, 4) + obs, reward, done, info = env.step(action) + print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\ + step, action, obs, reward, done, info)) + env.render() # 渲染一帧图像 diff --git a/examples/tutorials/lesson2/q_learning/agent.py b/examples/tutorials/lesson2/q_learning/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..7d72f9cae03c935431f58043fdb505cec526cb6b --- /dev/null +++ b/examples/tutorials/lesson2/q_learning/agent.py @@ -0,0 +1,75 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
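The `QLearningAgent` added below keeps a table `Q[obs, act]` and updates it off-policy: the bootstrap term always uses the greedy value of the next state, regardless of which action the behaviour policy actually takes next, and the target reduces to the plain reward once `done` is true:

$$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left[ r_t + \gamma \max_{a'} Q(s_{t+1}, a') - Q(s_t, a_t) \right]$$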
+ +# -*- coding: utf-8 -*- + +import numpy as np + + +class QLearningAgent(object): + def __init__(self, + obs_n, + act_n, + learning_rate=0.01, + gamma=0.9, + e_greed=0.1): + self.act_n = act_n # 动作维度,有几个动作可选 + self.lr = learning_rate # 学习率 + self.gamma = gamma # reward的衰减率 + self.epsilon = e_greed # 按一定概率随机选动作 + self.Q = np.zeros((obs_n, act_n)) + + # 根据输入观察值,采样输出的动作值,带探索 + def sample(self, obs): + if np.random.uniform(0, 1) < (1.0 - self.epsilon): #根据table的Q值选动作 + action = self.predict(obs) + else: + action = np.random.choice(self.act_n) #有一定概率随机探索选取一个动作 + return action + + # 根据输入观察值,预测输出的动作值 + def predict(self, obs): + Q_list = self.Q[obs, :] + maxQ = np.max(Q_list) + action_list = np.where(Q_list == maxQ)[0] # maxQ可能对应多个action + action = np.random.choice(action_list) + return action + + # 学习方法,也就是更新Q-table的方法 + def learn(self, obs, action, reward, next_obs, done): + """ off-policy + obs: 交互前的obs, s_t + action: 本次交互选择的action, a_t + reward: 本次动作获得的奖励r + next_obs: 本次交互后的obs, s_t+1 + done: episode是否结束 + """ + predict_Q = self.Q[obs, action] + if done: + target_Q = reward # 没有下一个状态了 + else: + target_Q = reward + self.gamma * np.max( + self.Q[next_obs, :]) # Q-learning + self.Q[obs, action] += self.lr * (target_Q - predict_Q) # 修正q + + # 把 Q表格 的数据保存到文件中 + def save(self): + npy_file = './q_table.npy' + np.save(npy_file, self.Q) + print(npy_file + ' saved.') + + # 从文件中读取数据到 Q表格 + def restore(self, npy_file='./q_table.npy'): + self.Q = np.load(npy_file) + print(npy_file + ' loaded.') diff --git a/examples/tutorials/lesson2/q_learning/gridworld.py b/examples/tutorials/lesson2/q_learning/gridworld.py new file mode 100644 index 0000000000000000000000000000000000000000..ca8acb2da5476e96d3cb95a479b2dfdbd7ba0b48 --- /dev/null +++ b/examples/tutorials/lesson2/q_learning/gridworld.py @@ -0,0 +1,195 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
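Because `self.Q` starts as all zeros, a single `learn()` call is easy to verify by hand; a minimal check, assuming it is run from `examples/tutorials/lesson2/q_learning/` so that `agent.py` is importable:

```python
from agent import QLearningAgent

agent = QLearningAgent(obs_n=4, act_n=2, learning_rate=0.1, gamma=0.9)
agent.learn(obs=0, action=1, reward=1.0, next_obs=2, done=False)
# target_Q = 1.0 + 0.9 * max(Q[2, :]) = 1.0, so Q[0, 1] moves by 0.1 * (1.0 - 0.0)
print(agent.Q[0, 1])   # 0.1
```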
+ +# -*- coding: utf-8 -*- + +import gym +import turtle +import numpy as np + +# turtle tutorial : https://docs.python.org/3.3/library/turtle.html + + +def GridWorld(gridmap=None, is_slippery=False): + if gridmap is None: + gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG'] + env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False) + env = FrozenLakeWapper(env) + return env + + +class FrozenLakeWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.max_y = env.desc.shape[0] + self.max_x = env.desc.shape[1] + self.t = None + self.unit = 50 + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for _ in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for i in range(self.desc.shape[0]): + for j in range(self.desc.shape[1]): + x = j + y = self.max_y - 1 - i + if self.desc[i][j] == b'S': # Start + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'F': # Frozen ice + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'G': # Goal + self.draw_box(x, y, 'yellow') + elif self.desc[i][j] == b'H': # Hole + self.draw_box(x, y, 'black') + else: + self.draw_box(x, y, 'white') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +class CliffWalkingWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.t = None + self.unit = 50 + self.max_x = 12 + self.max_y = 4 + + def draw_x_line(self, y, x0, x1, color='gray'): + assert x1 > x0 + self.t.color(color) + self.t.setheading(0) + self.t.up() + self.t.goto(x0, y) + self.t.down() + self.t.forward(x1 - x0) + + def draw_y_line(self, x, y0, y1, color='gray'): + assert y1 > y0 + self.t.color(color) + self.t.setheading(90) + self.t.up() + self.t.goto(x, y0) + self.t.down() + self.t.forward(y1 - y0) + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for i in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for _ in range(2): + self.t.forward(self.max_x * self.unit) + self.t.left(90) + self.t.forward(self.max_y * self.unit) + self.t.left(90) + for i in range(1, self.max_y): + 
self.draw_x_line( + y=i * self.unit, x0=0, x1=self.max_x * self.unit) + for i in range(1, self.max_x): + self.draw_y_line( + x=i * self.unit, y0=0, y1=self.max_y * self.unit) + + for i in range(1, self.max_x - 1): + self.draw_box(i, 0, 'black') + self.draw_box(self.max_x - 1, 0, 'yellow') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +if __name__ == '__main__': + # 环境1:FrozenLake, 可以配置冰面是否是滑的 + # 0 left, 1 down, 2 right, 3 up + env = gym.make("FrozenLake-v0", is_slippery=False) + env = FrozenLakeWapper(env) + + # 环境2:CliffWalking, 悬崖环境 + # env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + # env = CliffWalkingWapper(env) + + # 环境3:自定义格子世界,可以配置地图, S为出发点Start, F为平地Floor, H为洞Hole, G为出口目标Goal + # gridmap = [ + # 'SFFF', + # 'FHFF', + # 'FFFF', + # 'HFGF' ] + # env = GridWorld(gridmap) + + env.reset() + for step in range(10): + action = np.random.randint(0, 4) + obs, reward, done, info = env.step(action) + print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\ + step, action, obs, reward, done, info)) + # env.render() # 渲染一帧图像 diff --git a/examples/tutorials/lesson2/q_learning/train.py b/examples/tutorials/lesson2/q_learning/train.py new file mode 100644 index 0000000000000000000000000000000000000000..2e780605117e873091fd8e2ac9ece9a41645b51a --- /dev/null +++ b/examples/tutorials/lesson2/q_learning/train.py @@ -0,0 +1,90 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# -*- coding: utf-8 -*- + +import gym +from gridworld import CliffWalkingWapper, FrozenLakeWapper +from agent import QLearningAgent +import time + + +def run_episode(env, agent, render=False): + total_steps = 0 # 记录每个episode走了多少step + total_reward = 0 + + obs = env.reset() # 重置环境, 重新开一局(即开始新的一个episode) + + while True: + action = agent.sample(obs) # 根据算法选择一个动作 + next_obs, reward, done, _ = env.step(action) # 与环境进行一个交互 + # 训练 Q-learning算法 + agent.learn(obs, action, reward, next_obs, done) + + obs = next_obs # 存储上一个观察值 + total_reward += reward + total_steps += 1 # 计算step数 + if render: + env.render() #渲染新的一帧图形 + if done: + break + return total_reward, total_steps + + +def test_episode(env, agent): + total_reward = 0 + obs = env.reset() + while True: + action = agent.predict(obs) # greedy + next_obs, reward, done, _ = env.step(action) + total_reward += reward + obs = next_obs + time.sleep(0.5) + env.render() + if done: + print('test reward = %.1f' % (total_reward)) + break + + +def main(): + # env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up + # env = FrozenLakeWapper(env) + + env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + env = CliffWalkingWapper(env) + + agent = QLearningAgent( + obs_n=env.observation_space.n, + act_n=env.action_space.n, + learning_rate=0.1, + gamma=0.9, + e_greed=0.1) + + is_render = False + for episode in range(500): + ep_reward, ep_steps = run_episode(env, agent, is_render) + print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps, + ep_reward)) + + # 每隔20个episode渲染一下看看效果 + if episode % 20 == 0: + is_render = True + else: + is_render = False + # 训练结束,查看算法效果 + test_episode(env, agent) + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/lesson2/sarsa/agent.py b/examples/tutorials/lesson2/sarsa/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..964230c88bef164dc8f22d5a3eb5e99f242097d3 --- /dev/null +++ b/examples/tutorials/lesson2/sarsa/agent.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
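The `SarsaAgent` below is identical to the Q-learning agent except for the target in `learn()`: being on-policy, it bootstraps with the Q-value of the action that will actually be executed next (`next_action`) instead of the greedy maximum:

$$Q(s_t, a_t) \leftarrow Q(s_t, a_t) + \alpha \left[ r_t + \gamma\, Q(s_{t+1}, a_{t+1}) - Q(s_t, a_t) \right]$$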
+ +# -*- coding: utf-8 -*- + +import numpy as np + + +class SarsaAgent(object): + def __init__(self, + obs_n, + act_n, + learning_rate=0.01, + gamma=0.9, + e_greed=0.1): + self.act_n = act_n # 动作维度,有几个动作可选 + self.lr = learning_rate # 学习率 + self.gamma = gamma # reward的衰减率 + self.epsilon = e_greed # 按一定概率随机选动作 + self.Q = np.zeros((obs_n, act_n)) + + # 根据输入观察值,采样输出的动作值,带探索 + def sample(self, obs): + if np.random.uniform(0, 1) < (1.0 - self.epsilon): #根据table的Q值选动作 + action = self.predict(obs) + else: + action = np.random.choice(self.act_n) #有一定概率随机探索选取一个动作 + return action + + # 根据输入观察值,预测输出的动作值 + def predict(self, obs): + Q_list = self.Q[obs, :] + maxQ = np.max(Q_list) + action_list = np.where(Q_list == maxQ)[0] # maxQ可能对应多个action + action = np.random.choice(action_list) + return action + + # 学习方法,也就是更新Q-table的方法 + def learn(self, obs, action, reward, next_obs, next_action, done): + """ on-policy + obs: 交互前的obs, s_t + action: 本次交互选择的action, a_t + reward: 本次动作获得的奖励r + next_obs: 本次交互后的obs, s_t+1 + next_action: 根据当前Q表格, 针对next_obs会选择的动作, a_t+1 + done: episode是否结束 + """ + predict_Q = self.Q[obs, action] + if done: + target_Q = reward # 没有下一个状态了 + else: + target_Q = reward + self.gamma * self.Q[next_obs, + next_action] # Sarsa + self.Q[obs, action] += self.lr * (target_Q - predict_Q) # 修正q + + def save(self): + npy_file = './q_table.npy' + np.save(npy_file, self.Q) + print(npy_file + ' saved.') + + def restore(self, npy_file='./q_table.npy'): + self.Q = np.load(npy_file) + print(npy_file + ' loaded.') diff --git a/examples/tutorials/lesson2/sarsa/gridworld.py b/examples/tutorials/lesson2/sarsa/gridworld.py new file mode 100644 index 0000000000000000000000000000000000000000..ca8acb2da5476e96d3cb95a479b2dfdbd7ba0b48 --- /dev/null +++ b/examples/tutorials/lesson2/sarsa/gridworld.py @@ -0,0 +1,195 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
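A practical consequence of the on-policy target shows up in the interaction loop: `next_action` must be sampled before `learn()` is called, and that same action is the one executed on the following step. The full loop appears in `lesson2/sarsa/train.py` later in this patch; a single step looks roughly like this (imports assume `lesson2/sarsa/` is the working directory):

```python
import gym
from agent import SarsaAgent
from gridworld import CliffWalkingWapper

env = CliffWalkingWapper(gym.make("CliffWalking-v0"))
agent = SarsaAgent(obs_n=env.observation_space.n, act_n=env.action_space.n)

obs = env.reset()
action = agent.sample(obs)                      # a_t
next_obs, reward, done, _ = env.step(action)
next_action = agent.sample(next_obs)            # a_{t+1}, chosen *before* the update
agent.learn(obs, action, reward, next_obs, next_action, done)
obs, action = next_obs, next_action             # a_{t+1} is what actually gets executed next
```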
+ +# -*- coding: utf-8 -*- + +import gym +import turtle +import numpy as np + +# turtle tutorial : https://docs.python.org/3.3/library/turtle.html + + +def GridWorld(gridmap=None, is_slippery=False): + if gridmap is None: + gridmap = ['SFFF', 'FHFH', 'FFFH', 'HFFG'] + env = gym.make("FrozenLake-v0", desc=gridmap, is_slippery=False) + env = FrozenLakeWapper(env) + return env + + +class FrozenLakeWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.max_y = env.desc.shape[0] + self.max_x = env.desc.shape[1] + self.t = None + self.unit = 50 + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for _ in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for i in range(self.desc.shape[0]): + for j in range(self.desc.shape[1]): + x = j + y = self.max_y - 1 - i + if self.desc[i][j] == b'S': # Start + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'F': # Frozen ice + self.draw_box(x, y, 'white') + elif self.desc[i][j] == b'G': # Goal + self.draw_box(x, y, 'yellow') + elif self.desc[i][j] == b'H': # Hole + self.draw_box(x, y, 'black') + else: + self.draw_box(x, y, 'white') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +class CliffWalkingWapper(gym.Wrapper): + def __init__(self, env): + gym.Wrapper.__init__(self, env) + self.t = None + self.unit = 50 + self.max_x = 12 + self.max_y = 4 + + def draw_x_line(self, y, x0, x1, color='gray'): + assert x1 > x0 + self.t.color(color) + self.t.setheading(0) + self.t.up() + self.t.goto(x0, y) + self.t.down() + self.t.forward(x1 - x0) + + def draw_y_line(self, x, y0, y1, color='gray'): + assert y1 > y0 + self.t.color(color) + self.t.setheading(90) + self.t.up() + self.t.goto(x, y0) + self.t.down() + self.t.forward(y1 - y0) + + def draw_box(self, x, y, fillcolor='', line_color='gray'): + self.t.up() + self.t.goto(x * self.unit, y * self.unit) + self.t.color(line_color) + self.t.fillcolor(fillcolor) + self.t.setheading(90) + self.t.down() + self.t.begin_fill() + for i in range(4): + self.t.forward(self.unit) + self.t.right(90) + self.t.end_fill() + + def move_player(self, x, y): + self.t.up() + self.t.setheading(90) + self.t.fillcolor('red') + self.t.goto((x + 0.5) * self.unit, (y + 0.5) * self.unit) + + def render(self): + if self.t == None: + self.t = turtle.Turtle() + self.wn = turtle.Screen() + self.wn.setup(self.unit * self.max_x + 100, + self.unit * self.max_y + 100) + self.wn.setworldcoordinates(0, 0, self.unit * self.max_x, + self.unit * self.max_y) + self.t.shape('circle') + self.t.width(2) + self.t.speed(0) + self.t.color('gray') + for _ in range(2): + self.t.forward(self.max_x * self.unit) + self.t.left(90) + self.t.forward(self.max_y * self.unit) + self.t.left(90) + for i in range(1, self.max_y): + 
self.draw_x_line( + y=i * self.unit, x0=0, x1=self.max_x * self.unit) + for i in range(1, self.max_x): + self.draw_y_line( + x=i * self.unit, y0=0, y1=self.max_y * self.unit) + + for i in range(1, self.max_x - 1): + self.draw_box(i, 0, 'black') + self.draw_box(self.max_x - 1, 0, 'yellow') + self.t.shape('turtle') + + x_pos = self.s % self.max_x + y_pos = self.max_y - 1 - int(self.s / self.max_x) + self.move_player(x_pos, y_pos) + + +if __name__ == '__main__': + # 环境1:FrozenLake, 可以配置冰面是否是滑的 + # 0 left, 1 down, 2 right, 3 up + env = gym.make("FrozenLake-v0", is_slippery=False) + env = FrozenLakeWapper(env) + + # 环境2:CliffWalking, 悬崖环境 + # env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + # env = CliffWalkingWapper(env) + + # 环境3:自定义格子世界,可以配置地图, S为出发点Start, F为平地Floor, H为洞Hole, G为出口目标Goal + # gridmap = [ + # 'SFFF', + # 'FHFF', + # 'FFFF', + # 'HFGF' ] + # env = GridWorld(gridmap) + + env.reset() + for step in range(10): + action = np.random.randint(0, 4) + obs, reward, done, info = env.step(action) + print('step {}: action {}, obs {}, reward {}, done {}, info {}'.format(\ + step, action, obs, reward, done, info)) + # env.render() # 渲染一帧图像 diff --git a/examples/tutorials/lesson2/sarsa/train.py b/examples/tutorials/lesson2/sarsa/train.py new file mode 100644 index 0000000000000000000000000000000000000000..648ca7b30dd0a4b93a7134cbb209a8ae6558409b --- /dev/null +++ b/examples/tutorials/lesson2/sarsa/train.py @@ -0,0 +1,92 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +# -*- coding: utf-8 -*- + +import gym +from gridworld import CliffWalkingWapper, FrozenLakeWapper +from agent import SarsaAgent +import time + + +def run_episode(env, agent, render=False): + total_steps = 0 # 记录每个episode走了多少step + total_reward = 0 + + obs = env.reset() # 重置环境, 重新开一局(即开始新的一个episode) + action = agent.sample(obs) # 根据算法选择一个动作 + + while True: + next_obs, reward, done, _ = env.step(action) # 与环境进行一个交互 + next_action = agent.sample(next_obs) # 根据算法选择一个动作 + # 训练 Sarsa 算法 + agent.learn(obs, action, reward, next_obs, next_action, done) + + action = next_action + obs = next_obs # 存储上一个观察值 + total_reward += reward + total_steps += 1 # 计算step数 + if render: + env.render() #渲染新的一帧图形 + if done: + break + return total_reward, total_steps + + +def test_episode(env, agent): + total_reward = 0 + obs = env.reset() + while True: + action = agent.predict(obs) # greedy + next_obs, reward, done, _ = env.step(action) + total_reward += reward + obs = next_obs + time.sleep(0.5) + env.render() + if done: + print('test reward = %.1f' % (total_reward)) + break + + +def main(): + # env = gym.make("FrozenLake-v0", is_slippery=False) # 0 left, 1 down, 2 right, 3 up + # env = FrozenLakeWapper(env) + + env = gym.make("CliffWalking-v0") # 0 up, 1 right, 2 down, 3 left + env = CliffWalkingWapper(env) + + agent = SarsaAgent( + obs_n=env.observation_space.n, + act_n=env.action_space.n, + learning_rate=0.1, + gamma=0.9, + e_greed=0.1) + + is_render = False + for episode in range(500): + ep_reward, ep_steps = run_episode(env, agent, is_render) + print('Episode %s: steps = %s , reward = %.1f' % (episode, ep_steps, + ep_reward)) + + # 每隔20个episode渲染一下看看效果 + if episode % 20 == 0: + is_render = True + else: + is_render = False + # 训练结束,查看算法效果 + test_episode(env, agent) + + +if __name__ == "__main__": + main() diff --git a/examples/tutorials/lesson3/dqn/agent.py b/examples/tutorials/lesson3/dqn/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..e14a737f16b62256ee0eb0efcfe3290222209f51 --- /dev/null +++ b/examples/tutorials/lesson3/dqn/agent.py @@ -0,0 +1,97 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
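The DQN lesson splits the code across PARL's three layers: `model.py` defines the Q-network, `algorithm.py` holds the loss and target-network logic, and the `Agent` below owns the fluid programs plus the epsilon-greedy exploration schedule. A rough sketch of how the pieces are wired together (hyperparameters are placeholders, and the imports assume `lesson3/dqn/` is the working directory):

```python
from model import Model
from algorithm import DQN
from agent import Agent

OBS_DIM, ACT_DIM = 4, 2        # e.g. CartPole-v0 observations/actions
model = Model(act_dim=ACT_DIM)
algorithm = DQN(model, act_dim=ACT_DIM, gamma=0.99, lr=0.001)
agent = Agent(
    algorithm,
    obs_dim=OBS_DIM,
    act_dim=ACT_DIM,
    e_greed=0.1,               # initial exploration probability
    e_greed_decrement=1e-6)    # decays towards the 0.01 floor as sample() is called
```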
+ +#-*- coding: utf-8 -*- + +import numpy as np +import paddle.fluid as fluid +import parl +from parl import layers + + +class Agent(parl.Agent): + def __init__(self, + algorithm, + obs_dim, + act_dim, + e_greed=0.1, + e_greed_decrement=0): + assert isinstance(obs_dim, int) + assert isinstance(act_dim, int) + self.obs_dim = obs_dim + self.act_dim = act_dim + super(Agent, self).__init__(algorithm) + + self.global_step = 0 + self.update_target_steps = 200 # 每隔200个training steps再把model的参数复制到target_model中 + + self.e_greed = e_greed # 有一定概率随机选取动作,探索 + self.e_greed_decrement = e_greed_decrement # 随着训练逐步收敛,探索的程度慢慢降低 + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量 + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + self.value = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): # 搭建计算图用于 更新Q网络,定义输入输出变量 + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + action = layers.data(name='act', shape=[1], dtype='int32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data( + name='next_obs', shape=[self.obs_dim], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + self.cost = self.alg.learn(obs, action, reward, next_obs, terminal) + + def sample(self, obs): + sample = np.random.rand() # 产生0~1之间的小数 + if sample < self.e_greed: + act = np.random.randint(self.act_dim) # 探索:每个动作都有概率被选择 + else: + act = self.predict(obs) # 选择最优动作 + self.e_greed = max( + 0.01, self.e_greed - self.e_greed_decrement) # 随着训练逐步收敛,探索的程度慢慢降低 + return act + + def predict(self, obs): # 选择最优动作 + obs = np.expand_dims(obs, axis=0) + pred_Q = self.fluid_executor.run( + self.pred_program, + feed={'obs': obs.astype('float32')}, + fetch_list=[self.value])[0] + pred_Q = np.squeeze(pred_Q, axis=0) + act = np.argmax(pred_Q) # 选择Q最大的下标,即对应的动作 + return act + + def learn(self, obs, act, reward, next_obs, terminal): + # 每隔200个training steps同步一次model和target_model的参数 + if self.global_step % self.update_target_steps == 0: + self.alg.sync_target() + self.global_step += 1 + + act = np.expand_dims(act, -1) + feed = { + 'obs': obs.astype('float32'), + 'act': act.astype('int32'), + 'reward': reward, + 'next_obs': next_obs.astype('float32'), + 'terminal': terminal + } + cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.cost])[0] # 训练一次网络 + return cost diff --git a/examples/tutorials/lesson3/dqn/algorithm.py b/examples/tutorials/lesson3/dqn/algorithm.py new file mode 100644 index 0000000000000000000000000000000000000000..0f27d9c71b1f13f2d4f5f90b8f8e0608e04b4bb0 --- /dev/null +++ b/examples/tutorials/lesson3/dqn/algorithm.py @@ -0,0 +1,79 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
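Since `sample()` above lowers `e_greed` by `e_greed_decrement` on every call but never below 0.01, the exploration schedule is easy to reason about; with the illustrative values from the wiring sketch earlier (0.1 and 1e-6, both assumed, not taken from this patch):

```python
# Linear decay: number of sample() calls until epsilon reaches its 0.01 floor.
steps_to_floor = (0.1 - 0.01) / 1e-6
print(int(steps_to_floor))   # 90000
```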
+ +#-*- coding: utf-8 -*- + +import copy +import paddle.fluid as fluid +import parl +from parl import layers + + +class DQN(parl.Algorithm): + def __init__(self, model, act_dim=None, gamma=None, lr=None): + """ DQN algorithm + + Args: + model (parl.Model): 定义Q函数的前向网络结构 + act_dim (int): action空间的维度,即有几个action + gamma (float): reward的衰减因子 + lr (float): learning_rate,学习率. + """ + self.model = model + self.target_model = copy.deepcopy(model) + + assert isinstance(act_dim, int) + assert isinstance(gamma, float) + assert isinstance(lr, float) + self.act_dim = act_dim + self.gamma = gamma + self.lr = lr + + def predict(self, obs): + """ 使用self.model的value网络来获取 [Q(s,a1),Q(s,a2),...] + """ + return self.model.value(obs) + + def learn(self, obs, action, reward, next_obs, terminal): + """ 使用DQN算法更新self.model的value网络 + """ + + # 从target_model中获取 max Q' 的值,用于计算target_Q + next_pred_value = self.target_model.value(next_obs) + best_v = layers.reduce_max(next_pred_value, dim=1) + best_v.stop_gradient = True # 阻止梯度传递 + terminal = layers.cast(terminal, dtype='float32') + target = reward + (1.0 - terminal) * self.gamma * best_v + + pred_value = self.model.value(obs) # 获取Q预测值 + # 将action转onehot向量,比如:3 => [0,0,0,1,0] + action_onehot = layers.one_hot(action, self.act_dim) + action_onehot = layers.cast(action_onehot, dtype='float32') + # 下面一行是逐元素相乘,拿到action对应的 Q(s,a) + # 比如:pred_value = [[2.3, 5.7, 1.2, 3.9, 1.4]], action_onehot = [[0,0,0,1,0]] + # ==> pred_action_value = [[3.9]] + pred_action_value = layers.reduce_sum( + layers.elementwise_mul(action_onehot, pred_value), dim=1) + + # 计算 Q(s,a) 与 target_Q的均方差,得到loss + cost = layers.square_error_cost(pred_action_value, target) + cost = layers.reduce_mean(cost) + optimizer = fluid.optimizer.Adam(learning_rate=self.lr) # 使用Adam优化器 + optimizer.minimize(cost) + return cost + + def sync_target(self): + """ 把 self.model 的模型参数值同步到 self.target_model + """ + self.model.sync_weights_to(self.target_model) diff --git a/examples/tutorials/lesson3/dqn/model.py b/examples/tutorials/lesson3/dqn/model.py new file mode 100644 index 0000000000000000000000000000000000000000..17c7a8d93a532884187abf0a8cb44d3823018e56 --- /dev/null +++ b/examples/tutorials/lesson3/dqn/model.py @@ -0,0 +1,34 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
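A plain-NumPy illustration of the two steps inside DQN.learn above, reusing the toy numbers from the inline comments; it is independent of the fluid graph and only meant to make the Bellman target and the one-hot masking concrete.

import numpy as np

# Bellman target for a non-terminal transition with reward 1 and gamma 0.99,
# where the target network predicts Q(s', .) = [2.0, 3.0, 0.5]
gamma, reward, terminal = 0.99, 1.0, 0.0
next_q = np.array([2.0, 3.0, 0.5])
target = reward + (1.0 - terminal) * gamma * next_q.max()
print(target)  # 3.97

# Picking out Q(s, a) with a one-hot mask, as in the comment above
pred_value = np.array([2.3, 5.7, 1.2, 3.9, 1.4])
action_onehot = np.array([0.0, 0.0, 0.0, 1.0, 0.0])
print((pred_value * action_onehot).sum())  # 3.9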
+ +#-*- coding: utf-8 -*- + +import parl +from parl import layers # 封装了 paddle.fluid.layers 的API + + +class Model(parl.Model): + def __init__(self, act_dim): + hid1_size = 128 + hid2_size = 128 + # 3层全连接网络 + self.fc1 = layers.fc(size=hid1_size, act='relu') + self.fc2 = layers.fc(size=hid2_size, act='relu') + self.fc3 = layers.fc(size=act_dim, act=None) + + def value(self, obs): + h1 = self.fc1(obs) + h2 = self.fc2(h1) + Q = self.fc3(h2) + return Q diff --git a/examples/tutorials/lesson3/dqn/replay_memory.py b/examples/tutorials/lesson3/dqn/replay_memory.py new file mode 100644 index 0000000000000000000000000000000000000000..f7c83688184614a23429d7f64461877f283de9f5 --- /dev/null +++ b/examples/tutorials/lesson3/dqn/replay_memory.py @@ -0,0 +1,46 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py + +import random +import collections +import numpy as np + + +class ReplayMemory(object): + def __init__(self, max_size): + self.buffer = collections.deque(maxlen=max_size) + + def append(self, exp): + self.buffer.append(exp) + + def sample(self, batch_size): + mini_batch = random.sample(self.buffer, batch_size) + obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [] + + for experience in mini_batch: + s, a, r, s_p, done = experience + obs_batch.append(s) + action_batch.append(a) + reward_batch.append(r) + next_obs_batch.append(s_p) + done_batch.append(done) + + return np.array(obs_batch).astype('float32'), \ + np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ + np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') + + def __len__(self): + return len(self.buffer) diff --git a/examples/tutorials/lesson3/dqn/train.py b/examples/tutorials/lesson3/dqn/train.py new file mode 100644 index 0000000000000000000000000000000000000000..13dfde79636f79ba2e96201f6579360d9c450898 --- /dev/null +++ b/examples/tutorials/lesson3/dqn/train.py @@ -0,0 +1,129 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
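A minimal usage sketch for the ReplayMemory added above; the CartPole-sized shapes are assumptions chosen only for illustration.

import numpy as np
from replay_memory import ReplayMemory

rpm = ReplayMemory(max_size=10000)
for _ in range(64):
    obs = np.random.randn(4).astype('float32')        # CartPole-like observation
    next_obs = np.random.randn(4).astype('float32')
    rpm.append((obs, 0, 1.0, next_obs, False))         # (s, a, r, s', done)

obs_b, act_b, rew_b, next_obs_b, done_b = rpm.sample(batch_size=32)
print(obs_b.shape, act_b.shape, done_b.dtype)          # (32, 4) (32,) float32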
+ +#-*- coding: utf-8 -*- + +import os +import gym +import numpy as np +import parl +from parl.utils import logger # 日志打印工具 + +from model import Model +from algorithm import DQN # from parl.algorithms import DQN # parl >= 1.3.1 +from agent import Agent + +from replay_memory import ReplayMemory + +LEARN_FREQ = 5 # 训练频率,不需要每一个step都learn,攒一些新增经验后再learn,提高效率 +MEMORY_SIZE = 20000 # replay memory的大小,越大越占用内存 +MEMORY_WARMUP_SIZE = 200 # replay_memory 里需要预存一些经验数据,再从里面sample一个batch的经验让agent去learn +BATCH_SIZE = 32 # 每次给agent learn的数据数量,从replay memory随机里sample一批数据出来 +LEARNING_RATE = 0.001 # 学习率 +GAMMA = 0.99 # reward 的衰减因子,一般取 0.9 到 0.999 不等 + + +# 训练一个episode +def run_episode(env, agent, rpm): + total_reward = 0 + obs = env.reset() + step = 0 + while True: + step += 1 + action = agent.sample(obs) # 采样动作,所有动作都有概率被尝试到 + next_obs, reward, done, _ = env.step(action) + rpm.append((obs, action, reward, next_obs, done)) + + # train model + if (len(rpm) > MEMORY_WARMUP_SIZE) and (step % LEARN_FREQ == 0): + (batch_obs, batch_action, batch_reward, batch_next_obs, + batch_done) = rpm.sample(BATCH_SIZE) + train_loss = agent.learn(batch_obs, batch_action, batch_reward, + batch_next_obs, + batch_done) # s,a,r,s',done + + total_reward += reward + obs = next_obs + if done: + break + return total_reward + + +# 评估 agent, 跑 5 个episode,总reward求平均 +def evaluate(env, agent, render=False): + eval_reward = [] + for i in range(5): + obs = env.reset() + episode_reward = 0 + while True: + action = agent.predict(obs) # 预测动作,只选最优动作 + obs, reward, done, _ = env.step(action) + episode_reward += reward + if render: + env.render() + if done: + break + eval_reward.append(episode_reward) + return np.mean(eval_reward) + + +def main(): + env = gym.make( + 'CartPole-v0' + ) # CartPole-v0: expected reward > 180 MountainCar-v0 : expected reward > -120 + action_dim = env.action_space.n # CartPole-v0: 2 + obs_shape = env.observation_space.shape # CartPole-v0: (4,) + + rpm = ReplayMemory(MEMORY_SIZE) # DQN的经验回放池 + + # 根据parl框架构建agent + model = Model(act_dim=action_dim) + algorithm = DQN(model, act_dim=action_dim, gamma=GAMMA, lr=LEARNING_RATE) + agent = Agent( + algorithm, + obs_dim=obs_shape[0], + act_dim=action_dim, + e_greed=0.1, # 有一定概率随机选取动作,探索 + e_greed_decrement=1e-6) # 随着训练逐步收敛,探索的程度慢慢降低 + + # 加载模型 + # save_path = './dqn_model.ckpt' + # agent.restore(save_path) + + # 先往经验池里存一些数据,避免最开始训练的时候样本丰富度不够 + while len(rpm) < MEMORY_WARMUP_SIZE: + run_episode(env, agent, rpm) + + max_episode = 2000 + + # start train + episode = 0 + while episode < max_episode: # 训练max_episode个回合,test部分不计算入episode数量 + # train part + for i in range(0, 50): + total_reward = run_episode(env, agent, rpm) + episode += 1 + + # test part + eval_reward = evaluate(env, agent, render=True) # render=True 查看显示效果 + logger.info('episode:{} e_greed:{} Test reward:{}'.format( + episode, agent.e_greed, eval_reward)) + + # 训练结束,保存模型 + save_path = './dqn_model.ckpt' + agent.save(save_path) + + +if __name__ == '__main__': + main() diff --git a/examples/tutorials/lesson4/policy_gradient/agent.py b/examples/tutorials/lesson4/policy_gradient/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..fad9528a1d1f4035aece21fb0aec753cf6519ae9 --- /dev/null +++ b/examples/tutorials/lesson4/policy_gradient/agent.py @@ -0,0 +1,75 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
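The script above saves ./dqn_model.ckpt at the end and keeps the restore call commented out. A hedged sketch of an evaluation-only run, assuming it is appended to (or run inside) train.py so that Model, DQN, Agent, evaluate and the hyperparameter constants are in scope, and that a checkpoint from a previous run exists:

# Hypothetical evaluation-only usage; dqn_model.ckpt is assumed to come from
# a previous run of main() above.
env = gym.make('CartPole-v0')
act_dim = env.action_space.n
model = Model(act_dim=act_dim)
algorithm = DQN(model, act_dim=act_dim, gamma=GAMMA, lr=LEARNING_RATE)
agent = Agent(algorithm, obs_dim=env.observation_space.shape[0],
              act_dim=act_dim, e_greed=0.0)   # no exploration at test time
agent.restore('./dqn_model.ckpt')
print('mean reward over 5 episodes:', evaluate(env, agent, render=False))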
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#-*- coding: utf-8 -*- + +import numpy as np +import paddle.fluid as fluid +import parl +from parl import layers + + +class Agent(parl.Agent): + def __init__(self, algorithm, obs_dim, act_dim): + self.obs_dim = obs_dim + self.act_dim = act_dim + super(Agent, self).__init__(algorithm) + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): # 搭建计算图用于 预测动作,定义输入输出变量 + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + self.act_prob = self.alg.predict(obs) + + with fluid.program_guard( + self.learn_program): # 搭建计算图用于 更新policy网络,定义输入输出变量 + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + act = layers.data(name='act', shape=[1], dtype='int64') + reward = layers.data(name='reward', shape=[], dtype='float32') + self.cost = self.alg.learn(obs, act, reward) + + def sample(self, obs): + obs = np.expand_dims(obs, axis=0) # 增加一维维度 + act_prob = self.fluid_executor.run( + self.pred_program, + feed={'obs': obs.astype('float32')}, + fetch_list=[self.act_prob])[0] + act_prob = np.squeeze(act_prob, axis=0) # 减少一维维度 + act = np.random.choice(range(self.act_dim), p=act_prob) # 根据动作概率选取动作 + return act + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + act_prob = self.fluid_executor.run( + self.pred_program, + feed={'obs': obs.astype('float32')}, + fetch_list=[self.act_prob])[0] + act_prob = np.squeeze(act_prob, axis=0) + act = np.argmax(act_prob) # 根据动作概率选择概率最高的动作 + return act + + def learn(self, obs, act, reward): + act = np.expand_dims(act, axis=-1) + feed = { + 'obs': obs.astype('float32'), + 'act': act.astype('int64'), + 'reward': reward.astype('float32') + } + cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.cost])[0] + return cost diff --git a/examples/tutorials/lesson4/policy_gradient/algorithm.py b/examples/tutorials/lesson4/policy_gradient/algorithm.py new file mode 100644 index 0000000000000000000000000000000000000000..e48de8f5407f7bd7ff339bcd155e71364ee8e8c6 --- /dev/null +++ b/examples/tutorials/lesson4/policy_gradient/algorithm.py @@ -0,0 +1,54 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#-*- coding: utf-8 -*- + +import paddle.fluid as fluid +import parl +from parl import layers + + +class PolicyGradient(parl.Algorithm): + def __init__(self, model, lr=None): + """ Policy Gradient algorithm + + Args: + model (parl.Model): policy的前向网络. + lr (float): 学习率. 
+ """ + + self.model = model + assert isinstance(lr, float) + self.lr = lr + + def predict(self, obs): + """ 使用policy model预测输出的动作概率 + """ + return self.model(obs) + + def learn(self, obs, action, reward): + """ 用policy gradient 算法更新policy model + """ + act_prob = self.model(obs) # 获取输出动作概率 + # log_prob = layers.cross_entropy(act_prob, action) # 交叉熵 + log_prob = layers.reduce_sum( + -1.0 * layers.log(act_prob) * layers.one_hot( + action, act_prob.shape[1]), + dim=1) + cost = log_prob * reward + cost = layers.reduce_mean(cost) + + optimizer = fluid.optimizer.Adam(self.lr) + optimizer.minimize(cost) + return cost diff --git a/parl/framework/policy_distribution.py b/examples/tutorials/lesson4/policy_gradient/model.py similarity index 51% rename from parl/framework/policy_distribution.py rename to examples/tutorials/lesson4/policy_gradient/model.py index 60bd6dd4e246875e3d684a25491ca0a5b80e8590..0273afd2f7ca4915b5f04d264dc0146248bea54d 100644 --- a/parl/framework/policy_distribution.py +++ b/examples/tutorials/lesson4/policy_gradient/model.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -12,13 +12,21 @@ # See the License for the specific language governing permissions and # limitations under the License. -import warnings +#-*- coding: utf-8 -*- -warnings.simplefilter('default') +import parl +from parl import layers -warnings.warn( - "module `parl.framework.policy_distribution` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.policy_distribution` instead.", - DeprecationWarning, - stacklevel=2) -from parl.core.fluid.policy_distribution import * +class Model(parl.Model): + def __init__(self, act_dim): + act_dim = act_dim + hid1_size = act_dim * 10 + + self.fc1 = layers.fc(size=hid1_size, act='tanh') + self.fc2 = layers.fc(size=act_dim, act='softmax') + + def forward(self, obs): # 可直接用 model = Model(5); model(obs)调用 + out = self.fc1(obs) + out = self.fc2(out) + return out diff --git a/examples/tutorials/lesson4/policy_gradient/train.py b/examples/tutorials/lesson4/policy_gradient/train.py new file mode 100644 index 0000000000000000000000000000000000000000..306c22526f76a2ecfc1793dcca083856dc51c45b --- /dev/null +++ b/examples/tutorials/lesson4/policy_gradient/train.py @@ -0,0 +1,111 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#-*- coding: utf-8 -*- + +import os +import gym +import numpy as np +import parl + +from agent import Agent +from model import Model +from algorithm import PolicyGradient # from parl.algorithms import PolicyGradient + +from parl.utils import logger + +LEARNING_RATE = 1e-3 + + +# 训练一个episode +def run_episode(env, agent): + obs_list, action_list, reward_list = [], [], [] + obs = env.reset() + while True: + obs_list.append(obs) + action = agent.sample(obs) + action_list.append(action) + + obs, reward, done, info = env.step(action) + reward_list.append(reward) + + if done: + break + return obs_list, action_list, reward_list + + +# 评估 agent, 跑 5 个episode,总reward求平均 +def evaluate(env, agent, render=False): + eval_reward = [] + for i in range(5): + obs = env.reset() + episode_reward = 0 + while True: + action = agent.predict(obs) + obs, reward, isOver, _ = env.step(action) + episode_reward += reward + if render: + env.render() + if isOver: + break + eval_reward.append(episode_reward) + return np.mean(eval_reward) + + +def calc_reward_to_go(reward_list, gamma=1.0): + for i in range(len(reward_list) - 2, -1, -1): + # G_t = r_t + γ·r_t+1 + ... = r_t + γ·G_t+1 + reward_list[i] += gamma * reward_list[i + 1] # Gt + return np.array(reward_list) + + +def main(): + env = gym.make('CartPole-v0') + # env = env.unwrapped # Cancel the minimum score limit + obs_dim = env.observation_space.shape[0] + act_dim = env.action_space.n + logger.info('obs_dim {}, act_dim {}'.format(obs_dim, act_dim)) + + # 根据parl框架构建agent + model = Model(act_dim=act_dim) + alg = PolicyGradient(model, lr=LEARNING_RATE) + agent = Agent(alg, obs_dim=obs_dim, act_dim=act_dim) + + # 加载模型 + # if os.path.exists('./model.ckpt'): + # agent.restore('./model.ckpt') + # run_episode(env, agent, train_or_test='test', render=True) + # exit() + + for i in range(1000): + obs_list, action_list, reward_list = run_episode(env, agent) + if i % 10 == 0: + logger.info("Episode {}, Reward Sum {}.".format( + i, sum(reward_list))) + + batch_obs = np.array(obs_list) + batch_action = np.array(action_list) + batch_reward = calc_reward_to_go(reward_list) + + agent.learn(batch_obs, batch_action, batch_reward) + if (i + 1) % 100 == 0: + total_reward = evaluate(env, agent, render=True) + logger.info('Test reward: {}'.format(total_reward)) + + # save the parameters to ./model.ckpt + agent.save('./model.ckpt') + + +if __name__ == '__main__': + main() diff --git a/examples/tutorials/lesson5/ddpg/agent.py b/examples/tutorials/lesson5/ddpg/agent.py new file mode 100644 index 0000000000000000000000000000000000000000..5a6ab55bfba9ab819a9abecb677e9a05605248db --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/agent.py @@ -0,0 +1,74 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
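A worked example of calc_reward_to_go from the script above (it mutates reward_list in place and returns it as an array); this assumes it is run in train.py's namespace or with the function copied alongside.

# gamma = 1.0: the return is just the suffix sum, [1, 1, 1] -> [3, 2, 1]
print(calc_reward_to_go([1.0, 1.0, 1.0], gamma=1.0))   # [3. 2. 1.]
# gamma = 0.9: G_2 = 1, G_1 = 1 + 0.9*1 = 1.9, G_0 = 1 + 0.9*1.9 = 2.71
print(calc_reward_to_go([1.0, 1.0, 1.0], gamma=0.9))   # [2.71 1.9  1.  ]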
+ +#-*- coding: utf-8 -*- + +import numpy as np +import parl +from parl import layers +from paddle import fluid + + +class Agent(parl.Agent): + def __init__(self, algorithm, obs_dim, act_dim): + assert isinstance(obs_dim, int) + assert isinstance(act_dim, int) + self.obs_dim = obs_dim + self.act_dim = act_dim + super(Agent, self).__init__(algorithm) + + # 注意:最开始先同步self.model和self.target_model的参数. + self.alg.sync_target(decay=0) + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + self.pred_act = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data( + name='obs', shape=[self.obs_dim], dtype='float32') + act = layers.data( + name='act', shape=[self.act_dim], dtype='float32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data( + name='next_obs', shape=[self.obs_dim], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + _, self.critic_cost = self.alg.learn(obs, act, reward, next_obs, + terminal) + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + act = self.fluid_executor.run( + self.pred_program, feed={'obs': obs}, + fetch_list=[self.pred_act])[0] + act = np.squeeze(act) + return act + + def learn(self, obs, act, reward, next_obs, terminal): + feed = { + 'obs': obs, + 'act': act, + 'reward': reward, + 'next_obs': next_obs, + 'terminal': terminal + } + critic_cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.critic_cost])[0] + self.alg.sync_target() + return critic_cost diff --git a/examples/tutorials/lesson5/ddpg/algorithm.py b/examples/tutorials/lesson5/ddpg/algorithm.py new file mode 100644 index 0000000000000000000000000000000000000000..b77beaedc7452fe305684d09cbcc9ca0061d27e1 --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/algorithm.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#-*- coding: utf-8 -*- + +import parl +from parl import layers +from copy import deepcopy +from paddle import fluid + + +class DDPG(parl.Algorithm): + def __init__(self, + model, + gamma=None, + tau=None, + actor_lr=None, + critic_lr=None): + """ DDPG algorithm + + Args: + model (parl.Model): actor and critic 的前向网络. + model 必须实现 get_actor_params() 方法. + gamma (float): reward的衰减因子. 
+ tau (float): self.target_model 跟 self.model 同步参数 的 软更新参数 + actor_lr (float): actor 的学习率 + critic_lr (float): critic 的学习率 + """ + assert isinstance(gamma, float) + assert isinstance(tau, float) + assert isinstance(actor_lr, float) + assert isinstance(critic_lr, float) + self.gamma = gamma + self.tau = tau + self.actor_lr = actor_lr + self.critic_lr = critic_lr + + self.model = model + self.target_model = deepcopy(model) + + def predict(self, obs): + """ 使用 self.model 的 actor model 来预测动作 + """ + return self.model.policy(obs) + + def learn(self, obs, action, reward, next_obs, terminal): + """ 用DDPG算法更新 actor 和 critic + """ + actor_cost = self._actor_learn(obs) + critic_cost = self._critic_learn(obs, action, reward, next_obs, + terminal) + return actor_cost, critic_cost + + def _actor_learn(self, obs): + action = self.model.policy(obs) + Q = self.model.value(obs, action) + cost = layers.reduce_mean(-1.0 * Q) + optimizer = fluid.optimizer.AdamOptimizer(self.actor_lr) + optimizer.minimize(cost, parameter_list=self.model.get_actor_params()) + return cost + + def _critic_learn(self, obs, action, reward, next_obs, terminal): + next_action = self.target_model.policy(next_obs) + next_Q = self.target_model.value(next_obs, next_action) + + terminal = layers.cast(terminal, dtype='float32') + target_Q = reward + (1.0 - terminal) * self.gamma * next_Q + target_Q.stop_gradient = True + + Q = self.model.value(obs, action) + cost = layers.square_error_cost(Q, target_Q) + cost = layers.reduce_mean(cost) + optimizer = fluid.optimizer.AdamOptimizer(self.critic_lr) + optimizer.minimize(cost) + return cost + + def sync_target(self, decay=None, share_vars_parallel_executor=None): + """ self.target_model从self.model复制参数过来,若decay不为None,则是软更新 + """ + if decay is None: + decay = 1.0 - self.tau + self.model.sync_weights_to( + self.target_model, + decay=decay, + share_vars_parallel_executor=share_vars_parallel_executor) diff --git a/examples/tutorials/lesson5/ddpg/env.py b/examples/tutorials/lesson5/ddpg/env.py new file mode 100644 index 0000000000000000000000000000000000000000..c3e1e54518b15a08f7a0316b5470b47721a1f288 --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/env.py @@ -0,0 +1,175 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +#-*- coding: utf-8 -*- +""" +Classic cart-pole system implemented by Rich Sutton et al. 
+Copied from http://incompleteideas.net/sutton/book/code/pole.c +permalink: https://perma.cc/C9ZM-652R + +Continuous version by Ian Danforth +""" + +import math +import gym +from gym import spaces, logger +from gym.utils import seeding +import numpy as np + + +class ContinuousCartPoleEnv(gym.Env): + metadata = { + 'render.modes': ['human', 'rgb_array'], + 'video.frames_per_second': 50 + } + + def __init__(self): + self.gravity = 9.8 + self.masscart = 1.0 + self.masspole = 0.1 + self.total_mass = (self.masspole + self.masscart) + self.length = 0.5 # actually half the pole's length + self.polemass_length = (self.masspole * self.length) + self.force_mag = 30.0 + self.tau = 0.02 # seconds between state updates + self.min_action = -1.0 + self.max_action = 1.0 + + # Angle at which to fail the episode + self.theta_threshold_radians = 12 * 2 * math.pi / 360 + self.x_threshold = 2.4 + + # Angle limit set to 2 * theta_threshold_radians so failing observation + # is still within bounds + high = np.array([ + self.x_threshold * 2, + np.finfo(np.float32).max, self.theta_threshold_radians * 2, + np.finfo(np.float32).max + ]) + + self.action_space = spaces.Box( + low=self.min_action, high=self.max_action, shape=(1, )) + self.observation_space = spaces.Box(-high, high) + + self.seed() + self.viewer = None + self.state = None + + self.steps_beyond_done = None + + def seed(self, seed=None): + self.np_random, seed = seeding.np_random(seed) + return [seed] + + def stepPhysics(self, force): + x, x_dot, theta, theta_dot = self.state + costheta = math.cos(theta) + sintheta = math.sin(theta) + temp = (force + self.polemass_length * theta_dot * theta_dot * sintheta + ) / self.total_mass + thetaacc = (self.gravity * sintheta - costheta * temp) / \ + (self.length * (4.0/3.0 - self.masspole * costheta * costheta / self.total_mass)) + xacc = temp - self.polemass_length * thetaacc * costheta / self.total_mass + x = x + self.tau * x_dot + x_dot = x_dot + self.tau * xacc + theta = theta + self.tau * theta_dot + theta_dot = theta_dot + self.tau * thetaacc + return (x, x_dot, theta, theta_dot) + + def step(self, action): + action = np.expand_dims(action, 0) + assert self.action_space.contains(action), \ + "%r (%s) invalid" % (action, type(action)) + # Cast action to float to strip np trappings + force = self.force_mag * float(action) + self.state = self.stepPhysics(force) + x, x_dot, theta, theta_dot = self.state + done = x < -self.x_threshold \ + or x > self.x_threshold \ + or theta < -self.theta_threshold_radians \ + or theta > self.theta_threshold_radians + done = bool(done) + + if not done: + reward = 1.0 + elif self.steps_beyond_done is None: + # Pole just fell! + self.steps_beyond_done = 0 + reward = 1.0 + else: + if self.steps_beyond_done == 0: + logger.warn(""" +You are calling 'step()' even though this environment has already returned +done = True. You should always call 'reset()' once you receive 'done = True' +Any further steps are undefined behavior. 
+ """) + self.steps_beyond_done += 1 + reward = 0.0 + + return np.array(self.state), reward, done, {} + + def reset(self): + self.state = self.np_random.uniform(low=-0.05, high=0.05, size=(4, )) + self.steps_beyond_done = None + return np.array(self.state) + + def render(self, mode='human'): + screen_width = 600 + screen_height = 400 + + world_width = self.x_threshold * 2 + scale = screen_width / world_width + carty = 100 # TOP OF CART + polewidth = 10.0 + polelen = scale * 1.0 + cartwidth = 50.0 + cartheight = 30.0 + + if self.viewer is None: + from gym.envs.classic_control import rendering + self.viewer = rendering.Viewer(screen_width, screen_height) + l, r, t, b = -cartwidth / 2, cartwidth / 2, cartheight / 2, -cartheight / 2 + axleoffset = cartheight / 4.0 + cart = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) + self.carttrans = rendering.Transform() + cart.add_attr(self.carttrans) + self.viewer.add_geom(cart) + l, r, t, b = -polewidth / 2, polewidth / 2, polelen - polewidth / 2, -polewidth / 2 + pole = rendering.FilledPolygon([(l, b), (l, t), (r, t), (r, b)]) + pole.set_color(.8, .6, .4) + self.poletrans = rendering.Transform(translation=(0, axleoffset)) + pole.add_attr(self.poletrans) + pole.add_attr(self.carttrans) + self.viewer.add_geom(pole) + self.axle = rendering.make_circle(polewidth / 2) + self.axle.add_attr(self.poletrans) + self.axle.add_attr(self.carttrans) + self.axle.set_color(.5, .5, .8) + self.viewer.add_geom(self.axle) + self.track = rendering.Line((0, carty), (screen_width, carty)) + self.track.set_color(0, 0, 0) + self.viewer.add_geom(self.track) + + if self.state is None: + return None + + x = self.state + cartx = x[0] * scale + screen_width / 2.0 # MIDDLE OF CART + self.carttrans.set_translation(cartx, carty) + self.poletrans.set_rotation(-x[2]) + + return self.viewer.render(return_rgb_array=(mode == 'rgb_array')) + + def close(self): + if self.viewer: + self.viewer.close() diff --git a/examples/tutorials/lesson5/ddpg/model.py b/examples/tutorials/lesson5/ddpg/model.py new file mode 100644 index 0000000000000000000000000000000000000000..c195cd96171fa5d329e06c61882fd6977a8ea77c --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/model.py @@ -0,0 +1,62 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +#-*- coding: utf-8 -*- + +import paddle.fluid as fluid +import parl +from parl import layers + + +class Model(parl.Model): + def __init__(self, act_dim): + self.actor_model = ActorModel(act_dim) + self.critic_model = CriticModel() + + def policy(self, obs): + return self.actor_model.policy(obs) + + def value(self, obs, act): + return self.critic_model.value(obs, act) + + def get_actor_params(self): + return self.actor_model.parameters() + + +class ActorModel(parl.Model): + def __init__(self, act_dim): + hid_size = 100 + + self.fc1 = layers.fc(size=hid_size, act='relu') + self.fc2 = layers.fc(size=act_dim, act='tanh') + + def policy(self, obs): + hid = self.fc1(obs) + means = self.fc2(hid) + return means + + +class CriticModel(parl.Model): + def __init__(self): + hid_size = 100 + + self.fc1 = layers.fc(size=hid_size, act='relu') + self.fc2 = layers.fc(size=1, act=None) + + def value(self, obs, act): + concat = layers.concat([obs, act], axis=1) + hid = self.fc1(concat) + Q = self.fc2(hid) + Q = layers.squeeze(Q, axes=[1]) + return Q diff --git a/examples/tutorials/lesson5/ddpg/replay_memory.py b/examples/tutorials/lesson5/ddpg/replay_memory.py new file mode 100644 index 0000000000000000000000000000000000000000..f7c83688184614a23429d7f64461877f283de9f5 --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/replay_memory.py @@ -0,0 +1,46 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +# Modified from https://github.com/seungeunrho/minimalRL/blob/master/dqn.py + +import random +import collections +import numpy as np + + +class ReplayMemory(object): + def __init__(self, max_size): + self.buffer = collections.deque(maxlen=max_size) + + def append(self, exp): + self.buffer.append(exp) + + def sample(self, batch_size): + mini_batch = random.sample(self.buffer, batch_size) + obs_batch, action_batch, reward_batch, next_obs_batch, done_batch = [], [], [], [], [] + + for experience in mini_batch: + s, a, r, s_p, done = experience + obs_batch.append(s) + action_batch.append(a) + reward_batch.append(r) + next_obs_batch.append(s_p) + done_batch.append(done) + + return np.array(obs_batch).astype('float32'), \ + np.array(action_batch).astype('float32'), np.array(reward_batch).astype('float32'),\ + np.array(next_obs_batch).astype('float32'), np.array(done_batch).astype('float32') + + def __len__(self): + return len(self.buffer) diff --git a/examples/tutorials/lesson5/ddpg/train.py b/examples/tutorials/lesson5/ddpg/train.py new file mode 100644 index 0000000000000000000000000000000000000000..5158fef0c13d900835fe8b0751b82a91c4662c4c --- /dev/null +++ b/examples/tutorials/lesson5/ddpg/train.py @@ -0,0 +1,128 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. 
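The DDPG.sync_target added earlier in this diff passes decay = 1 - tau to model.sync_weights_to. Assuming PARL's blend is target = decay * target + (1 - decay) * source (which matches the decay=0 hard copy done in the DDPG Agent's __init__), a toy scalar shows how slowly the target network tracks the online network with TAU = 0.001:

TAU = 0.001
decay = 1.0 - TAU                 # value DDPG.sync_target hands to sync_weights_to

w_online, w_target = 1.0, 0.0     # toy parameters
for n in range(1, 1001):
    w_target = decay * w_target + (1.0 - decay) * w_online
    if n in (1, 10, 100, 1000):
        # closed form after n updates: 1 - (1 - TAU)**n
        print(n, round(w_target, 6), round(1.0 - (1.0 - TAU)**n, 6))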
+# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +#-*- coding: utf-8 -*- + +import gym +import numpy as np +import parl +from parl.utils import logger + +from agent import Agent +from model import Model +from algorithm import DDPG # from parl.algorithms import DDPG +from env import ContinuousCartPoleEnv +from replay_memory import ReplayMemory + +ACTOR_LR = 1e-3 # Actor网络的 learning rate +CRITIC_LR = 1e-3 # Critic网络的 learning rate +GAMMA = 0.99 # reward 的衰减因子 +TAU = 0.001 # 软更新的系数 +MEMORY_SIZE = int(1e6) # 经验池大小 +MEMORY_WARMUP_SIZE = MEMORY_SIZE // 20 # 预存一部分经验之后再开始训练 +BATCH_SIZE = 128 +REWARD_SCALE = 0.1 # reward 缩放系数 +NOISE = 0.05 # 动作噪声方差 +TRAIN_EPISODE = 6e3 # 训练的总episode数 + + +# 训练一个episode +def run_episode(agent, env, rpm): + obs = env.reset() + total_reward = 0 + steps = 0 + while True: + steps += 1 + batch_obs = np.expand_dims(obs, axis=0) + action = agent.predict(batch_obs.astype('float32')) + + # 增加探索扰动, 输出限制在 [-1.0, 1.0] 范围内 + action = np.clip(np.random.normal(action, NOISE), -1.0, 1.0) + + next_obs, reward, done, info = env.step(action) + + action = [action] # 方便存入replaymemory + rpm.append((obs, action, REWARD_SCALE * reward, next_obs, done)) + + if len(rpm) > MEMORY_WARMUP_SIZE and (steps % 5) == 0: + (batch_obs, batch_action, batch_reward, batch_next_obs, + batch_done) = rpm.sample(BATCH_SIZE) + agent.learn(batch_obs, batch_action, batch_reward, batch_next_obs, + batch_done) + + obs = next_obs + total_reward += reward + + if done or steps >= 200: + break + return total_reward + + +# 评估 agent, 跑 5 个episode,总reward求平均 +def evaluate(env, agent, render=False): + eval_reward = [] + for i in range(5): + obs = env.reset() + total_reward = 0 + steps = 0 + while True: + batch_obs = np.expand_dims(obs, axis=0) + action = agent.predict(batch_obs.astype('float32')) + action = np.clip(action, -1.0, 1.0) + + steps += 1 + next_obs, reward, done, info = env.step(action) + + obs = next_obs + total_reward += reward + + if render: + env.render() + if done or steps >= 200: + break + eval_reward.append(total_reward) + return np.mean(eval_reward) + + +def main(): + env = ContinuousCartPoleEnv() + + obs_dim = env.observation_space.shape[0] + act_dim = env.action_space.shape[0] + + # 使用PARL框架创建agent + model = Model(act_dim) + algorithm = DDPG( + model, gamma=GAMMA, tau=TAU, actor_lr=ACTOR_LR, critic_lr=CRITIC_LR) + agent = Agent(algorithm, obs_dim, act_dim) + + # 创建经验池 + rpm = ReplayMemory(MEMORY_SIZE) + # 往经验池中预存数据 + while len(rpm) < MEMORY_WARMUP_SIZE: + run_episode(agent, env, rpm) + + episode = 0 + while episode < TRAIN_EPISODE: + for i in range(50): + total_reward = run_episode(agent, env, rpm) + episode += 1 + + eval_reward = evaluate(env, agent, render=False) + logger.info('episode:{} Test reward:{}'.format( + episode, eval_reward)) + + +if __name__ == '__main__': + main() diff --git a/papers/AAAI_2020.md b/papers/AAAI_2020.md new file mode 100644 index 0000000000000000000000000000000000000000..9e5564c8ede8d1e6009e0fe28c5aa32e2c4f88cb --- /dev/null +++ b/papers/AAAI_2020.md @@ -0,0 +1,32 @@ +### papers relative to improved RL algorithms +1. **Proximal Distilled Evolutionary Reinforcement Learning** AAAI2020. 
[paper](https://arxiv.org/pdf/1906.09807.pdf) + + *Cristian Bodnar, Ben Day, Pietro Liò* + +2. **Uncertainty-Aware Action Advising for Deep Reinforcement Learning Agents** AAAI2020. [paper](https://aaai.org/Papers/AAAI/2020GB/AAAI-SilvaF.2159.pdf) + + *Felipe Leno da Silva (University of Sao Paulo); Pablo Hernandez-Leal (Borealis AI); Bilal Kartal (Borealis AI); Matthew Taylor (Borealis AI)* + +3. **Partner Selection for the Emergence of Cooperation in Multi-Agent Systems Using Reinforcement Learning** AAAI2020. [paper](https://aaai.org/Papers/AAAI/2020GB/AAAI-AnastassacosN.1598.pdf) + + *Nicolas Anastassacos, Stephen Hailes, Mirco Musolesi* + +4. **Reinforcement Learning with Perturbed Reward** AAAI2020. [paper](https://www.aaai.org/Papers/AAAI/2020GB/AAAI-WangJK.4139.pdf) + + *Jingkang Wang, Yang Liu, Bo Li* + +5. **Deep Model-Based Reinforcement Learning via Estimated Uncertainty and Conservative Policy Optimization** AAAI2020. [paper](https://arxiv.org/pdf/1911.12574.pdf) + + *Qi Zhou, HouQiang Li, Jie Wang* + +6. **Reinforcement Learning of Risk-Constrained Policies in Markov Decision Processes** AAAI2020. [paper](https://www.fi.muni.cz/~xnovot18/aaai20.pdf) + + *Tomáš Brázdil, Krishnendu Chatterjee, Petr Novotný, Jiří Vahala* + +7. **Exploratory Combinatorial Optimization with Reinforcement Learning** AAAI2020. [paper](https://arxiv.org/pdf/1909.04063.pdf) + + *Thomas D. Barrett, William R. Clements, Jakob N. Foerster, Alex I. Lvovsky* + +8. **Fixed-Horizon Temporal Difference Methods for Stable Reinforcement Learning** AAAI2020. [paper](https://arxiv.org/pdf/1909.03906.pdf) + + *Kristopher De Asis, Alan Chan, Silviu Pitis, Richard S. Sutton, Daniel Graves* diff --git a/parl/__init__.py b/parl/__init__.py index 7d3c26a00c4671f6aef2810a78e1f92bccaf35ed..cd4975bee788bccf82417f0a1e88e2dca67356a0 100644 --- a/parl/__init__.py +++ b/parl/__init__.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. -__version__ = "1.2.1" +__version__ = "1.3.1" """ generates new PARL python API """ diff --git a/parl/algorithms/fluid/a3c.py b/parl/algorithms/fluid/a3c.py index 9b9f57e8eb5bfd59e3f79c1fc42e4d1374618f23..2786eb640a8d9cc0a4b117c28727c5c23a32fec6 100644 --- a/parl/algorithms/fluid/a3c.py +++ b/parl/algorithms/fluid/a3c.py @@ -24,25 +24,17 @@ __all__ = ['A3C'] class A3C(Algorithm): - def __init__(self, model, hyperparas=None, vf_loss_coeff=None): + def __init__(self, model, vf_loss_coeff=None): """ A3C/A2C algorithm Args: model (parl.Model): forward network of policy and value - hyperparas (dict): (deprecated) dict of hyper parameters.
vf_loss_coeff (float): coefficient of the value function loss """ self.model = model - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.A3C` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.vf_loss_coeff = hyperparas['vf_loss_coeff'] - else: - assert isinstance(vf_loss_coeff, (int, float)) - self.vf_loss_coeff = vf_loss_coeff + assert isinstance(vf_loss_coeff, (int, float)) + self.vf_loss_coeff = vf_loss_coeff def learn(self, obs, actions, advantages, target_values, learning_rate, entropy_coeff): diff --git a/parl/algorithms/fluid/ddpg.py b/parl/algorithms/fluid/ddpg.py index c127109c7d92f3f5b6e42d4eac25a796ae0c89ae..70992ee204449c297ff2a605951162d939f270d4 100644 --- a/parl/algorithms/fluid/ddpg.py +++ b/parl/algorithms/fluid/ddpg.py @@ -19,7 +19,6 @@ from parl.core.fluid import layers from copy import deepcopy from paddle import fluid from parl.core.fluid.algorithm import Algorithm -from parl.utils.deprecation import deprecated __all__ = ['DDPG'] @@ -27,7 +26,6 @@ __all__ = ['DDPG'] class DDPG(Algorithm): def __init__(self, model, - hyperparas=None, gamma=None, tau=None, actor_lr=None, @@ -37,53 +35,28 @@ class DDPG(Algorithm): Args: model (parl.Model): forward network of actor and critic. The function get_actor_params() of model should be implemented. - hyperparas (dict): (deprecated) dict of hyper parameters. gamma (float): discounted factor for reward computation. tau (float): decay coefficient when updating the weights of self.target_model with self.model actor_lr (float): learning rate of the actor model critic_lr (float): learning rate of the critic model """ - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.DDPG` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.gamma = hyperparas['gamma'] - self.tau = hyperparas['tau'] - self.actor_lr = hyperparas['actor_lr'] - self.critic_lr = hyperparas['critic_lr'] - else: - assert isinstance(gamma, float) - assert isinstance(tau, float) - assert isinstance(actor_lr, float) - assert isinstance(critic_lr, float) - self.gamma = gamma - self.tau = tau - self.actor_lr = actor_lr - self.critic_lr = critic_lr + assert isinstance(gamma, float) + assert isinstance(tau, float) + assert isinstance(actor_lr, float) + assert isinstance(critic_lr, float) + self.gamma = gamma + self.tau = tau + self.actor_lr = actor_lr + self.critic_lr = critic_lr self.model = model self.target_model = deepcopy(model) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='predict') - def define_predict(self, obs): - """ use actor model of self.model to predict the action - """ - return self.predict(obs) - def predict(self, obs): """ use actor model of self.model to predict the action """ return self.model.policy(obs) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='learn') - def define_learn(self, obs, action, reward, next_obs, terminal): - """ update actor and critic model with DDPG algorithm - """ - return self.learn(obs, action, reward, next_obs, terminal) - def learn(self, obs, action, reward, next_obs, terminal): """ update actor and critic model with DDPG algorithm """ @@ -115,15 +88,7 @@ class DDPG(Algorithm): optimizer.minimize(cost) return cost - def sync_target(self, - gpu_id=None, - decay=None, - share_vars_parallel_executor=None): - if gpu_id is not None: - 
warnings.warn( - "the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DDPG` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) + def sync_target(self, decay=None, share_vars_parallel_executor=None): if decay is None: decay = 1.0 - self.tau self.model.sync_weights_to( diff --git a/parl/algorithms/fluid/ddqn.py b/parl/algorithms/fluid/ddqn.py index 5ccd4aaafe78d6b698fb04711cdc6b7df48faac8..03c0ced5019abcef00151a68eca944b32caa8469 100644 --- a/parl/algorithms/fluid/ddqn.py +++ b/parl/algorithms/fluid/ddqn.py @@ -21,19 +21,17 @@ import paddle.fluid as fluid from parl.core.fluid.algorithm import Algorithm from parl.core.fluid import layers +__all__ = ['DDQN'] + class DDQN(Algorithm): - def __init__( - self, - model, - act_dim=None, - gamma=None, - ): + def __init__(self, model, act_dim=None, gamma=None, lr=None): """ Double DQN algorithm - Args: - model (parl.Model): model defining forward network of Q function. + model (parl.Model): model defining forward network of Q function + act_dim (int): dimension of the action space gamma (float): discounted factor for reward computation. + lr (float): learning rate. """ self.model = model self.target_model = copy.deepcopy(model) @@ -43,11 +41,29 @@ class DDQN(Algorithm): self.act_dim = act_dim self.gamma = gamma + self.lr = lr def predict(self, obs): + """ use value model self.model to predict the action value + """ return self.model.value(obs) - def learn(self, obs, action, reward, next_obs, terminal, learning_rate): + def learn(self, + obs, + action, + reward, + next_obs, + terminal, + learning_rate=None): + """ update value model self.model with DQN algorithm + """ + # Support the modification of learning_rate + if learning_rate is None: + assert isinstance( + self.lr, + float), "Please set the learning rate of DQN in initializaion." + learning_rate = self.lr + pred_value = self.model.value(obs) action_onehot = layers.one_hot(action, self.act_dim) action_onehot = layers.cast(action_onehot, dtype='float32') @@ -85,12 +101,7 @@ class DDQN(Algorithm): optimizer.minimize(cost) return cost - def sync_target(self, gpu_id=None): + def sync_target(self): """ sync weights of self.model to self.target_model """ - if gpu_id is not None: - warnings.warn( - "the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) self.model.sync_weights_to(self.target_model) diff --git a/parl/algorithms/fluid/dqn.py b/parl/algorithms/fluid/dqn.py index e6e97577d041f77b1899ce460582c29f5bf480a8..56d05e0a67cf5d6653bba2e350a71bb08977733a 100644 --- a/parl/algorithms/fluid/dqn.py +++ b/parl/algorithms/fluid/dqn.py @@ -19,18 +19,16 @@ import copy import paddle.fluid as fluid from parl.core.fluid.algorithm import Algorithm from parl.core.fluid import layers -from parl.utils.deprecation import deprecated __all__ = ['DQN'] class DQN(Algorithm): - def __init__(self, model, hyperparas=None, act_dim=None, gamma=None): + def __init__(self, model, act_dim=None, gamma=None, lr=None): """ DQN algorithm Args: model (parl.Model): model defining forward network of Q function - hyperparas (dict): (deprecated) dict of hyper parameters. act_dim (int): dimension of the action space gamma (float): discounted factor for reward computation. lr (float): learning rate. 
@@ -38,41 +36,33 @@ class DQN(Algorithm): self.model = model self.target_model = copy.deepcopy(model) - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.act_dim = hyperparas['action_dim'] - self.gamma = hyperparas['gamma'] - else: - assert isinstance(act_dim, int) - assert isinstance(gamma, float) - self.act_dim = act_dim - self.gamma = gamma + assert isinstance(act_dim, int) + assert isinstance(gamma, float) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='predict') - def define_predict(self, obs): - """ use value model self.model to predict the action value - """ - return self.predict(obs) + self.act_dim = act_dim + self.gamma = gamma + self.lr = lr def predict(self, obs): """ use value model self.model to predict the action value """ return self.model.value(obs) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='learn') - def define_learn(self, obs, action, reward, next_obs, terminal, - learning_rate): - return self.learn(obs, action, reward, next_obs, terminal, - learning_rate) - - def learn(self, obs, action, reward, next_obs, terminal, learning_rate): + def learn(self, + obs, + action, + reward, + next_obs, + terminal, + learning_rate=None): """ update value model self.model with DQN algorithm """ + # Support the modification of learning_rate + if learning_rate is None: + assert isinstance( + self.lr, + float), "Please set the learning rate of DQN in initializaion." + learning_rate = self.lr pred_value = self.model.value(obs) next_pred_value = self.target_model.value(next_obs) @@ -92,12 +82,7 @@ class DQN(Algorithm): optimizer.minimize(cost) return cost - def sync_target(self, gpu_id=None): + def sync_target(self): """ sync weights of self.model to self.target_model """ - if gpu_id is not None: - warnings.warn( - "the `gpu_id` argument of `sync_target` function in `parl.Algorithms.DQN` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) self.model.sync_weights_to(self.target_model) diff --git a/parl/algorithms/fluid/impala/impala.py b/parl/algorithms/fluid/impala/impala.py index 025f96f2650e3351552d6525c910d2f29406dbaa..a7adf56ee28f3ec14f304a9a8b163aae31805fda 100644 --- a/parl/algorithms/fluid/impala/impala.py +++ b/parl/algorithms/fluid/impala/impala.py @@ -85,44 +85,31 @@ class VTraceLoss(object): class IMPALA(Algorithm): def __init__(self, model, - hyperparas=None, sample_batch_steps=None, gamma=None, vf_loss_coeff=None, clip_rho_threshold=None, clip_pg_rho_threshold=None): - """ IMPALA algorithm + r""" IMPALA algorithm Args: model (parl.Model): forward network of policy and value - hyperparas (dict): (deprecated) dict of hyper parameters. sample_batch_steps (int): steps of each environment sampling. gamma (float): discounted factor for reward computation. vf_loss_coeff (float): coefficient of the value function loss. clip_rho_threshold (float): clipping threshold for importance weights (rho). clip_pg_rho_threshold (float): clipping threshold on rho_s in \rho_s \delta log \pi(a|x) (r + \gamma v_{s+1} - V(x_s)). 
""" - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.IMPALA` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.sample_batch_steps = hyperparas['sample_batch_steps'] - self.gamma = hyperparas['gamma'] - self.vf_loss_coeff = hyperparas['vf_loss_coeff'] - self.clip_rho_threshold = hyperparas['clip_rho_threshold'] - self.clip_pg_rho_threshold = hyperparas['clip_pg_rho_threshold'] - else: - assert isinstance(sample_batch_steps, int) - assert isinstance(gamma, float) - assert isinstance(vf_loss_coeff, float) - assert isinstance(clip_rho_threshold, float) - assert isinstance(clip_pg_rho_threshold, float) - self.sample_batch_steps = sample_batch_steps - self.gamma = gamma - self.vf_loss_coeff = vf_loss_coeff - self.clip_rho_threshold = clip_rho_threshold - self.clip_pg_rho_threshold = clip_pg_rho_threshold + assert isinstance(sample_batch_steps, int) + assert isinstance(gamma, float) + assert isinstance(vf_loss_coeff, float) + assert isinstance(clip_rho_threshold, float) + assert isinstance(clip_pg_rho_threshold, float) + self.sample_batch_steps = sample_batch_steps + self.gamma = gamma + self.vf_loss_coeff = vf_loss_coeff + self.clip_rho_threshold = clip_rho_threshold + self.clip_pg_rho_threshold = clip_pg_rho_threshold self.model = model diff --git a/parl/algorithms/fluid/impala/vtrace.py b/parl/algorithms/fluid/impala/vtrace.py index 9eb75957b60271fd9a5221c67593359efc57614d..99840bbe2c8c89d37a00cfb673b8ed19f7e82346 100644 --- a/parl/algorithms/fluid/impala/vtrace.py +++ b/parl/algorithms/fluid/impala/vtrace.py @@ -146,7 +146,7 @@ def from_importance_weights(behaviour_actions_log_probs, def recursively_scan(discounts, cs, deltas): - """ Recursively calculate vs_minus_v_xs according to following equation: + r""" Recursively calculate vs_minus_v_xs according to following equation: vs_minus_v_xs(t) = deltas(t) + discounts(t) * cs(t) * vs_minus_v_xs(t + 1) Args: diff --git a/parl/algorithms/fluid/maddpg.py b/parl/algorithms/fluid/maddpg.py index 4bf799413165d81d00238a7c156511a03619ba5d..36b14709aaf5e5e0a2cacc97bd94b1097caf2404 100644 --- a/parl/algorithms/fluid/maddpg.py +++ b/parl/algorithms/fluid/maddpg.py @@ -27,10 +27,11 @@ from parl.core.fluid.policy_distribution import SoftMultiCategoricalDistribution def SoftPDistribution(logits, act_space): - """input: + """Args: logits: the output of policy model act_space: action space, must be gym.spaces.Discrete or multiagent.multi_discrete.MultiDiscrete - output: + + Return: instance of SoftCategoricalDistribution or SoftMultiCategoricalDistribution """ # is instance of gym.spaces.Discrete diff --git a/parl/algorithms/fluid/policy_gradient.py b/parl/algorithms/fluid/policy_gradient.py index b1b901ff1cfce1458f72899ba13cfa95a80d6265..d37083fba91d12e6774a139011efd6f281e1c205 100644 --- a/parl/algorithms/fluid/policy_gradient.py +++ b/parl/algorithms/fluid/policy_gradient.py @@ -18,51 +18,28 @@ warnings.simplefilter('default') import paddle.fluid as fluid from parl.core.fluid.algorithm import Algorithm from parl.core.fluid import layers -from parl.utils.deprecation import deprecated __all__ = ['PolicyGradient'] class PolicyGradient(Algorithm): - def __init__(self, model, hyperparas=None, lr=None): + def __init__(self, model, lr=None): """ Policy Gradient algorithm Args: model (parl.Model): forward network of the policy. - hyperparas (dict): (deprecated) dict of hyper parameters. lr (float): learning rate of the policy model. 
""" self.model = model - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.PolicyGradient` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.lr = hyperparas['lr'] - else: - assert isinstance(lr, float) - self.lr = lr - - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='predict') - def define_predict(self, obs): - """ use policy model self.model to predict the action probability - """ - return self.predict(obs) + assert isinstance(lr, float) + self.lr = lr def predict(self, obs): """ use policy model self.model to predict the action probability """ return self.model(obs) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='learn') - def define_learn(self, obs, action, reward): - """ update policy model self.model with policy gradient algorithm - """ - return self.learn(obs, action, reward) - def learn(self, obs, action, reward): """ update policy model self.model with policy gradient algorithm """ diff --git a/parl/algorithms/fluid/ppo.py b/parl/algorithms/fluid/ppo.py index 002ab273833a1fef8deecb08543a2a17f92d4d40..2cd88f46e837d385ab0a7977a1f7123674d8cbaf 100644 --- a/parl/algorithms/fluid/ppo.py +++ b/parl/algorithms/fluid/ppo.py @@ -20,7 +20,6 @@ from copy import deepcopy from paddle import fluid from parl.core.fluid import layers from parl.core.fluid.algorithm import Algorithm -from parl.utils.deprecation import deprecated __all__ = ['PPO'] @@ -28,7 +27,6 @@ __all__ = ['PPO'] class PPO(Algorithm): def __init__(self, model, - hyperparas=None, act_dim=None, policy_lr=None, value_lr=None, @@ -37,7 +35,6 @@ class PPO(Algorithm): Args: model (parl.Model): model defining forward network of policy and value. - hyperparas (dict): (deprecated) dict of hyper parameters. act_dim (float): dimension of the action space. policy_lr (float): learning rate of the policy model. value_lr (float): learning rate of the value model. 
@@ -47,27 +44,14 @@ class PPO(Algorithm): # Used to calculate probability of action in old policy self.old_policy_model = deepcopy(model.policy_model) - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithms.PPO` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - self.act_dim = hyperparas['act_dim'] - self.policy_lr = hyperparas['policy_lr'] - self.value_lr = hyperparas['value_lr'] - if 'epsilon' in hyperparas: - self.epsilon = hyperparas['epsilon'] - else: - self.epsilon = 0.2 # default - else: - assert isinstance(act_dim, int) - assert isinstance(policy_lr, float) - assert isinstance(value_lr, float) - assert isinstance(epsilon, float) - self.act_dim = act_dim - self.policy_lr = policy_lr - self.value_lr = value_lr - self.epsilon = epsilon + assert isinstance(act_dim, int) + assert isinstance(policy_lr, float) + assert isinstance(value_lr, float) + assert isinstance(epsilon, float) + self.act_dim = act_dim + self.policy_lr = policy_lr + self.value_lr = value_lr + self.epsilon = epsilon def _calc_logprob(self, actions, means, logvars): """ Calculate log probabilities of actions, when given means and logvars @@ -111,49 +95,18 @@ class PPO(Algorithm): log_det_cov_new - log_det_cov_old) + tr_old_new - self.act_dim) return kl - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='predict') - def define_predict(self, obs): - """ Use policy model of self.model to predict means and logvars of actions - """ - return self.predict(obs) - def predict(self, obs): """ Use the policy model of self.model to predict means and logvars of actions """ means, logvars = self.model.policy(obs) return means - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='sample') - def define_sample(self, obs): - """ Use the policy model of self.model to sample actions - """ - return self.sample(obs) - def sample(self, obs): """ Use the policy model of self.model to sample actions """ sampled_act = self.model.policy_sample(obs) return sampled_act - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='policy_learn') - def define_policy_learn(self, obs, actions, advantages, beta=None): - """ Learn policy model with: - 1. CLIP loss: Clipped Surrogate Objective - 2. KLPEN loss: Adaptive KL Penalty Objective - See: https://arxiv.org/pdf/1707.02286.pdf - - Args: - obs: Tensor, (batch_size, obs_dim) - actions: Tensor, (batch_size, act_dim) - advantages: Tensor (batch_size, ) - beta: Tensor (1) or None - if None, use CLIP Loss; else, use KLPEN loss. - """ - return self.policy_learn(obs, actions, advantages, beta) - def policy_learn(self, obs, actions, advantages, beta=None): """ Learn policy model with: 1. 
CLIP loss: Clipped Surrogate Objective @@ -196,27 +149,11 @@ class PPO(Algorithm): optimizer.minimize(loss) return loss, kl - @deprecated( - deprecated_in='1.2', - removed_in='1.3', - replace_function='value_predict') - def define_value_predict(self, obs): - """ Use value model of self.model to predict value of obs - """ - return self.value_predict(obs) - def value_predict(self, obs): """ Use value model of self.model to predict value of obs """ return self.model.value(obs) - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='value_learn') - def define_value_learn(self, obs, val): - """ Learn value model with square error cost - """ - return self.value_learn(obs, val) - def value_learn(self, obs, val): """ Learn the value model with square error cost """ @@ -227,12 +164,7 @@ class PPO(Algorithm): optimizer.minimize(loss) return loss - def sync_old_policy(self, gpu_id=None): + def sync_old_policy(self): """ Synchronize weights of self.model.policy_model to self.old_policy_model """ - if gpu_id is not None: - warnings.warn( - "the `gpu_id` argument of `sync_old_policy` function in `parl.Algorithms.PPO` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) self.model.policy_model.sync_weights_to(self.old_policy_model) diff --git a/parl/algorithms/fluid/sac.py b/parl/algorithms/fluid/sac.py index cec92c98568905af7bce64252e9f3ff0531da039..32d7b1edfca1498fb40ece392025d310e162dd50 100644 --- a/parl/algorithms/fluid/sac.py +++ b/parl/algorithms/fluid/sac.py @@ -102,11 +102,11 @@ class SAC(Algorithm): return cost def critic_learn(self, obs, action, reward, next_obs, terminal): - next_state_action, next_state_log_pi = self.sample(next_obs) + next_obs_action, next_obs_log_pi = self.sample(next_obs) qf1_next_target, qf2_next_target = self.target_critic.value( - next_obs, next_state_action) + next_obs, next_obs_action) min_qf_next_target = layers.elementwise_min( - qf1_next_target, qf2_next_target) - next_state_log_pi * self.alpha + qf1_next_target, qf2_next_target) - next_obs_log_pi * self.alpha terminal = layers.cast(terminal, dtype='float32') target_Q = reward + (1.0 - terminal) * self.gamma * min_qf_next_target diff --git a/parl/algorithms/fluid/tests/algs_test.py b/parl/algorithms/fluid/tests/algs_test.py new file mode 100644 index 0000000000000000000000000000000000000000..6d272b8f58b59f1e5d0167adfe305ab7ceb51679 --- /dev/null +++ b/parl/algorithms/fluid/tests/algs_test.py @@ -0,0 +1,699 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+ +import unittest +import numpy as np +import paddle.fluid as fluid +import parl +from parl import layers + + +class DQNModel(parl.Model): + def __init__(self): + self.fc1 = layers.fc(size=32, act='relu') + self.fc2 = layers.fc(size=2) + + def value(self, obs): + x = self.fc1(obs) + act = self.fc2(x) + return act + + +class DQNAgent(parl.Agent): + def __init__(self, algorithm): + super(DQNAgent, self).__init__(algorithm) + self.alg = algorithm + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.value = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + action = layers.data(name='act', shape=[1], dtype='int32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data(name='next_obs', shape=[4], dtype='float32') + lr = layers.data( + name='lr', shape=[1], dtype='float32', append_batch_size=False) + terminal = layers.data(name='terminal', shape=[], dtype='bool') + self.cost = self.alg.learn(obs, action, reward, next_obs, terminal, + lr) + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + pred_Q = self.fluid_executor.run( + self.pred_program, + feed={'obs': obs.astype('float32')}, + fetch_list=[self.value])[0] + pred_Q = np.squeeze(pred_Q, axis=0) + act = np.argmax(pred_Q) + return act + + def learn(self, obs, act, reward, next_obs, terminal): + lr = 3e-4 + + obs = np.expand_dims(obs, axis=0) + next_obs = np.expand_dims(next_obs, axis=0) + act = np.expand_dims(act, -1) + feed = { + 'obs': obs.astype('float32'), + 'act': act.astype('int32'), + 'reward': reward, + 'next_obs': next_obs.astype('float32'), + 'terminal': terminal, + 'lr': np.float32(lr) + } + cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.cost])[0] + return cost + + +class A3CModel(parl.Model): + def __init__(self): + self.fc = layers.fc(size=32, act='relu') + + self.policy_fc = layers.fc(size=2) + self.value_fc = layers.fc(size=1) + + def policy(self, obs): + x = self.fc(obs) + policy_logits = self.policy_fc(x) + + return policy_logits + + def value(self, obs): + x = self.fc(obs) + values = self.value_fc(x) + values = layers.squeeze(values, axes=[1]) + + return values + + def policy_and_value(self, obs): + x = self.fc(obs) + policy_logits = self.policy_fc(x) + values = self.value_fc(x) + values = layers.squeeze(values, axes=[1]) + + return policy_logits, values + + +class A3CAgent(parl.Agent): + def __init__(self, algorithm): + super(A3CAgent, self).__init__(algorithm) + self.alg = algorithm + + def build_program(self): + self.predict_program = fluid.Program() + self.value_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.predict_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.predict_actions = self.alg.predict(obs) + + with fluid.program_guard(self.value_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.values = self.alg.value(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + actions = layers.data(name='actions', shape=[], dtype='int64') + advantages = layers.data( + name='advantages', shape=[], dtype='float32') + target_values = layers.data( + name='target_values', shape=[], dtype='float32') + lr = layers.data( + name='lr', 
shape=[1], dtype='float32', append_batch_size=False) + entropy_coeff = layers.data( + name='entropy_coeff', + shape=[1], + dtype='float32', + append_batch_size=False) + + total_loss, pi_loss, vf_loss, entropy = self.alg.learn( + obs, actions, advantages, target_values, lr, entropy_coeff) + self.learn_outputs = [total_loss, pi_loss, vf_loss, entropy] + + def predict(self, obs_np): + obs_np = obs_np.astype('float32') + + predict_actions = self.fluid_executor.run( + self.predict_program, + feed={'obs': obs_np}, + fetch_list=[self.predict_actions])[0] + return predict_actions + + def value(self, obs_np): + obs_np = obs_np.astype('float32') + + values = self.fluid_executor.run( + self.value_program, feed={'obs': obs_np}, + fetch_list=[self.values])[0] + return values + + def learn(self, obs_np, actions_np, advantages_np, target_values_np): + obs_np = obs_np.astype('float32') + actions_np = actions_np.astype('int64') + advantages_np = advantages_np.astype('float32') + target_values_np = target_values_np.astype('float32') + + lr = 3e-4 + entropy_coeff = 0. + + total_loss, pi_loss, vf_loss, entropy = self.fluid_executor.run( + self.learn_program, + feed={ + 'obs': obs_np, + 'actions': actions_np, + 'advantages': advantages_np, + 'target_values': target_values_np, + 'lr': np.array([lr], dtype='float32'), + 'entropy_coeff': np.array([entropy_coeff], dtype='float32') + }, + fetch_list=self.learn_outputs) + return total_loss, pi_loss, vf_loss, entropy, lr, entropy_coeff + + +class IMPALAModel(parl.Model): + def __init__(self): + self.fc = layers.fc(size=32, act='relu') + + self.policy_fc = layers.fc(size=2) + self.value_fc = layers.fc(size=1) + + def policy(self, obs): + x = self.fc(obs) + policy_logits = self.policy_fc(x) + + return policy_logits + + def value(self, obs): + x = self.fc(obs) + values = self.value_fc(x) + values = layers.squeeze(values, axes=[1]) + + return values + + +class IMPALAAgent(parl.Agent): + def __init__(self, algorithm): + super(IMPALAAgent, self).__init__(algorithm) + self.alg = algorithm + + def build_program(self): + self.predict_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.predict_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.predict_actions = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + actions = layers.data(name='actions', shape=[], dtype='int64') + behaviour_logits = layers.data( + name='behaviour_logits', shape=[2], dtype='float32') + rewards = layers.data(name='rewards', shape=[], dtype='float32') + dones = layers.data(name='dones', shape=[], dtype='float32') + lr = layers.data( + name='lr', shape=[1], dtype='float32', append_batch_size=False) + entropy_coeff = layers.data( + name='entropy_coeff', + shape=[1], + dtype='float32', + append_batch_size=False) + + vtrace_loss, kl = self.alg.learn(obs, actions, behaviour_logits, + rewards, dones, lr, entropy_coeff) + self.learn_outputs = [ + vtrace_loss.total_loss, vtrace_loss.pi_loss, + vtrace_loss.vf_loss, vtrace_loss.entropy, kl + ] + + def predict(self, obs_np): + obs_np = obs_np.astype('float32') + + predict_actions = self.fluid_executor.run( + self.predict_program, + feed={'obs': obs_np}, + fetch_list=[self.predict_actions])[0] + return predict_actions + + def learn(self, obs, actions, behaviour_logits, rewards, dones, lr, + entropy_coeff): + total_loss, pi_loss, vf_loss, entropy, kl = self.fluid_executor.run( + self.learn_program, + feed={ + 
'obs': obs, + 'actions': actions, + 'behaviour_logits': behaviour_logits, + 'rewards': rewards, + 'dones': dones, + 'lr': np.array([lr], dtype='float32'), + 'entropy_coeff': np.array([entropy_coeff], dtype='float32') + }, + fetch_list=self.learn_outputs) + return total_loss, pi_loss, vf_loss, entropy, kl + + +class SACActor(parl.Model): + def __init__(self): + self.mean_linear = layers.fc(size=1) + self.log_std_linear = layers.fc(size=1) + + def policy(self, obs): + means = self.mean_linear(obs) + log_std = self.log_std_linear(obs) + + return means, log_std + + +class SACCritic(parl.Model): + def __init__(self): + self.fc1 = layers.fc(size=1) + self.fc2 = layers.fc(size=1) + + def value(self, obs, act): + concat = layers.concat([obs, act], axis=1) + Q1 = self.fc1(concat) + Q2 = self.fc2(concat) + Q1 = layers.squeeze(Q1, axes=[1]) + Q2 = layers.squeeze(Q2, axes=[1]) + return Q1, Q2 + + +class SACAgent(parl.Agent): + def __init__(self, algorithm): + super(SACAgent, self).__init__(algorithm) + self.alg.sync_target(decay=0) + + def build_program(self): + self.pred_program = fluid.Program() + self.sample_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.pred_act = self.alg.predict(obs) + + with fluid.program_guard(self.sample_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.sample_act, _ = self.alg.sample(obs) + + with fluid.program_guard(self.learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + act = layers.data(name='act', shape=[1], dtype='float32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data(name='next_obs', shape=[4], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + self.critic_cost, self.actor_cost = self.alg.learn( + obs, act, reward, next_obs, terminal) + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + act = self.fluid_executor.run( + self.pred_program, feed={'obs': obs}, + fetch_list=[self.pred_act])[0] + return act + + def sample(self, obs): + obs = np.expand_dims(obs, axis=0) + act = self.fluid_executor.run( + self.sample_program, + feed={'obs': obs}, + fetch_list=[self.sample_act])[0] + return act + + def learn(self, obs, act, reward, next_obs, terminal): + feed = { + 'obs': obs, + 'act': act, + 'reward': reward, + 'next_obs': next_obs, + 'terminal': terminal + } + [critic_cost, actor_cost] = self.fluid_executor.run( + self.learn_program, + feed=feed, + fetch_list=[self.critic_cost, self.actor_cost]) + return critic_cost[0], actor_cost[0] + + +class DDPGModel(parl.Model): + def __init__(self): + self.policy_fc = layers.fc(size=1) + self.value_fc = layers.fc(size=1) + + def policy(self, obs): + act = self.policy_fc(obs) + return act + + def value(self, obs, act): + concat = layers.concat([obs, act], axis=1) + Q = self.value_fc(concat) + Q = layers.squeeze(Q, axes=[1]) + return Q + + def get_actor_params(self): + return self.parameters()[:2] + + +class DDPGAgent(parl.Agent): + def __init__(self, algorithm): + super(DDPGAgent, self).__init__(algorithm) + self.alg.sync_target(decay=0) + + def build_program(self): + self.pred_program = fluid.Program() + self.learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.pred_act = self.alg.predict(obs) + + with fluid.program_guard(self.learn_program): + obs = 
layers.data(name='obs', shape=[4], dtype='float32') + act = layers.data(name='act', shape=[1], dtype='float32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data(name='next_obs', shape=[4], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + _, self.critic_cost = self.alg.learn(obs, act, reward, next_obs, + terminal) + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + act = self.fluid_executor.run( + self.pred_program, feed={'obs': obs}, + fetch_list=[self.pred_act])[0] + return act + + def learn(self, obs, act, reward, next_obs, terminal): + feed = { + 'obs': obs, + 'act': act, + 'reward': reward, + 'next_obs': next_obs, + 'terminal': terminal + } + critic_cost = self.fluid_executor.run( + self.learn_program, feed=feed, fetch_list=[self.critic_cost])[0] + self.alg.sync_target() + return critic_cost + + +class TD3Model(parl.Model): + def __init__(self): + self.actor_fc = layers.fc(size=1) + self.q1 = layers.fc(size=1) + self.q2 = layers.fc(size=1) + + def policy(self, obs): + return self.actor_fc(obs) + + def value(self, obs, act): + concat = layers.concat([obs, act], axis=1) + Q1 = self.q1(concat) + Q1 = layers.squeeze(Q1, axes=[1]) + Q2 = self.q2(concat) + Q2 = layers.squeeze(Q2, axes=[1]) + return Q1, Q2 + + def Q1(self, obs, act): + concat = layers.concat([obs, act], axis=1) + Q1 = self.q1(concat) + Q1 = layers.squeeze(Q1, axes=[1]) + return Q1 + + def get_actor_params(self): + return self.parameters()[:2] + + +class TD3Agent(parl.Agent): + def __init__(self, algorithm): + super(TD3Agent, self).__init__(algorithm) + self.alg.sync_target(decay=0) + + def build_program(self): + self.pred_program = fluid.Program() + self.actor_learn_program = fluid.Program() + self.critic_learn_program = fluid.Program() + + with fluid.program_guard(self.pred_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.pred_act = self.alg.predict(obs) + + with fluid.program_guard(self.actor_learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + self.actor_cost = self.alg.actor_learn(obs) + + with fluid.program_guard(self.critic_learn_program): + obs = layers.data(name='obs', shape=[4], dtype='float32') + act = layers.data(name='act', shape=[1], dtype='float32') + reward = layers.data(name='reward', shape=[], dtype='float32') + next_obs = layers.data(name='next_obs', shape=[4], dtype='float32') + terminal = layers.data(name='terminal', shape=[], dtype='bool') + self.critic_cost = self.alg.critic_learn(obs, act, reward, + next_obs, terminal) + + def predict(self, obs): + obs = np.expand_dims(obs, axis=0) + act = self.fluid_executor.run( + self.pred_program, feed={'obs': obs}, + fetch_list=[self.pred_act])[0] + return act + + def learn(self, obs, act, reward, next_obs, terminal): + feed = { + 'obs': obs, + 'act': act, + 'reward': reward, + 'next_obs': next_obs, + 'terminal': terminal + } + critic_cost = self.fluid_executor.run( + self.critic_learn_program, + feed=feed, + fetch_list=[self.critic_cost])[0] + + actor_cost = self.fluid_executor.run( + self.actor_learn_program, + feed={'obs': obs}, + fetch_list=[self.actor_cost])[0] + self.alg.sync_target() + return actor_cost, critic_cost + + +class PARLtest(unittest.TestCase): + def setUp(self): + # set up DQN test + DQN_model = DQNModel() + DQN_alg = parl.algorithms.DQN(DQN_model, act_dim=2, gamma=0.9) + self.DQN_agent = DQNAgent(DQN_alg) + + # set up A3C test + A3C_model = A3CModel() + A3C_alg = parl.algorithms.A3C(A3C_model, 
vf_loss_coeff=0.) + self.A3C_agent = A3CAgent(A3C_alg) + + # set up IMPALA test + IMPALA_model = IMPALAModel() + IMPALA_alg = parl.algorithms.IMPALA( + IMPALA_model, + sample_batch_steps=4, + gamma=0.9, + vf_loss_coeff=0., + clip_rho_threshold=1., + clip_pg_rho_threshold=1.) + self.IMPALA_agent = IMPALAAgent(IMPALA_alg) + + # set up SAC test + SAC_actor = SACActor() + SAC_critic = SACCritic() + SAC_alg = parl.algorithms.SAC( + SAC_actor, + SAC_critic, + max_action=1., + gamma=0.99, + tau=0.005, + actor_lr=1e-3, + critic_lr=1e-3) + self.SAC_agent = SACAgent(SAC_alg) + + # set up DDPG test + DDPG_model = DDPGModel() + DDPG_alg = parl.algorithms.DDPG( + DDPG_model, gamma=0.99, tau=0.001, actor_lr=3e-4, critic_lr=3e-4) + self.DDPG_agent = DDPGAgent(DDPG_alg) + + # set up TD3 test + TD3_model = TD3Model() + TD3_alg = parl.algorithms.TD3( + TD3_model, + 1., + gamma=0.99, + tau=0.005, + actor_lr=3e-4, + critic_lr=3e-4) + self.TD3_agent = TD3Agent(TD3_alg) + + def test_DQN_predict(self): + """Test APIs in PARL DQN predict + """ + obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496]) + + act = self.DQN_agent.predict(obs) + + def test_DQN_learn(self): + """Test APIs in PARL DQN learn + """ + obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496]) + next_obs = np.array([-0.02332638, -0.16414229, 0.01142936, 0.29949173]) + terminal = np.array([False]).astype('bool') + reward = np.array([1.0]).astype('float32') + act = np.array([0]).astype('int32') + + cost = self.DQN_agent.learn(obs, act, reward, next_obs, terminal) + + def test_A3C_predict(self): + """Test APIs in PARL A3C predict + """ + obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496]) + obs = np.expand_dims(obs, axis=0) + + logits = self.A3C_agent.predict(obs) + + def test_A3C_value(self): + """Test APIs in PARL A3C predict + """ + obs = np.array([-0.02394919, 0.03114079, 0.01136446, 0.00324496]) + obs = np.expand_dims(obs, axis=0) + + values = self.A3C_agent.value(obs) + + def test_A3C_learn(self): + """Test APIs in PARL A3C learn + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, 0.00324496]]) + action = np.array([0]) + advantages = np.array([-0.02332638]) + target_values = np.array([1.]) + + self.A3C_agent.learn(obs, action, advantages, target_values) + + def test_IMPALA_predict(self): + """Test APIs in PARL IMPALA predict + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, 0.00324496]]) + + policy = self.IMPALA_agent.predict(obs) + + def test_IMPALA_learn(self): + """Test APIs in PARL IMPALA learn + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, 0.00324496], + [-0.02394919, 0.03114079, 0.01136446, 0.00324496], + [-0.02394919, 0.03114079, 0.01136446, 0.00324496], + [-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype('float32') + actions = np.array([1, 1, 1, 1]).astype('int32') + behaviour_logits = np.array([[-1, 1], [-1, 1], [-1, 1], + [-1, 1]]).astype('float32') + rewards = np.array([0, 0, 0, 0]).astype('float32') + dones = np.array([False, False, False, False]).astype('float32') + lr = 3e-4 + entropy_coeff = 0. 
+ + total_loss, pi_loss, vf_loss, entropy, kl = self.IMPALA_agent.learn( + obs, actions, behaviour_logits, rewards, dones, lr, entropy_coeff) + + def test_SAC_predict(self): + """Test APIs in PARL SAC predict + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + act = self.SAC_agent.predict(obs) + + def test_SAC_sample(self): + """Test APIs in PARL SAC sample + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + act = self.SAC_agent.sample(obs) + + def test_SAC_learn(self): + """Test APIs in PARL SAC learn + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + next_obs = np.array( + [[-0.02332638, -0.16414229, 0.01142936, + 0.29949173]]).astype(np.float32) + terminal = np.array([False]).astype('bool') + reward = np.array([1.0]).astype('float32') + act = np.array([[0.]]).astype('float32') + + critic_cost, actor_cost = self.SAC_agent.learn(obs, act, reward, + next_obs, terminal) + + def test_DDPG_predict(self): + """Test APIs in PARL DDPG predict + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + act = self.DDPG_agent.predict(obs) + + def test_DDPG_learn(self): + """Test APIs in PARL DDPG learn + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + next_obs = np.array( + [[-0.02332638, -0.16414229, 0.01142936, + 0.29949173]]).astype(np.float32) + terminal = np.array([False]).astype('bool') + reward = np.array([1.0]).astype('float32') + act = np.array([[0.]]).astype('float32') + + critic_cost = self.DDPG_agent.learn(obs, act, reward, + next_obs, terminal) + + def test_TD3_predict(self): + """Test APIs in PARL TD3 predict + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + act = self.TD3_agent.predict(obs) + + def test_TD3_learn(self): + """Test APIs in PARL TD3 learn + """ + obs = np.array([[-0.02394919, 0.03114079, 0.01136446, + 0.00324496]]).astype(np.float32) + next_obs = np.array( + [[-0.02332638, -0.16414229, 0.01142936, + 0.29949173]]).astype(np.float32) + terminal = np.array([False]).astype('bool') + reward = np.array([1.0]).astype('float32') + act = np.array([[0.]]).astype('float32') + + critic_cost, actor_cost = self.TD3_agent.learn(obs, act, reward, + next_obs, terminal) + + +if __name__ == '__main__': + unittest.main() diff --git a/parl/algorithms/torch/__init__.py b/parl/algorithms/torch/__init__.py index 97826766b2581762d1080c05eb979bc4d0e4b03f..9de7afbdd57305b1280b024556e0b1730bcbc494 100644 --- a/parl/algorithms/torch/__init__.py +++ b/parl/algorithms/torch/__init__.py @@ -16,5 +16,5 @@ from parl.algorithms.torch.ddqn import * from parl.algorithms.torch.dqn import * from parl.algorithms.torch.a2c import * from parl.algorithms.torch.td3 import * -from parl.algorithms.torch.coma import * +from parl.algorithms.torch.ppo import * from parl.algorithms.torch.policy_gradient import * diff --git a/parl/algorithms/torch/a2c.py b/parl/algorithms/torch/a2c.py index 3d78ce75938c583e15e4f7321ad836d869ef25b1..43e373907db821fedb4759138de064bd8dda9afa 100644 --- a/parl/algorithms/torch/a2c.py +++ b/parl/algorithms/torch/a2c.py @@ -27,7 +27,7 @@ __all__ = ['A2C'] class A2C(parl.Algorithm): - def __init__(self, model, config, hyperparas=None): + def __init__(self, model, config): assert isinstance(config['vf_loss_coeff'], (int, float)) self.model = model self.vf_loss_coeff = config['vf_loss_coeff'] diff --git
a/parl/algorithms/torch/ppo.py b/parl/algorithms/torch/ppo.py new file mode 100644 index 0000000000000000000000000000000000000000..7c838e896e26b35fa078d1db1323476fb776993f --- /dev/null +++ b/parl/algorithms/torch/ppo.py @@ -0,0 +1,94 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import parl +import torch +import torch.nn as nn +import torch.nn.functional as F +import torch.optim as optim +from torch.distributions import Normal + +__all__ = ['PPO'] + + +class PPO(parl.Algorithm): + def __init__(self, + model, + clip_param, + value_loss_coef, + entropy_coef, + initial_lr, + eps=None, + max_grad_norm=None, + use_clipped_value_loss=True): + self.model = model + + self.clip_param = clip_param + + self.value_loss_coef = value_loss_coef + self.entropy_coef = entropy_coef + + self.max_grad_norm = max_grad_norm + self.use_clipped_value_loss = use_clipped_value_loss + + self.optimizer = optim.Adam(model.parameters(), lr=initial_lr, eps=eps) + + def learn(self, obs_batch, actions_batch, value_preds_batch, return_batch, + old_action_log_probs_batch, adv_targ): + values = self.model.value(obs_batch) + mean, log_std = self.model.policy(obs_batch) + dist = Normal(mean, log_std.exp()) + + action_log_probs = dist.log_prob(actions_batch).sum(-1, keepdim=True) + dist_entropy = dist.entropy().sum(-1).mean() + + ratio = torch.exp(action_log_probs - old_action_log_probs_batch) + surr1 = ratio * adv_targ + surr2 = torch.clamp(ratio, 1.0 - self.clip_param, + 1.0 + self.clip_param) * adv_targ + action_loss = -torch.min(surr1, surr2).mean() + + if self.use_clipped_value_loss: + value_pred_clipped = value_preds_batch + \ + (values - value_preds_batch).clamp(-self.clip_param, self.clip_param) + value_losses = (values - return_batch).pow(2) + value_losses_clipped = (value_pred_clipped - return_batch).pow(2) + value_loss = 0.5 * torch.max(value_losses, + value_losses_clipped).mean() + else: + value_loss = 0.5 * (return_batch - values).pow(2).mean() + + self.optimizer.zero_grad() + (value_loss * self.value_loss_coef + action_loss - + dist_entropy * self.entropy_coef).backward() + nn.utils.clip_grad_norm_(self.model.parameters(), self.max_grad_norm) + self.optimizer.step() + + return value_loss.item(), action_loss.item(), dist_entropy.item() + + def sample(self, obs): + value = self.model.value(obs) + mean, log_std = self.model.policy(obs) + dist = Normal(mean, log_std.exp()) + action = dist.sample() + action_log_probs = dist.log_prob(action).sum(-1, keepdim=True) + + return value, action, action_log_probs + + def predict(self, obs): + mean, _ = self.model.policy(obs) + return mean + + def value(self, obs): + return self.model.value(obs) diff --git a/parl/core/fluid/agent.py b/parl/core/fluid/agent.py index 8972443c453e75e022751cee707d9bbaeda649df..a3e196358d5775bf15e7730d44e4b6ee2706f668 100644 --- a/parl/core/fluid/agent.py +++ b/parl/core/fluid/agent.py @@ -15,9 +15,9 @@ import warnings warnings.simplefilter('default') +import os import paddle.fluid 
as fluid from parl.core.fluid import layers -from parl.utils.deprecation import deprecated from parl.core.agent_base import AgentBase from parl.core.fluid.algorithm import Algorithm from parl.utils import machine_info @@ -46,7 +46,6 @@ class Agent(AgentBase): This class will initialize the neural network parameters automatically, and provides an executor for users to run the programs (self.fluid_executor). Attributes: - gpu_id (int): deprecated. specify which GPU to be used. -1 if to use the CPU. fluid_executor (fluid.Executor): executor for running programs of the agent. alg (parl.algorithm): algorithm of this agent. @@ -65,18 +64,12 @@ class Agent(AgentBase): """ - def __init__(self, algorithm, gpu_id=None): + def __init__(self, algorithm): """Build programs by calling the method ``self.build_program()`` and run initialization function of ``fluid.default_startup_program()``. Args: algorithm (parl.Algorithm): an instance of `parl.Algorithm`. This algorithm is then passed to `self.alg`. - gpu_id (int): deprecated. specify which GPU to be used. -1 if to use the CPU. """ - if gpu_id is not None: - warnings.warn( - "the `gpu_id` argument of `__init__` function in `parl.Agent` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) assert isinstance(algorithm, Algorithm) super(Agent, self).__init__(algorithm) @@ -119,26 +112,6 @@ class Agent(AgentBase): """ raise NotImplementedError - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='get_weights') - def get_params(self): - """ Returns a Python dictionary containing the whole parameters of self.alg. - - Returns: - a Python List containing the parameters of self.alg. - """ - return self.algorithm.get_params() - - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='set_weights') - def set_params(self, params): - """Copy parameters from ``get_params()`` into this agent. - - Args: - params(dict): a Python List containing the parameters of self.alg. - """ - self.algorithm.set_params(params) - def learn(self, *args, **kwargs): """The training interface for ``Agent``. This function feeds the training data into the learn_program defined in ``build_program()``. 
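Note: since ``gpu_id`` and the ``get_params``/``set_params`` wrappers are removed from the fluid ``Agent``, call sites reduce to the sketch below; ``MyAgent`` is a hypothetical subclass, and device placement is assumed to be controlled by the environment (e.g. CUDA_VISIBLE_DEVICES) rather than a constructor argument:

    agent = MyAgent(algorithm)      # was: MyAgent(algorithm, gpu_id=0)
    weights = agent.get_weights()   # replaces the removed get_params()
    agent.set_weights(weights)      # replaces the removed set_params(params)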
@@ -180,8 +153,8 @@ class Agent(AgentBase): """ if program is None: program = self.learn_program - dirname = '/'.join(save_path.split('/')[:-1]) - filename = save_path.split('/')[-1] + dirname = os.sep.join(save_path.split(os.sep)[:-1]) + filename = save_path.split(os.sep)[-1] fluid.io.save_params( executor=self.fluid_executor, dirname=dirname, @@ -214,8 +187,8 @@ class Agent(AgentBase): program = self.learn_program if type(program) is fluid.compiler.CompiledProgram: program = program._init_program - dirname = '/'.join(save_path.split('/')[:-1]) - filename = save_path.split('/')[-1] + dirname = os.sep.join(save_path.split(os.sep)[:-1]) + filename = save_path.split(os.sep)[-1] fluid.io.load_params( executor=self.fluid_executor, dirname=dirname, diff --git a/parl/core/fluid/algorithm.py b/parl/core/fluid/algorithm.py index 1a05a9991a658e13282f847f2cf4772dc19b2572..2267e3b6d8191d3f3b6028f9e188e7ad5394c863 100644 --- a/parl/core/fluid/algorithm.py +++ b/parl/core/fluid/algorithm.py @@ -17,7 +17,6 @@ warnings.simplefilter('default') from parl.core.algorithm_base import AlgorithmBase from parl.core.fluid.model import Model -from parl.utils.deprecation import deprecated __all__ = ['Algorithm'] @@ -57,47 +56,13 @@ class Algorithm(AlgorithmBase): """ - def __init__(self, model=None, hyperparas=None): + def __init__(self, model=None): """ Args: model(``parl.Model``): a neural network that represents a policy or a Q-value function. - hyperparas(dict): a dict storing the hyper-parameters relative to training. """ - if model is not None: - warnings.warn( - "the `model` argument of `__init__` function in `parl.Algorithm` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - - assert isinstance(model, Model) - self.model = model - if hyperparas is not None: - warnings.warn( - "the `hyperparas` argument of `__init__` function in `parl.Algorithm` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - - self.hp = hyperparas - - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='get_weights') - def get_params(self): - """ Get parameters of self.model. - - Returns: - params(dict): a Python List containing the parameters of self.model. - """ - return self.model.get_params() - - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='set_weights') - def set_params(self, params): - """ Set parameters from ``get_params`` to the model. - - Args: - params(dict ): a Python List containing the parameters of self.model. - """ - self.model.set_params(params) + assert isinstance(model, Model) + self.model = model def learn(self, *args, **kwargs): """ Define the loss function and create an optimizer to minize the loss. 
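Note: ``save`` and ``restore`` now derive ``dirname``/``filename`` by splitting on ``os.sep``, so checkpoint paths should be built with ``os.path.join`` (as the updated tests later in this patch do) instead of hard-coding '/'. A small usage sketch, assuming ``agent`` is an initialized ``parl.Agent``:

    import os

    save_path = os.path.join('my_model', 'model.ckpt')
    agent.save(save_path)       # parameters written under ./my_model/
    agent.restore(save_path)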
diff --git a/parl/core/fluid/layers/tests/param_sharing_test.py b/parl/core/fluid/layers/tests/param_sharing_test.py index f457f4571375c70c8ed0333e6727ce97991996bb..d26048b08daae9ae341e88b402898f40ecd22ca4 100644 --- a/parl/core/fluid/layers/tests/param_sharing_test.py +++ b/parl/core/fluid/layers/tests/param_sharing_test.py @@ -45,7 +45,7 @@ class TestParamSharing(unittest.TestCase): dict_size = 100 input_cx = np.random.uniform(0, 1, [batch_size, 100]).astype("float32") input_x = np.random.randint( - dict_size, size=(batch_size, 1)).astype("int") + dict_size, size=(batch_size, 1)).astype("int64") ################################# main_program1 = fluid.Program() @@ -59,7 +59,7 @@ class TestParamSharing(unittest.TestCase): main_program2 = fluid.Program() with fluid.program_guard(main_program2): - x_ = layers.data(name='x', shape=[1], dtype="int") + x_ = layers.data(name='x', shape=[1], dtype="int64") cx_ = layers.cast( x=layers.one_hot(input=x_, depth=dict_size), dtype="float32") y1_ = net.fc1(input=cx_) diff --git a/parl/core/fluid/model.py b/parl/core/fluid/model.py index 38d653ad20275d281a1bca4cf63d1198475a8696..bf7069a68c53748d870c1d9d21c2ec971fee05fe 100644 --- a/parl/core/fluid/model.py +++ b/parl/core/fluid/model.py @@ -17,7 +17,6 @@ import paddle.fluid as fluid from parl.core.fluid.layers.layer_wrappers import LayerFunc from parl.core.fluid.plutils import * from parl.core.model_base import ModelBase -from parl.utils.deprecation import deprecated from parl.utils import machine_info __all__ = ['Model'] @@ -67,30 +66,6 @@ class Model(ModelBase): """ - @deprecated( - deprecated_in='1.2', - removed_in='1.3', - replace_function='sync_weights_to') - def sync_params_to(self, - target_net, - gpu_id=None, - decay=0.0, - share_vars_parallel_executor=None): - """Synchronize parameters in the model to another model (target_net). - - target_net_weights = decay * target_net_weights + (1 - decay) * source_net_weights - - Args: - target_model (`parl.Model`): an instance of ``Model`` that has the same neural network architecture as the current model. - decay (float): the rate of decline in copying parameters. 0 if no parameters decay when synchronizing the parameters. - share_vars_parallel_executor (fluid.ParallelExecutor): Optional. If not None, will use fluid.ParallelExecutor - to run program instead of fluid.Executor - """ - self.sync_weights_to( - target_model=target_net, - decay=decay, - share_vars_parallel_executor=share_vars_parallel_executor) - def sync_weights_to(self, target_model, decay=0.0, @@ -181,21 +156,6 @@ class Model(ModelBase): else: self._cached_fluid_executor.run(fetch_list=[]) - @property - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='parameters') - def parameter_names(self): - """Get names of all parameters in this ``Model``. - - Only parameters created by ``parl.layers`` are included. - The order of parameter names is consistent among - different instances of the same `Model`. - - Returns: - param_names(list): list of string containing parameter names of all parameters. - """ - return self.parameters() - def parameters(self): """Get names of all parameters in this ``Model``. @@ -223,26 +183,6 @@ class Model(ModelBase): self._parameter_names = self._get_parameter_names(self) return self._parameter_names - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='get_weights') - def get_params(self): - """ Return a Python list containing parameters of current model. 
- - Returns: - parameters: a Python list containing parameters of the current model. - """ - return self.get_weights() - - @deprecated( - deprecated_in='1.2', removed_in='1.3', replace_function='set_weights') - def set_params(self, params, gpu_id=None): - """Set parameters in the model with params. - - Args: - params (List): List of numpy array . - """ - self.set_weights(weights=params) - def get_weights(self): """Returns a Python list containing parameters of current model. diff --git a/parl/core/fluid/tests/agent_base_test_.py b/parl/core/fluid/tests/agent_base_test.py similarity index 94% rename from parl/core/fluid/tests/agent_base_test_.py rename to parl/core/fluid/tests/agent_base_test.py index cd8ca7d06f72ae99c51f12e639d9b0de1080ba7f..5a7f8ac12aaeee0d57daf92b118d1307a65a1cc0 100644 --- a/parl/core/fluid/tests/agent_base_test_.py +++ b/parl/core/fluid/tests/agent_base_test.py @@ -46,8 +46,8 @@ class TestAlgorithm(parl.Algorithm): class TestAgent(parl.Agent): - def __init__(self, algorithm, gpu_id=None): - super(TestAgent, self).__init__(algorithm, gpu_id) + def __init__(self, algorithm): + super(TestAgent, self).__init__(algorithm) def build_program(self): self.predict_program = fluid.Program() @@ -92,8 +92,8 @@ class AgentBaseTest(unittest.TestCase): agent = TestAgent(self.algorithm) obs = np.random.random([3, 10]).astype('float32') output_np = agent.predict(obs) - save_path1 = './model.ckpt' - save_path2 = './my_model/model-2.ckpt' + save_path1 = 'model.ckpt' + save_path2 = os.path.join('my_model', 'model-2.ckpt') agent.save(save_path1) agent.save(save_path2) self.assertTrue(os.path.exists(save_path1)) @@ -103,7 +103,7 @@ class AgentBaseTest(unittest.TestCase): agent = TestAgent(self.algorithm) obs = np.random.random([3, 10]).astype('float32') output_np = agent.predict(obs) - save_path1 = './model.ckpt' + save_path1 = 'model.ckpt' previous_output = agent.predict(obs) agent.save(save_path1) agent.restore(save_path1) @@ -121,7 +121,7 @@ class AgentBaseTest(unittest.TestCase): agent.learn_program = parl.compile(agent.learn_program) obs = np.random.random([3, 10]).astype('float32') previous_output = agent.predict(obs) - save_path1 = './model.ckpt' + save_path1 = 'model.ckpt' agent.save(save_path1) agent.restore(save_path1) diff --git a/parl/core/fluid/tests/model_base_test_.py b/parl/core/fluid/tests/model_base_test.py similarity index 94% rename from parl/core/fluid/tests/model_base_test_.py rename to parl/core/fluid/tests/model_base_test.py index 1656366a2fd97daf019b2cfb42f1ab7be640a65a..faa13684c90d6eabcc0c7561fb386a035f63c4ac 100644 --- a/parl/core/fluid/tests/model_base_test_.py +++ b/parl/core/fluid/tests/model_base_test.py @@ -690,6 +690,43 @@ class ModelBaseTest(unittest.TestCase): self.executor.run( pred_program, feed={'obs': x}, fetch_list=[model_output]) + def test_get_weights_set_weights_with_create_parameter(self): + model1 = TestModel2() + model2 = TestModel2() + + pred_program = fluid.Program() + with fluid.program_guard(pred_program): + obs = layers.data(name='obs', shape=[100], dtype='float32') + model1_output = model1.predict(obs) + model2_output = model2.predict(obs) + + self.executor.run(fluid.default_startup_program()) + + N = 10 + random_obs = np.random.random(size=(N, 100)).astype('float32') + for i in range(N): + x = np.expand_dims(random_obs[i], axis=0) + outputs = self.executor.run( + pred_program, + feed={'obs': x}, + fetch_list=[model1_output, model2_output]) + self.assertNotEqual( + np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten())) + + # pass 
parameters of self.model to model2 + params = model1.get_weights() + model2.set_weights(params) + + random_obs = np.random.random(size=(N, 100)).astype('float32') + for i in range(N): + x = np.expand_dims(random_obs[i], axis=0) + outputs = self.executor.run( + pred_program, + feed={'obs': x}, + fetch_list=[model1_output, model2_output]) + self.assertEqual( + np.sum(outputs[0].flatten()), np.sum(outputs[1].flatten())) + if __name__ == '__main__': unittest.main() diff --git a/parl/core/fluid/tests/policy_distribution_test_.py b/parl/core/fluid/tests/policy_distribution_test.py similarity index 100% rename from parl/core/fluid/tests/policy_distribution_test_.py rename to parl/core/fluid/tests/policy_distribution_test.py diff --git a/parl/core/torch/agent.py b/parl/core/torch/agent.py index 5d8bb2195dc0fdff48c2e9a3d5f477b793af06eb..7cc87fad2f7d294a707ac5dfd6dad4eedd566b46 100644 --- a/parl/core/torch/agent.py +++ b/parl/core/torch/agent.py @@ -113,8 +113,9 @@ class Agent(AgentBase): """ if model is None: model = self.algorithm.model - dirname = '/'.join(save_path.split('/')[:-1]) - if not os.path.exists(dirname): + sep = os.sep + dirname = sep.join(save_path.split(sep)[:-1]) + if dirname != '' and not os.path.exists(dirname): os.makedirs(dirname) torch.save(model.state_dict(), save_path) diff --git a/parl/core/torch/tests/agent_base_test_torch.py b/parl/core/torch/tests/agent_base_test_torch.py index 96caf7532c38bafea6ba33d41ecb173361c525ac..7bf468db86a7106d62c809bca52c32f8cb55f39a 100644 --- a/parl/core/torch/tests/agent_base_test_torch.py +++ b/parl/core/torch/tests/agent_base_test_torch.py @@ -77,8 +77,8 @@ class AgentBaseTest(unittest.TestCase): def test_save(self): agent = TestAgent(self.alg) obs = torch.randn(3, 10) - save_path1 = './model.ckpt' - save_path2 = './my_model/model-2.ckpt' + save_path1 = 'model.ckpt' + save_path2 = os.path.join('my_model', 'model-2.ckpt') agent.save(save_path1) agent.save(save_path2) self.assertTrue(os.path.exists(save_path1)) @@ -88,7 +88,7 @@ class AgentBaseTest(unittest.TestCase): agent = TestAgent(self.alg) obs = torch.randn(3, 10) output = agent.predict(obs) - save_path1 = './model.ckpt' + save_path1 = 'model.ckpt' previous_output = agent.predict(obs).detach().cpu().numpy() agent.save(save_path1) agent.restore(save_path1) diff --git a/parl/framework/__init__.py b/parl/framework/__init__.py deleted file mode 100644 index 4e48085338abbc3731935722515c5591333922d8..0000000000000000000000000000000000000000 --- a/parl/framework/__init__.py +++ /dev/null @@ -1,25 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
-import warnings - -warnings.simplefilter('default') - -warnings.warn( - "import way `import parl.framework` is deprecated since version 1.2 and will be removed in version 1.3.", - DeprecationWarning, - stacklevel=2) - -from parl.core.fluid.model import * -from parl.core.fluid.algorithm import * -from parl.core.fluid.agent import * diff --git a/parl/layers/__init__.py b/parl/layers/__init__.py deleted file mode 100644 index 3283927adcb620094c0df4dea0a0ccf8533e3766..0000000000000000000000000000000000000000 --- a/parl/layers/__init__.py +++ /dev/null @@ -1,24 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -import warnings - -warnings.simplefilter('default') - -warnings.warn( - "import way `import parl.layers` is deprecated since version 1.2 and will be removed in version 1.3, please use `from parl import layers` or `import parl; parl.layers` instead.", - DeprecationWarning, - stacklevel=2) - -from parl.core.fluid.layers import * diff --git a/parl/plutils/__init__.py b/parl/plutils/__init__.py deleted file mode 100644 index 8bac1d7d3677b82f03e8c64066ef6748fa03d577..0000000000000000000000000000000000000000 --- a/parl/plutils/__init__.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. - -print( - "import way `import parl.plutils` is deprecated since version 1.2 and will be removed in version 1.3, please use `from parl import plutils` or `import parl; parl.plutils` instead." -) - -from parl.core.fluid.plutils.common import * diff --git a/parl/plutils/common.py b/parl/plutils/common.py deleted file mode 100644 index 8bac1d7d3677b82f03e8c64066ef6748fa03d577..0000000000000000000000000000000000000000 --- a/parl/plutils/common.py +++ /dev/null @@ -1,19 +0,0 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. -# -# Licensed under the Apache License, Version 2.0 (the "License"); -# you may not use this file except in compliance with the License. -# You may obtain a copy of the License at -# -# http://www.apache.org/licenses/LICENSE-2.0 -# -# Unless required by applicable law or agreed to in writing, software -# distributed under the License is distributed on an "AS IS" BASIS, -# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -# See the License for the specific language governing permissions and -# limitations under the License. 
- -print( - "import way `import parl.plutils` is deprecated since version 1.2 and will be removed in version 1.3, please use `from parl import plutils` or `import parl; parl.plutils` instead." -) - -from parl.core.fluid.plutils.common import * diff --git a/parl/remote/client.py b/parl/remote/client.py index 0c06effc6aa6554dc72c39ec75739bc82c11453f..379459c5768914a012cf89182724f1233cbf1329 100644 --- a/parl/remote/client.py +++ b/parl/remote/client.py @@ -59,6 +59,7 @@ class Client(object): self.heartbeat_socket_initialized = threading.Event() self.master_is_alive = True self.client_is_alive = True + self.log_monitor_url = None self.executable_path = self.get_executable_path() @@ -105,9 +106,19 @@ class Client(object): for file in distributed_files: assert os.path.exists(file) + assert not os.path.isabs( + file + ), "[XPARL] Please do not distribute a file with absolute path." with open(file, 'rb') as f: content = f.read() pyfiles['other_files'][file] = content + # append entry file to code list + main_file = sys.argv[0] + with open(main_file, 'rb') as code_file: + code = code_file.read() + # parl/remote/remote_decorator.py -> remote_decorator.py + file_name = main_file.split(os.sep)[-1] + pyfiles['python_files'][file_name] = code except AssertionError as e: raise Exception( 'Failed to create the client, the file {} does not exist.'. @@ -132,14 +143,19 @@ class Client(object): thread.start() self.heartbeat_socket_initialized.wait() + self.client_id = self.reply_master_heartbeat_address.replace(':', '_') + \ + '_' + str(int(time.time())) + # check if the master is connected properly try: self.submit_job_socket.send_multipart([ remote_constants.CLIENT_CONNECT_TAG, - to_byte(self.heartbeat_master_address), - to_byte(socket.gethostname()) + to_byte(self.reply_master_heartbeat_address), + to_byte(socket.gethostname()), + to_byte(self.client_id), ]) - _ = self.submit_job_socket.recv_multipart() + message = self.submit_job_socket.recv_multipart() + self.log_monitor_url = to_str(message[1]) except zmq.error.Again as e: logger.warning("[Client] Can not connect to the master, please " "check if master is started and ensure the input " @@ -150,17 +166,18 @@ class Client(object): "address {} is correct.".format(master_address)) def _reply_heartbeat(self): - """Reply heartbeat signals to the specific node.""" + """Reply heartbeat signals to the master node.""" socket = self.ctx.socket(zmq.REP) socket.linger = 0 socket.setsockopt(zmq.RCVTIMEO, remote_constants.HEARTBEAT_RCVTIMEO_S * 1000) - heartbeat_master_port =\ + reply_master_heartbeat_port =\ socket.bind_to_random_port(addr="tcp://*") - self.heartbeat_master_address = "{}:{}".format(get_ip_address(), - heartbeat_master_port) + self.reply_master_heartbeat_address = "{}:{}".format( + get_ip_address(), reply_master_heartbeat_port) self.heartbeat_socket_initialized.set() + connected = False while self.client_is_alive and self.master_is_alive: try: message = socket.recv_multipart() @@ -170,11 +187,18 @@ class Client(object): remote_constants.HEARTBEAT_TAG, to_byte(self.executable_path), to_byte(str(self.actor_num)), - to_byte(str(elapsed_time)) - ]) + to_byte(str(elapsed_time)), + to_byte(str(self.log_monitor_url)), + ]) # TODO: remove additional information except zmq.error.Again as e: - logger.warning("[Client] Cannot connect to the master." - "Please check if it is still alive.") + if connected: + logger.warning("[Client] Cannot connect to the master." 
+ "Please check if it is still alive.") + else: + logger.warning( + "[Client] Cannot connect to the master." + "Please check the firewall between client and master.(e.g., ping the master IP)" + ) self.master_is_alive = False socket.close(0) logger.warning("Client exit replying heartbeat for master.") @@ -182,7 +206,7 @@ class Client(object): def _check_and_monitor_job(self, job_heartbeat_address, ping_heartbeat_address, max_memory): """ Sometimes the client may receive a job that is dead, thus - we have to check if this job is still alive before sending it to the actor. + we have to check if this job is still alive before adding it to the `actor_num`. """ # job_heartbeat_socket: sends heartbeat signal to job job_heartbeat_socket = self.ctx.socket(zmq.REQ) @@ -271,7 +295,8 @@ class Client(object): self.lock.acquire() self.submit_job_socket.send_multipart([ remote_constants.CLIENT_SUBMIT_TAG, - to_byte(self.heartbeat_master_address) + to_byte(self.reply_master_heartbeat_address), + to_byte(self.client_id), ]) message = self.submit_job_socket.recv_multipart() self.lock.release() @@ -326,9 +351,10 @@ def connect(master_address, distributed_files=[]): Exception: An exception is raised if the master node is not started. """ - assert len(master_address.split(":")) == 2, "please input address in " +\ + assert len(master_address.split(":")) == 2, "Please input address in " +\ "{ip}:{port} format" global GLOBAL_CLIENT + addr = master_address.split(":")[0] cur_process_id = os.getpid() if GLOBAL_CLIENT is None: GLOBAL_CLIENT = Client(master_address, cur_process_id, @@ -337,6 +363,8 @@ def connect(master_address, distributed_files=[]): if GLOBAL_CLIENT.process_id != cur_process_id: GLOBAL_CLIENT = Client(master_address, cur_process_id, distributed_files) + logger.info("Remote actors log url: {}".format( + GLOBAL_CLIENT.log_monitor_url)) def get_global_client(): @@ -366,5 +394,5 @@ def disconnect(): GLOBAL_CLIENT = None else: logger.info( - "No client to be released. Please make sure that you have call `parl.connect`" + "No client to be released. Please make sure that you have called `parl.connect`" ) diff --git a/parl/remote/cluster_monitor.py b/parl/remote/cluster_monitor.py index 99bc2beac9e4e60565c31213ec05bdb96c7678a2..889f91586161f94cddb2a16670360cc8b9d4aca0 100644 --- a/parl/remote/cluster_monitor.py +++ b/parl/remote/cluster_monitor.py @@ -28,7 +28,8 @@ class ClusterMonitor(object): def __init__(self): self.status = { 'workers': defaultdict(dict), - 'clients': defaultdict(dict) + 'clients': defaultdict(dict), + 'client_jobs': defaultdict(dict), } self.lock = threading.Lock() @@ -46,6 +47,11 @@ class ClusterMonitor(object): worker_status['hostname'] = hostname self.lock.release() + def add_client_job(self, client_id, job_info): + self.lock.acquire() + self.status['client_jobs'][client_id].update(job_info) + self.lock.release() + def update_client_status(self, client_status, client_address, client_hostname): """Update client status with message send from client heartbeat. @@ -61,7 +67,8 @@ class ClusterMonitor(object): 'client_address': client_hostname, 'file_path': to_str(client_status[1]), 'actor_num': int(to_str(client_status[2])), - 'time': to_str(client_status[3]) + 'time': to_str(client_status[3]), + 'log_monitor_url': to_str(client_status[4]), } self.lock.release() @@ -96,14 +103,15 @@ class ClusterMonitor(object): self.status['workers'].pop(worker_address) self.lock.release() - def drop_cluster_status(self, client_address): - """Drop cluster status when it exits. 
+ def drop_client_status(self, client_address): + """Drop client status when it exits. Args: - cluster_address (str): IP address of the exited client. + client_address (str): IP address of the exited client. """ self.lock.acquire() - self.status['clients'].pop(client_address) + if client_address in self.status['clients']: + self.status['clients'].pop(client_address) self.lock.release() def get_status_info(self): diff --git a/parl/framework/algorithm_base.py b/parl/remote/compatible_trick.py similarity index 53% rename from parl/framework/algorithm_base.py rename to parl/remote/compatible_trick.py index 2499c639077107d4c16387a1941b8252dd6a84fa..e61ade0c50af1dd51160bfbb149672c55ca20a29 100644 --- a/parl/framework/algorithm_base.py +++ b/parl/remote/compatible_trick.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,14 +11,27 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. +""" +This file is used to fix the problem that cloudpickle cannot load some packages normally on Mac OS. +We work around the problem by trying to load these packages in the main module in advance. -import warnings +Template: -warnings.simplefilter('default') +try: + import [PACKAGE1] +except ImportError: + pass -warnings.warn( - "module `parl.framework.algorithm_base.Algorithm` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.Algorithm` instead.", - DeprecationWarning, - stacklevel=2) +try: + import [PACKAGE2] +except ImportError: + pass -from parl.core.fluid.algorithm import * +""" +from parl.utils import _IS_MAC + +if _IS_MAC: + try: + import rlschool + except ImportError: + pass diff --git a/parl/remote/job.py b/parl/remote/job.py index 00840c088ace82ad22f87d5e8e1433691c2143e4..d835e5389aa447bb69567b61f6f1c60b9cf99d58 100644 --- a/parl/remote/job.py +++ b/parl/remote/job.py @@ -12,6 +12,9 @@ # See the License for the specific language governing permissions and # limitations under the License. +# Fix known cloudpickle compatibility problems.
+import compatible_trick + import os os.environ['CUDA_VISIBLE_DEVICES'] = '' os.environ['XPARL'] = 'True' @@ -33,6 +36,7 @@ from parl.utils.communication import loads_argument, loads_return,\ from parl.remote import remote_constants from parl.utils.exceptions import SerializeError, DeserializeError from parl.remote.message import InitializedJob +from parl.remote.utils import load_remote_class, redirect_stdout_to_file class Job(object): @@ -44,7 +48,7 @@ class Job(object): """ - def __init__(self, worker_address): + def __init__(self, worker_address, log_server_address): """ Args: worker_address(str): worker_address for sending job information(e.g, pid) @@ -56,16 +60,21 @@ class Job(object): self.max_memory = None self.job_address_receiver, job_address_sender = Pipe() + self.job_id_receiver, job_id_sender = Pipe() self.worker_address = worker_address + self.log_server_address = log_server_address self.job_ip = get_ip_address() self.pid = os.getpid() - self.lock = threading.Lock() self.run_job_process = Process( - target=self.run, args=(job_address_sender, )) + target=self.run, args=(job_address_sender, job_id_sender)) self.run_job_process.start() - + """ + NOTE: + In Windows, it will raise errors when creating threading.Lock before starting multiprocess.Process. + """ + self.lock = threading.Lock() self._create_sockets() process = psutil.Process(self.pid) @@ -81,7 +90,7 @@ class Job(object): _ = self.kill_job_socket.recv_multipart() except zmq.error.Again as e: pass - os._exit(1) + os._exit(0) def _create_sockets(self): """Create five sockets for each job in main process. @@ -95,6 +104,7 @@ class Job(object): """ # wait for another process to create reply socket self.job_address = self.job_address_receiver.recv() + self.job_id = self.job_id_receiver.recv() self.ctx = zmq.Context() # create the job_socket @@ -128,7 +138,8 @@ class Job(object): # sends job information to the worker initialized_job = InitializedJob( self.job_address, worker_heartbeat_address, - client_heartbeat_address, ping_heartbeat_address, None, self.pid) + client_heartbeat_address, ping_heartbeat_address, None, self.pid, + self.job_id, self.log_server_address) self.job_socket.send_multipart( [remote_constants.NORMAL_TAG, cloudpickle.dumps(initialized_job)]) @@ -237,7 +248,7 @@ class Job(object): the python files to the job. Later, the job will save these files to a temporary directory and add the temporary diretory to Python's working directory. - + Args: reply_socket (sockert): main socket to accept commands of remote object. job_address (String): address of reply_socket. @@ -262,12 +273,15 @@ class Job(object): # create directory (i.e. 
./rom_files/) if '/' in file: try: - os.makedirs(os.path.join(*file.rsplit('/')[:-1])) + sep = os.sep + recursive_dirs = os.path.join(*(file.split(sep)[:-1])) + recursive_dirs = os.path.join(envdir, recursive_dirs) + os.makedirs(recursive_dirs) except OSError as e: pass + file = os.path.join(envdir, file) with open(file, 'wb') as f: f.write(content) - logger.info('[job] reply') reply_socket.send_multipart([remote_constants.NORMAL_TAG]) return envdir else: @@ -295,9 +309,15 @@ class Job(object): if tag == remote_constants.INIT_OBJECT_TAG: try: - cls = cloudpickle.loads(message[1]) + file_name, class_name, end_of_file = cloudpickle.loads( + message[1]) + #/home/nlp-ol/Firework/baidu/nlp/evokit/python_api/es_agent -> es_agent + file_name = file_name.split(os.sep)[-1] + cls = load_remote_class(file_name, class_name, end_of_file) args, kwargs = cloudpickle.loads(message[2]) - obj = cls(*args, **kwargs) + logfile_path = os.path.join(self.log_dir, 'stdout.log') + with redirect_stdout_to_file(logfile_path): + obj = cls(*args, **kwargs) except Exception as e: traceback_str = str(traceback.format_exc()) error_str = str(e) @@ -318,7 +338,7 @@ class Job(object): return obj - def run(self, job_address_sender): + def run(self, job_address_sender, job_id_sender): """An infinite loop waiting for a new task. Args: @@ -333,19 +353,28 @@ class Job(object): job_ip = get_ip_address() job_address = "{}:{}".format(job_ip, job_port) + job_id = job_address.replace(':', '_') + '_' + str(int(time.time())) + self.log_dir = os.path.expanduser('~/.parl_data/job/{}'.format(job_id)) + logger.set_dir(self.log_dir) + logger.info( + "[Job] Job {} initialized. Reply heartbeat socket Address: {}.". + format(job_id, job_address)) + job_address_sender.send(job_address) + job_id_sender.send(job_id) try: # receive source code from the actor and append them to the environment variables. envdir = self.wait_for_files(reply_socket, job_address) - sys.path.append(envdir) + sys.path.insert(0, envdir) + os.chdir(envdir) obj = self.wait_for_connection(reply_socket) assert obj is not None self.single_task(obj, reply_socket, job_address) except Exception as e: logger.error( - "Error occurs when running a single task. We will reset this job. Reason:{}" + "Error occurs when running a single task. We will reset this job. 
\nReason:{}" .format(e)) traceback_str = str(traceback.format_exc()) logger.error("traceback:\n{}".format(traceback_str)) @@ -376,7 +405,12 @@ class Job(object): function_name = to_str(message[1]) data = message[2] args, kwargs = loads_argument(data) - ret = getattr(obj, function_name)(*args, **kwargs) + + # Redirect stdout to stdout.log temporarily + logfile_path = os.path.join(self.log_dir, 'stdout.log') + with redirect_stdout_to_file(logfile_path): + ret = getattr(obj, function_name)(*args, **kwargs) + ret = dumps_return(ret) reply_socket.send_multipart( @@ -435,5 +469,10 @@ if __name__ == "__main__": parser = argparse.ArgumentParser() parser.add_argument( "--worker_address", required=True, type=str, help="worker_address") + parser.add_argument( + "--log_server_address", + required=True, + type=str, + help="log_server_address, address of the log web server on worker") args = parser.parse_args() - job = Job(args.worker_address) + job = Job(args.worker_address, args.log_server_address) diff --git a/parl/remote/log_server.py b/parl/remote/log_server.py new file mode 100644 index 0000000000000000000000000000000000000000..f3ad1cf882311b9bdffcd990e7b33ddff0711bc2 --- /dev/null +++ b/parl/remote/log_server.py @@ -0,0 +1,102 @@ +# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
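+#
+# Usage sketch (illustrative only, not part of the server): a client can query
+# the two endpoints defined below over HTTP. Host, port and job_id here are
+# example values.
+#
+#     import requests
+#     r = requests.get("http://localhost:8000/get-log",
+#                      params={"job_id": "172.18.0.2_8001_1577000000"})
+#     print(r.json()["log"])        # newest LINE_NUM lines of stdout.log
+#     r = requests.get("http://localhost:8000/download-log",
+#                      params={"job_id": "172.18.0.2_8001_1577000000"})
+#     open("job_stdout.log", "wb").write(r.content)   # full log file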
+ +import argparse +import linecache +import os + +from flask import Flask, current_app, jsonify, make_response, request, send_file +from flask_cors import CORS + +app = Flask(__name__) +CORS(app) + + +@app.route( + "/get-log", methods=[ + 'GET', + ]) +def get_log(): + ''' + args: + job_id: id of the remote job + response: + log: newest `LINE_NUM` lines of the log file + ''' + try: + job_id = request.args['job_id'] + except: + return make_response( + jsonify(message="No job_id provided, please check your request."), + 400) + + log_dir = current_app.config.get('LOG_DIR') + log_dir = os.path.expanduser(log_dir) + log_file_path = os.path.join(log_dir, job_id, 'stdout.log') + if not os.path.isfile(log_file_path): + return make_response( + jsonify(message="Log not exsits, please check your job_id"), 400) + else: + line_num = current_app.config.get('LINE_NUM') + linecache.checkcache(log_file_path) + log_content = ''.join(linecache.getlines(log_file_path)[-line_num:]) + return make_response( + jsonify(message="Log exsits, content in log", log=log_content), + 200) + + +@app.route( + '/download-log', methods=[ + 'GET', + ]) +def download_log(): + ''' + args: + job_id: the id of the remote job + response: + log: log file + ''' + try: + job_id = request.args['job_id'] + except: + return make_response( + jsonify(message="No job_id provided, please check your request."), + 400) + log_dir = current_app.config.get('LOG_DIR') + log_dir = os.path.expanduser(log_dir) + log_file_path = os.path.join(log_dir, job_id, 'stdout.log') + if not os.path.isfile(log_file_path): + return make_response( + jsonify(message="Log not exsits, please check your job_id"), 400) + else: + return send_file(log_file_path, as_attachment=True) + + +if __name__ == "__main__": + import logging + log = logging.getLogger('werkzeug') + log.disabled = True + + parser = argparse.ArgumentParser() + parser.add_argument('--port', required=True, type=int) + parser.add_argument('--log_dir', required=True, type=str) + parser.add_argument('--line_num', required=True, type=int) + args = parser.parse_args() + + app.config.from_mapping( + LOG_DIR=args.log_dir, + LINE_NUM=args.line_num, + ) + + app.run(host="0.0.0.0", port=args.port) diff --git a/parl/remote/master.py b/parl/remote/master.py index a5d09adb59b280dcb04bf8cbd6f99b8f6e5e7845..8cca0290a7ad68407026f2e24c4613da83af56a3 100644 --- a/parl/remote/master.py +++ b/parl/remote/master.py @@ -57,11 +57,12 @@ class Master(object): port: The ip port that the master node binds to. """ - def __init__(self, port): + def __init__(self, port, monitor_port=None): self.ctx = zmq.Context() self.master_ip = get_ip_address() + self.monitor_url = "http://{}:{}".format(self.master_ip, monitor_port) logger.set_dir( - os.path.expanduser('~/.parl_data/master/{}:{}'.format( + os.path.expanduser('~/.parl_data/master/{}_{}'.format( self.master_ip, port))) self.client_socket = self.ctx.socket(zmq.REP) self.client_socket.bind("tcp://*:{}".format(port)) @@ -135,7 +136,7 @@ class Master(object): except zmq.error.Again as e: client_is_alive = False - self.cluster_monitor.drop_cluster_status( + self.cluster_monitor.drop_client_status( client_heartbeat_address) logger.warning("[Master] cannot connect to the client " + "{}. 
".format(client_heartbeat_address) + @@ -205,8 +206,11 @@ class Master(object): # a client connects to the master elif tag == remote_constants.CLIENT_CONNECT_TAG: + # `client_heartbeat_address` is the + # `reply_master_heartbeat_address` of the client client_heartbeat_address = to_str(message[1]) client_hostname = to_str(message[2]) + client_id = to_str(message[3]) self.client_hostname[client_heartbeat_address] = client_hostname logger.info( "Client {} is connected.".format(client_heartbeat_address)) @@ -215,11 +219,14 @@ class Master(object): target=self._create_client_monitor, args=(client_heartbeat_address, )) thread.start() - self.client_socket.send_multipart([remote_constants.NORMAL_TAG]) + log_monitor_address = "{}/logs?client_id={}".format( + self.monitor_url, client_id) + self.client_socket.send_multipart( + [remote_constants.NORMAL_TAG, + to_byte(log_monitor_address)]) # a client submits a job to the master elif tag == remote_constants.CLIENT_SUBMIT_TAG: - # check available CPU resources if self.cpu_num: logger.info("Submitting job...") @@ -230,6 +237,9 @@ class Master(object): to_byte(job.client_heartbeat_address), to_byte(job.ping_heartbeat_address), ]) + client_id = to_str(message[2]) + job_info = {job.job_id: job.log_server_address} + self.cluster_monitor.add_client_job(client_id, job_info) self._print_workers() else: self.client_socket.send_multipart([remote_constants.CPU_TAG]) diff --git a/parl/remote/message.py b/parl/remote/message.py index 8be8d4657110011c34cca8702290a9942d225e36..97e5482f9e5a25fe52b6919494f4dde1b21e7d5b 100644 --- a/parl/remote/message.py +++ b/parl/remote/message.py @@ -14,9 +14,15 @@ class InitializedJob(object): - def __init__(self, job_address, worker_heartbeat_address, - client_heartbeat_address, ping_heartbeat_address, - worker_address, pid): + def __init__(self, + job_address, + worker_heartbeat_address, + client_heartbeat_address, + ping_heartbeat_address, + worker_address, + pid, + job_id=None, + log_server_address=None): """ Args: job_address(str): Job address to which the new task connect. 
@@ -35,6 +41,8 @@ class InitializedJob(object): self.worker_address = worker_address self.pid = pid self.is_alive = True + self.job_id = job_id + self.log_server_address = log_server_address class InitializedWorker(object): diff --git a/parl/remote/monitor.py b/parl/remote/monitor.py index 8f5c1d5f1d4b4919b4230f27a7a656a1417f6d23..452888940c4eb8de94632f5adc55a097255e94c0 100644 --- a/parl/remote/monitor.py +++ b/parl/remote/monitor.py @@ -19,7 +19,7 @@ import time import zmq import threading -from flask import Flask, render_template, jsonify +from flask import Flask, render_template, jsonify, request app = Flask(__name__) @@ -42,7 +42,7 @@ class ClusterMonitor(object): def __init__(self, master_address): ctx = zmq.Context() self.socket = ctx.socket(zmq.REQ) - self.socket.setsockopt(zmq.RCVTIMEO, 10000) + self.socket.setsockopt(zmq.RCVTIMEO, 30000) self.socket.connect('tcp://{}'.format(master_address)) self.data = None @@ -81,6 +81,7 @@ class ClusterMonitor(object): data['total_vacant_cpus'] = total_vacant_cpus data['total_cpus'] = total_used_cpus + total_vacant_cpus data['clients'] = list(status['clients'].values()) + data['client_jobs'] = status['client_jobs'] self.data = data time.sleep(10) @@ -99,7 +100,44 @@ def cluster(): return jsonify(data) +@app.route( + '/logs', methods=[ + 'GET', + ]) +def logs(): + client_id = request.args.get('client_id') + return render_template('jobs.html', client_id=client_id) + + +@app.route( + '/get-jobs', methods=[ + 'GET', + ]) +def get_jobs(): + client_id = request.args.get('client_id') + jobs = CLUSTER_MONITOR.get_data()['client_jobs'].get(client_id) + data = [] + if jobs: + for idx, job_id in enumerate(jobs): + monitor_url = jobs[job_id] + data.append({ + "id": + idx, + "job_id": + job_id, + "log_url": + "http://{}/get-log?job_id={}".format(monitor_url, job_id), + "download_url": + "http://{}/download-log?job_id={}".format(monitor_url, job_id), + }) + return jsonify(data) + + if __name__ == "__main__": + import logging + log = logging.getLogger('werkzeug') + log.disabled = True + parser = argparse.ArgumentParser() parser.add_argument('--monitor_port', default=1234, type=int) parser.add_argument('--address', default='localhost:8010', type=str) diff --git a/parl/remote/remote_decorator.py b/parl/remote/remote_decorator.py index 32a463f85b82acb7483ec5a082f12136565326e2..a066abc40832fdce00fd00d1784aa75c60925e00 100644 --- a/parl/remote/remote_decorator.py +++ b/parl/remote/remote_decorator.py @@ -18,6 +18,7 @@ import threading import time import zmq import numpy as np +import inspect from parl.utils import get_ip_address, logger, to_str, to_byte from parl.utils.communication import loads_argument, loads_return,\ @@ -55,7 +56,7 @@ def remote_class(*args, **kwargs): actor = Actor() actor.step() - # Set maximum memory usage to 300 MB for each object. + # Set maximum memory usage to 300 MB for each object. @parl.remote_class(max_memory=300) class LimitedActor(object): ... @@ -74,6 +75,12 @@ def remote_class(*args, **kwargs): """ def decorator(cls): + # we are not going to create a remote actor in job.py + if 'XPARL' in os.environ and os.environ['XPARL'] == 'True': + logger.warning( + "Note: this object will be runnning as a local object") + return cls + class RemoteWrapper(object): """ Wrapper for remote class in client side. 
@@ -113,10 +120,13 @@ def remote_class(*args, **kwargs): self.job_shutdown = False self.send_file(self.job_socket) - + file_name = inspect.getfile(cls)[:-3] + cls_source = inspect.getsourcelines(cls) + end_of_file = cls_source[1] + len(cls_source[0]) + class_name = cls.__name__ self.job_socket.send_multipart([ remote_constants.INIT_OBJECT_TAG, - cloudpickle.dumps(cls), + cloudpickle.dumps([file_name, class_name, end_of_file]), cloudpickle.dumps([args, kwargs]), ]) message = self.job_socket.recv_multipart() @@ -128,6 +138,10 @@ def remote_class(*args, **kwargs): def __del__(self): """Delete the remote class object and release remote resources.""" + try: + self.job_socket.setsockopt(zmq.RCVTIMEO, 1 * 1000) + except AttributeError: + pass if not self.job_shutdown: try: self.job_socket.send_multipart( @@ -138,6 +152,8 @@ def remote_class(*args, **kwargs): pass except zmq.error.ZMQError: pass + except TypeError: + pass def send_file(self, socket): try: @@ -212,6 +228,7 @@ def remote_class(*args, **kwargs): return wrapper + RemoteWrapper._original = cls return RemoteWrapper max_memory = kwargs.get('max_memory') diff --git a/parl/remote/scripts.py b/parl/remote/scripts.py index 71677d692878eef63f65b0ff1054cb6233b0d7a5..51cf3cabf7cb3a3dba0cc3d0d00a7ad55406b4f8 100644 --- a/parl/remote/scripts.py +++ b/parl/remote/scripts.py @@ -18,15 +18,18 @@ import multiprocessing import os import random import re -import socket +import requests import subprocess import sys import time import threading +import tempfile import warnings import zmq from multiprocessing import Process -from parl.utils import get_ip_address, to_str +from parl.utils import (_IS_WINDOWS, get_free_tcp_port, get_ip_address, + get_port_from_range, is_port_available, kill_process, + to_str) from parl.remote.remote_constants import STATUS_TAG # A flag to mark if parl is started from a command line @@ -34,33 +37,18 @@ os.environ['XPARL'] = 'True' # Solve `Click will abort further execution because Python 3 was configured # to use ASCII as encoding for the environment` error. -locale.setlocale(locale.LC_ALL, "en_US.UTF-8") + +if not _IS_WINDOWS: + try: + locale.setlocale(locale.LC_ALL, "en_US.UTF-8") + except: + pass #TODO: this line will cause error in python2/macOS if sys.version_info.major == 3: warnings.simplefilter("ignore", ResourceWarning) -def get_free_tcp_port(): - tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - tcp.bind(('', 0)) - addr, port = tcp.getsockname() - tcp.close() - return str(port) - - -def is_port_available(port): - """ Check if a port is used. - - True if the port is available for connection. 
- """ - port = int(port) - sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) - available = sock.connect_ex(('localhost', port)) - sock.close() - return available - - def is_master_started(address): ctx = zmq.Context() socket = ctx.socket(zmq.REQ) @@ -77,6 +65,33 @@ def is_master_started(address): return False +def parse_port_range(log_server_port_range): + try: + re.match(r'\d*[-]\d*', log_server_port_range).span() + except: + raise Exception( + "The input log_server_port_range should be `start-end` format.") + start, end = map(int, log_server_port_range.split('-')) + if start > end: + raise Exception( + "Start port number must be smaller than the end port number.") + + return start, end + + +def is_log_server_started(ip_address, port): + started = False + for _ in range(3): + try: + r = requests.get("http://{}:{}/get-log".format(ip_address, port)) + if r.status_code == 400: + started = True + break + except: + time.sleep(3) + return started + + @click.group() def cli(): pass @@ -95,7 +110,15 @@ def cli(): "cpus of this machine.") @click.option( "--monitor_port", help="The port to start a cluster monitor.", type=str) -def start_master(port, cpu_num, monitor_port, debug): +@click.option( + "--log_server_port_range", + help=''' + Port range (start-end) of the log server on the worker. Default: 8000-9000. + The worker will pick a random avaliable port in [start, end] for the log server. + ''', + default="8000-9000", + type=str) +def start_master(port, cpu_num, monitor_port, debug, log_server_port_range): if debug: os.environ['DEBUG'] = 'True' @@ -112,19 +135,33 @@ def start_master(port, cpu_num, monitor_port, debug): cpu_num) if cpu_num is not None else multiprocessing.cpu_count() start_file = __file__.replace('scripts.pyc', 'start.py') start_file = start_file.replace('scripts.py', 'start.py') + monitor_file = __file__.replace('scripts.pyc', 'monitor.py') + monitor_file = monitor_file.replace('scripts.py', 'monitor.py') + monitor_port = monitor_port if monitor_port else get_free_tcp_port() + start, end = parse_port_range(log_server_port_range) + log_server_port = get_port_from_range(start, end) + while log_server_port == monitor_port or log_server_port == port: + log_server_port = get_port_from_range(start, end) master_command = [ - sys.executable, start_file, "--name", "master", "--port", port + sys.executable, + start_file, + "--name", + "master", + "--port", + port, + "--monitor_port", + monitor_port, ] worker_command = [ sys.executable, start_file, "--name", "worker", "--address", "localhost:" + str(port), "--cpu_num", - str(cpu_num) + str(cpu_num), '--log_server_port', + str(log_server_port) ] monitor_command = [ - sys.executable, '{}/monitor.py'.format(__file__[:__file__.rfind('/')]), - "--monitor_port", + sys.executable, monitor_file, "--monitor_port", str(monitor_port), "--address", "localhost:" + str(port) ] @@ -133,11 +170,21 @@ def start_master(port, cpu_num, monitor_port, debug): # Redirect the output to DEVNULL to solve the warning log. 
_ = subprocess.Popen( master_command, stdout=FNULL, stderr=subprocess.STDOUT) + if cpu_num > 0: + # Sleep 1s for master ready + time.sleep(1) _ = subprocess.Popen( worker_command, stdout=FNULL, stderr=subprocess.STDOUT) - _ = subprocess.Popen( - monitor_command, stdout=FNULL, stderr=subprocess.STDOUT) + + if _IS_WINDOWS: + # TODO(@zenghsh3) redirecting stdout of monitor subprocess to FNULL will cause occasional failure + tmp_file = tempfile.TemporaryFile() + _ = subprocess.Popen(monitor_command, stdout=tmp_file) + tmp_file.close() + else: + _ = subprocess.Popen( + monitor_command, stdout=FNULL, stderr=subprocess.STDOUT) FNULL.close() if cpu_num > 0: @@ -158,16 +205,20 @@ def start_master(port, cpu_num, monitor_port, debug): click.echo(monitor_info) # check if monitor is started - cmd = r'ps -ef | grep remote/monitor.py\ --monitor_port\ {}\ --address\ localhost:{}'.format( - monitor_port, port) - monitor_is_started = False + if _IS_WINDOWS: + cmd = r'''wmic process where "commandline like '%remote\\monitor.py --monitor_port {} --address localhost:{}%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''.format( + monitor_port, port) + else: + cmd = r'ps -ef | grep -v grep | grep remote/monitor.py\ --monitor_port\ {}\ --address\ localhost:{}'.format( + monitor_port, port) for i in range(3): - check_monitor_is_started = os.popen(cmd).read().strip().split('\n') - if len(check_monitor_is_started) == 2: + check_monitor_is_started = os.popen(cmd).read() + if len(check_monitor_is_started) > 0: monitor_is_started = True break time.sleep(3) + master_ip = get_ip_address() if monitor_is_started: start_info = """ @@ -194,6 +245,9 @@ def start_master(port, cpu_num, monitor_port, debug): """.format(start_info, master_ip, port) click.echo(monitor_info) + if not is_log_server_started(master_ip, log_server_port): + click.echo("# Fail to start the log server.") + @click.command("connect", short_help="Start a worker node.") @click.option( @@ -203,36 +257,53 @@ def start_master(port, cpu_num, monitor_port, debug): type=int, help="Set number of cpu manually. If not set, it will use all " "cpus of this machine.") -def start_worker(address, cpu_num): +@click.option( + "--log_server_port_range", + help=''' + Port range (start-end) of the log server on the worker. Default: 8000-9000. + The worker will pick a random avaliable port in [start, end] for the log server. 
+ ''', + default="8000-9000", + type=str) +def start_worker(address, cpu_num, log_server_port_range): + start, end = parse_port_range(log_server_port_range) + log_server_port = get_port_from_range(start, end) + if not is_master_started(address): raise Exception("Worker can not connect to the master node, " + "please check if the input address {} ".format( address) + "is correct.") cpu_num = str(cpu_num) if cpu_num else '' + start_file = __file__.replace('scripts.pyc', 'start.py') + start_file = start_file.replace('scripts.py', 'start.py') + command = [ - sys.executable, "{}/start.py".format(__file__[:-11]), "--name", - "worker", "--address", address, "--cpu_num", - str(cpu_num) + sys.executable, start_file, "--name", "worker", "--address", address, + "--cpu_num", + str(cpu_num), "--log_server_port", + str(log_server_port) ] p = subprocess.Popen(command) + if not is_log_server_started(get_ip_address(), log_server_port): + click.echo("# Fail to start the log server.") + @click.command("stop", help="Exit the cluster.") def stop(): - command = ( - "ps aux | grep remote/start.py | awk '{print $2}' | xargs kill -9") - subprocess.call([command], shell=True) - command = ( - "ps aux | grep remote/job.py | awk '{print $2}' | xargs kill -9") - subprocess.call([command], shell=True) - command = ( - "ps aux | grep remote/monitor.py | awk '{print $2}' | xargs kill -9") - subprocess.call([command], shell=True) + kill_process('remote/start.py') + kill_process('remote/job.py') + kill_process('remote/monitor.py') + kill_process('remote/log_server.py') @click.command("status") def status(): - cmd = r'ps -ef | grep remote/start.py\ --name\ worker\ --address' + if _IS_WINDOWS: + cmd = r'''wmic process where "commandline like '%remote\\start.py --name worker --address%'" get commandline /format:list | findstr /V wmic | findstr CommandLine=''' + else: + cmd = r'ps -ef | grep remote/start.py\ --name\ worker\ --address' + content = os.popen(cmd).read().strip() pattern = re.compile('--address (.*?) 
--cpu') clusters = set(pattern.findall(content)) @@ -242,7 +313,11 @@ def status(): ctx = zmq.Context() status = [] for cluster in clusters: - cmd = r'ps -ef | grep address\ {}'.format(cluster) + if _IS_WINDOWS: + cmd = r'''wmic process where "commandline like '%address {}%'" get commandline /format:list | findstr /V wmic | findstr CommandLine='''.format( + cluster) + else: + cmd = r'ps -ef | grep address\ {}'.format(cluster) content = os.popen(cmd).read() pattern = re.compile('--monitor_port (.*?)\n', re.S) monitors = pattern.findall(content) diff --git a/parl/remote/start.py b/parl/remote/start.py index d9aa231db65a04ee410d7df6660d9eaa75150828..83c8dca86a726b8c77307b0e080829ae67bc186a 100644 --- a/parl/remote/start.py +++ b/parl/remote/start.py @@ -28,13 +28,15 @@ def main(args): if args.name == 'master': port = args.port - master = Master(port) + monitor_port = args.monitor_port + master = Master(port, monitor_port) master.run() elif args.name == 'worker': address = args.address + log_server_port = args.log_server_port cpu_num = int(args.cpu_num) if args.cpu_num else None - worker = Worker(address, cpu_num) + worker = Worker(address, cpu_num, log_server_port) worker.run() else: @@ -48,5 +50,7 @@ if __name__ == "__main__": parser.add_argument('--port', default='1234', type=str) parser.add_argument('--address', default='localhost:1234', type=str) parser.add_argument('--cpu_num', default='', type=str) + parser.add_argument('--monitor_port', default='', type=str) + parser.add_argument('--log_server_port', default='', type=str) args = parser.parse_args() main(args) diff --git a/parl/remote/static/js/ansi_up.js b/parl/remote/static/js/ansi_up.js new file mode 100644 index 0000000000000000000000000000000000000000..b207399e24887a4d2b13e03482f98f16b2137cf0 --- /dev/null +++ b/parl/remote/static/js/ansi_up.js @@ -0,0 +1,421 @@ +/* ansi_up.js + * author : Dru Nelson + * license : MIT + * http://github.com/drudru/ansi_up + */ +(function (root, factory) { + if (typeof define === 'function' && define.amd) { + // AMD. Register as an anonymous module. 
+ define(['exports'], factory); + } else if (typeof exports === 'object' && typeof exports.nodeName !== 'string') { + // CommonJS + factory(exports); + } else { + // Browser globals + var exp = {}; + factory(exp); + root.AnsiUp = exp.default; + } +}(this, function (exports) { +"use strict"; +var __makeTemplateObject = (this && this.__makeTemplateObject) || function (cooked, raw) { + if (Object.defineProperty) { Object.defineProperty(cooked, "raw", { value: raw }); } else { cooked.raw = raw; } + return cooked; +}; +var PacketKind; +(function (PacketKind) { + PacketKind[PacketKind["EOS"] = 0] = "EOS"; + PacketKind[PacketKind["Text"] = 1] = "Text"; + PacketKind[PacketKind["Incomplete"] = 2] = "Incomplete"; + PacketKind[PacketKind["ESC"] = 3] = "ESC"; + PacketKind[PacketKind["Unknown"] = 4] = "Unknown"; + PacketKind[PacketKind["SGR"] = 5] = "SGR"; + PacketKind[PacketKind["OSCURL"] = 6] = "OSCURL"; +})(PacketKind || (PacketKind = {})); +var AnsiUp = (function () { + function AnsiUp() { + this.VERSION = "4.0.3"; + this.setup_palettes(); + this._use_classes = false; + this._escape_for_html = true; + this.bold = false; + this.fg = this.bg = null; + this._buffer = ''; + this._url_whitelist = { 'http': 1, 'https': 1 }; + } + Object.defineProperty(AnsiUp.prototype, "use_classes", { + get: function () { + return this._use_classes; + }, + set: function (arg) { + this._use_classes = arg; + }, + enumerable: true, + configurable: true + }); + Object.defineProperty(AnsiUp.prototype, "escape_for_html", { + get: function () { + return this._escape_for_html; + }, + set: function (arg) { + this._escape_for_html = arg; + }, + enumerable: true, + configurable: true + }); + Object.defineProperty(AnsiUp.prototype, "url_whitelist", { + get: function () { + return this._url_whitelist; + }, + set: function (arg) { + this._url_whitelist = arg; + }, + enumerable: true, + configurable: true + }); + AnsiUp.prototype.setup_palettes = function () { + var _this = this; + this.ansi_colors = + [ + [ + { rgb: [0, 0, 0], class_name: "ansi-black" }, + { rgb: [187, 0, 0], class_name: "ansi-red" }, + { rgb: [0, 187, 0], class_name: "ansi-green" }, + { rgb: [187, 187, 0], class_name: "ansi-yellow" }, + { rgb: [0, 0, 187], class_name: "ansi-blue" }, + { rgb: [187, 0, 187], class_name: "ansi-magenta" }, + { rgb: [0, 187, 187], class_name: "ansi-cyan" }, + { rgb: [255, 255, 255], class_name: "ansi-white" } + ], + [ + { rgb: [85, 85, 85], class_name: "ansi-bright-black" }, + { rgb: [255, 85, 85], class_name: "ansi-bright-red" }, + { rgb: [0, 255, 0], class_name: "ansi-bright-green" }, + { rgb: [255, 255, 85], class_name: "ansi-bright-yellow" }, + { rgb: [85, 85, 255], class_name: "ansi-bright-blue" }, + { rgb: [255, 85, 255], class_name: "ansi-bright-magenta" }, + { rgb: [85, 255, 255], class_name: "ansi-bright-cyan" }, + { rgb: [255, 255, 255], class_name: "ansi-bright-white" } + ] + ]; + this.palette_256 = []; + this.ansi_colors.forEach(function (palette) { + palette.forEach(function (rec) { + _this.palette_256.push(rec); + }); + }); + var levels = [0, 95, 135, 175, 215, 255]; + for (var r = 0; r < 6; ++r) { + for (var g = 0; g < 6; ++g) { + for (var b = 0; b < 6; ++b) { + var col = { rgb: [levels[r], levels[g], levels[b]], class_name: 'truecolor' }; + this.palette_256.push(col); + } + } + } + var grey_level = 8; + for (var i = 0; i < 24; ++i, grey_level += 10) { + var gry = { rgb: [grey_level, grey_level, grey_level], class_name: 'truecolor' }; + this.palette_256.push(gry); + } + }; + AnsiUp.prototype.escape_txt_for_html = function 
(txt) { + return txt.replace(/[&<>]/gm, function (str) { + if (str === "&") + return "&"; + if (str === "<") + return "<"; + if (str === ">") + return ">"; + }); + }; + AnsiUp.prototype.append_buffer = function (txt) { + var str = this._buffer + txt; + this._buffer = str; + }; + AnsiUp.prototype.get_next_packet = function () { + var pkt = { + kind: PacketKind.EOS, + text: '', + url: '' + }; + var len = this._buffer.length; + if (len == 0) + return pkt; + var pos = this._buffer.indexOf("\x1B"); + if (pos == -1) { + pkt.kind = PacketKind.Text; + pkt.text = this._buffer; + this._buffer = ''; + return pkt; + } + if (pos > 0) { + pkt.kind = PacketKind.Text; + pkt.text = this._buffer.slice(0, pos); + this._buffer = this._buffer.slice(pos); + return pkt; + } + if (pos == 0) { + if (len == 1) { + pkt.kind = PacketKind.Incomplete; + return pkt; + } + var next_char = this._buffer.charAt(1); + if ((next_char != '[') && (next_char != ']')) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + if (next_char == '[') { + if (!this._csi_regex) { + this._csi_regex = rgx(__makeTemplateObject(["\n ^ # beginning of line\n #\n # First attempt\n (?: # legal sequence\n \u001B[ # CSI\n ([<-?]?) # private-mode char\n ([d;]*) # any digits or semicolons\n ([ -/]? # an intermediate modifier\n [@-~]) # the command\n )\n | # alternate (second attempt)\n (?: # illegal sequence\n \u001B[ # CSI\n [ -~]* # anything legal\n ([\0-\u001F:]) # anything illegal\n )\n "], ["\n ^ # beginning of line\n #\n # First attempt\n (?: # legal sequence\n \\x1b\\[ # CSI\n ([\\x3c-\\x3f]?) # private-mode char\n ([\\d;]*) # any digits or semicolons\n ([\\x20-\\x2f]? # an intermediate modifier\n [\\x40-\\x7e]) # the command\n )\n | # alternate (second attempt)\n (?: # illegal sequence\n \\x1b\\[ # CSI\n [\\x20-\\x7e]* # anything legal\n ([\\x00-\\x1f:]) # anything illegal\n )\n "])); + } + var match = this._buffer.match(this._csi_regex); + if (match === null) { + pkt.kind = PacketKind.Incomplete; + return pkt; + } + if (match[4]) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + if ((match[1] != '') || (match[3] != 'm')) + pkt.kind = PacketKind.Unknown; + else + pkt.kind = PacketKind.SGR; + pkt.text = match[2]; + var rpos = match[0].length; + this._buffer = this._buffer.slice(rpos); + return pkt; + } + if (next_char == ']') { + if (len < 4) { + pkt.kind = PacketKind.Incomplete; + return pkt; + } + if ((this._buffer.charAt(2) != '8') + || (this._buffer.charAt(3) != ';')) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + if (!this._osc_st) { + this._osc_st = rgxG(__makeTemplateObject(["\n (?: # legal sequence\n (\u001B\\) # ESC | # alternate\n (\u0007) # BEL (what xterm did)\n )\n | # alternate (second attempt)\n ( # illegal sequence\n [\0-\u0006] # anything illegal\n | # alternate\n [\b-\u001A] # anything illegal\n | # alternate\n [\u001C-\u001F] # anything illegal\n )\n "], ["\n (?: # legal sequence\n (\\x1b\\\\) # ESC \\\n | # alternate\n (\\x07) # BEL (what xterm did)\n )\n | # alternate (second attempt)\n ( # illegal sequence\n [\\x00-\\x06] # anything illegal\n | # alternate\n [\\x08-\\x1a] # anything illegal\n | # alternate\n [\\x1c-\\x1f] # anything illegal\n )\n "])); + } + this._osc_st.lastIndex = 0; + { + var match_1 = this._osc_st.exec(this._buffer); + if (match_1 === null) { + pkt.kind = 
PacketKind.Incomplete; + return pkt; + } + if (match_1[3]) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + } + { + var match_2 = this._osc_st.exec(this._buffer); + if (match_2 === null) { + pkt.kind = PacketKind.Incomplete; + return pkt; + } + if (match_2[3]) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + } + if (!this._osc_regex) { + this._osc_regex = rgx(__makeTemplateObject(["\n ^ # beginning of line\n #\n \u001B]8; # OSC Hyperlink\n [ -:<-~]* # params (excluding ;)\n ; # end of params\n ([!-~]{0,512}) # URL capture\n (?: # ST\n (?:\u001B\\) # ESC | # alternate\n (?:\u0007) # BEL (what xterm did)\n )\n ([!-~]+) # TEXT capture\n \u001B]8;; # OSC Hyperlink End\n (?: # ST\n (?:\u001B\\) # ESC | # alternate\n (?:\u0007) # BEL (what xterm did)\n )\n "], ["\n ^ # beginning of line\n #\n \\x1b\\]8; # OSC Hyperlink\n [\\x20-\\x3a\\x3c-\\x7e]* # params (excluding ;)\n ; # end of params\n ([\\x21-\\x7e]{0,512}) # URL capture\n (?: # ST\n (?:\\x1b\\\\) # ESC \\\n | # alternate\n (?:\\x07) # BEL (what xterm did)\n )\n ([\\x21-\\x7e]+) # TEXT capture\n \\x1b\\]8;; # OSC Hyperlink End\n (?: # ST\n (?:\\x1b\\\\) # ESC \\\n | # alternate\n (?:\\x07) # BEL (what xterm did)\n )\n "])); + } + var match = this._buffer.match(this._osc_regex); + if (match === null) { + pkt.kind = PacketKind.ESC; + pkt.text = this._buffer.slice(0, 1); + this._buffer = this._buffer.slice(1); + return pkt; + } + pkt.kind = PacketKind.OSCURL; + pkt.url = match[1]; + pkt.text = match[2]; + var rpos = match[0].length; + this._buffer = this._buffer.slice(rpos); + return pkt; + } + } + }; + AnsiUp.prototype.ansi_to_html = function (txt) { + this.append_buffer(txt); + var blocks = []; + while (true) { + var packet = this.get_next_packet(); + if ((packet.kind == PacketKind.EOS) + || (packet.kind == PacketKind.Incomplete)) + break; + if ((packet.kind == PacketKind.ESC) + || (packet.kind == PacketKind.Unknown)) + continue; + if (packet.kind == PacketKind.Text) + blocks.push(this.transform_to_html(this.with_state(packet))); + else if (packet.kind == PacketKind.SGR) + this.process_ansi(packet); + else if (packet.kind == PacketKind.OSCURL) + blocks.push(this.process_hyperlink(packet)); + } + return blocks.join(""); + }; + AnsiUp.prototype.with_state = function (pkt) { + return { bold: this.bold, fg: this.fg, bg: this.bg, text: pkt.text }; + }; + AnsiUp.prototype.process_ansi = function (pkt) { + var sgr_cmds = pkt.text.split(';'); + while (sgr_cmds.length > 0) { + var sgr_cmd_str = sgr_cmds.shift(); + var num = parseInt(sgr_cmd_str, 10); + if (isNaN(num) || num === 0) { + this.fg = this.bg = null; + this.bold = false; + } + else if (num === 1) { + this.bold = true; + } + else if (num === 22) { + this.bold = false; + } + else if (num === 39) { + this.fg = null; + } + else if (num === 49) { + this.bg = null; + } + else if ((num >= 30) && (num < 38)) { + this.fg = this.ansi_colors[0][(num - 30)]; + } + else if ((num >= 40) && (num < 48)) { + this.bg = this.ansi_colors[0][(num - 40)]; + } + else if ((num >= 90) && (num < 98)) { + this.fg = this.ansi_colors[1][(num - 90)]; + } + else if ((num >= 100) && (num < 108)) { + this.bg = this.ansi_colors[1][(num - 100)]; + } + else if (num === 38 || num === 48) { + if (sgr_cmds.length > 0) { + var is_foreground = (num === 38); + var mode_cmd = sgr_cmds.shift(); + if (mode_cmd === '5' && sgr_cmds.length > 0) { + var palette_index = 
parseInt(sgr_cmds.shift(), 10); + if (palette_index >= 0 && palette_index <= 255) { + if (is_foreground) + this.fg = this.palette_256[palette_index]; + else + this.bg = this.palette_256[palette_index]; + } + } + if (mode_cmd === '2' && sgr_cmds.length > 2) { + var r = parseInt(sgr_cmds.shift(), 10); + var g = parseInt(sgr_cmds.shift(), 10); + var b = parseInt(sgr_cmds.shift(), 10); + if ((r >= 0 && r <= 255) && (g >= 0 && g <= 255) && (b >= 0 && b <= 255)) { + var c = { rgb: [r, g, b], class_name: 'truecolor' }; + if (is_foreground) + this.fg = c; + else + this.bg = c; + } + } + } + } + } + }; + AnsiUp.prototype.transform_to_html = function (fragment) { + var txt = fragment.text; + if (txt.length === 0) + return txt; + if (this._escape_for_html) + txt = this.escape_txt_for_html(txt); + if (!fragment.bold && fragment.fg === null && fragment.bg === null) + return txt; + var styles = []; + var classes = []; + var fg = fragment.fg; + var bg = fragment.bg; + if (fragment.bold) + styles.push('font-weight:bold'); + if (!this._use_classes) { + if (fg) + styles.push("color:rgb(" + fg.rgb.join(',') + ")"); + if (bg) + styles.push("background-color:rgb(" + bg.rgb + ")"); + } + else { + if (fg) { + if (fg.class_name !== 'truecolor') { + classes.push(fg.class_name + "-fg"); + } + else { + styles.push("color:rgb(" + fg.rgb.join(',') + ")"); + } + } + if (bg) { + if (bg.class_name !== 'truecolor') { + classes.push(bg.class_name + "-bg"); + } + else { + styles.push("background-color:rgb(" + bg.rgb.join(',') + ")"); + } + } + } + var class_string = ''; + var style_string = ''; + if (classes.length) + class_string = " class=\"" + classes.join(' ') + "\""; + if (styles.length) + style_string = " style=\"" + styles.join(';') + "\""; + return "" + txt + ""; + }; + ; + AnsiUp.prototype.process_hyperlink = function (pkt) { + var parts = pkt.url.split(':'); + if (parts.length < 1) + return ''; + if (!this._url_whitelist[parts[0]]) + return ''; + var result = "" + this.escape_txt_for_html(pkt.text) + ""; + return result; + }; + return AnsiUp; +}()); +function rgx(tmplObj) { + var subst = []; + for (var _i = 1; _i < arguments.length; _i++) { + subst[_i - 1] = arguments[_i]; + } + var regexText = tmplObj.raw[0]; + var wsrgx = /^\s+|\s+\n|\s*#[\s\S]*?\n|\n/gm; + var txt2 = regexText.replace(wsrgx, ''); + return new RegExp(txt2); +} +function rgxG(tmplObj) { + var subst = []; + for (var _i = 1; _i < arguments.length; _i++) { + subst[_i - 1] = arguments[_i]; + } + var regexText = tmplObj.raw[0]; + var wsrgx = /^\s+|\s+\n|\s*#[\s\S]*?\n|\n/gm; + var txt2 = regexText.replace(wsrgx, ''); + return new RegExp(txt2, 'g'); +} +//# sourceMappingURL=ansi_up.js.map + Object.defineProperty(exports, "__esModule", { value: true }); + exports.default = AnsiUp; +})); diff --git a/parl/remote/static/js/jquery.ajax-cross-origin.min.js b/parl/remote/static/js/jquery.ajax-cross-origin.min.js new file mode 100644 index 0000000000000000000000000000000000000000..cd57dcff2843d0d298fa36851aefedb7984ebe1e --- /dev/null +++ b/parl/remote/static/js/jquery.ajax-cross-origin.min.js @@ -0,0 +1,57 @@ +/* + jQuery AJAX Cross Origin v1.3 (http://www.ajax-cross-origin.com) + jQuery plugin to bypass Same-origin_policy using Google Apps Script. + + references: + http://en.wikipedia.org/wiki/Same-origin_policy + http://www.google.com/script/start/ + + (c) 2014, Writen by Erez Ninio. site: www.dealhotelbook.com + + Licensed under the Creative Commons Attribution 3.0 Unported License. + For details, see http://creativecommons.org/licenses/by/3.0/. 
+*/ + +var proxyJsonp = + "https://script.google.com/macros/s/AKfycbwmqG55tt2d2FcT_WQ3WjCSKmtyFpkOcdprSITn45-4UgVJnzp9/exec"; +jQuery.ajaxOrig = jQuery.ajax; +jQuery.ajax = function (a, b) { + function d(a) { + a = encodeURI(a).replace(/&/g, "%26"); + return proxyJsonp + "?url=" + a + "&callback=?"; + } + var c = "object" === typeof a ? a : b || {}; + c.url = c.url || ("string" === typeof a ? a : ""); + var c = jQuery.ajaxSetup({}, c), + e = (function (a, c) { + var b = document.createElement("a"); + b.href = a; + return ( + c.crossOrigin && + "http" == a.substr(0, 4).toLowerCase() && + "localhost" != b.hostname && + "127.0.0.1" != b.hostname && + b.hostname != window.location.hostname + ); + })(c.url, c); + c.proxy && + 0 < c.proxy.length && + ((proxyJsonp = c.proxy), + "object" === typeof a + ? (a.crossDomain = !0) + : "object" === typeof b && (b.crossDomain = !0)); + e && + ("object" === typeof a + ? a.url && + ((a.url = d(a.url)), + a.charset && (a.url += "&charset=" + a.charset), + (a.dataType = "json")) + : "string" === typeof a && + "object" === typeof b && + ((a = d(a)), + b.charset && (a += "&charset=" + b.charset), + (b.dataType = "json"))); + return jQuery.ajaxOrig.apply(this, arguments); +}; +jQuery.ajax.prototype = new jQuery.ajaxOrig(); +jQuery.ajax.prototype.constructor = jQuery.ajax; diff --git a/parl/remote/static/js/parl.js b/parl/remote/static/js/parl.js index 117e2d5542e69213a0b4ae7e04d5b6c6533006c8..e158e69917f969c62b6be2daf4b43176f0674ba7 100644 --- a/parl/remote/static/js/parl.js +++ b/parl/remote/static/js/parl.js @@ -185,7 +185,8 @@ function autoTable(res) { var s3 = `${res.clients[i].client_address}`; var s4 = `${res.clients[i].actor_num}`; var s5 = `${res.clients[i].time}`; - tr.innerHTML = s1 + s2 + s3 + s4 + s5; + var s6 = `link`; + tr.innerHTML = s1 + s2 + s3 + s4 + s5 + s6; table.appendChild(tr); } }; diff --git a/parl/remote/templates/clients.html b/parl/remote/templates/clients.html index b87962f11d1a41d649ec953d426d418be0b2baf1..e0089b6422bb4a5af43372d3962adbb9303218af 100644 --- a/parl/remote/templates/clients.html +++ b/parl/remote/templates/clients.html @@ -43,10 +43,11 @@ Hostname Actor Num Time (min) + Log - Loading Data... + Loading Data... diff --git a/parl/remote/templates/jobs.html b/parl/remote/templates/jobs.html new file mode 100644 index 0000000000000000000000000000000000000000..56e8a775a5dd6eb86e17bfc61fedf660e97d7ff0 --- /dev/null +++ b/parl/remote/templates/jobs.html @@ -0,0 +1,192 @@ + + + + + Parl Cluster + + + + + + + + + + +
+<!-- jobs.html template (markup not reproduced): renders a "Jobs Monitor"
+     page containing a "Remote Job Log" section and the line
+     "Client ID: {{ client_id }}". -->
+ + + + + + + diff --git a/parl/remote/tests/actor_max_memory_test.py b/parl/remote/tests/actor_max_memory_test.py index ebe7f35d5c2c3a978bb8c257596797c96503ee35..1619651521b9c2c3fd7ece441ad9296d2ab1e852 100644 --- a/parl/remote/tests/actor_max_memory_test.py +++ b/parl/remote/tests/actor_max_memory_test.py @@ -45,7 +45,10 @@ class TestMaxMemory(unittest.TestCase): def tearDown(self): disconnect() - def actor(self): + #In windows, multiprocessing.Process cannot run the method of class, but static method is ok. + @staticmethod + def actor(cluster_addr): + parl.connect(cluster_addr) actor1 = Actor() time.sleep(10) actor1.add_500mb() @@ -56,16 +59,17 @@ class TestMaxMemory(unittest.TestCase): th = threading.Thread(target=master.run) th.start() time.sleep(5) - worker = Worker('localhost:{}'.format(port), 1) - cluster_monitor = ClusterMonitor('localhost:{}'.format(port)) + cluster_addr = 'localhost:{}'.format(port) + worker = Worker(cluster_addr, 1) + cluster_monitor = ClusterMonitor(cluster_addr) time.sleep(5) - parl.connect('localhost:{}'.format(port)) + parl.connect(cluster_addr) actor = Actor() time.sleep(20) self.assertEqual(1, cluster_monitor.data['clients'][0]['actor_num']) del actor time.sleep(10) - p = Process(target=self.actor) + p = Process(target=self.actor, args=(cluster_addr, )) p.start() for _ in range(6): diff --git a/parl/remote/tests/cluster_monitor_2_test.py b/parl/remote/tests/cluster_monitor_2_test.py index 16dd24bd0471700b8a391050f6c51eae7c34dfdf..f27ee587b908093365426d7154f23d3adeccd721 100644 --- a/parl/remote/tests/cluster_monitor_2_test.py +++ b/parl/remote/tests/cluster_monitor_2_test.py @@ -22,7 +22,6 @@ import time import threading from parl.remote.client import disconnect from parl.remote import exceptions -import timeout_decorator import subprocess diff --git a/parl/remote/tests/cluster_monitor_3_test.py b/parl/remote/tests/cluster_monitor_3_test.py index f141bf68268878a61b2562a66592af4efba961d6..6570746a04d35651353960882538f17203610708 100644 --- a/parl/remote/tests/cluster_monitor_3_test.py +++ b/parl/remote/tests/cluster_monitor_3_test.py @@ -22,7 +22,6 @@ import time import threading from parl.remote.client import disconnect from parl.remote import exceptions -import timeout_decorator import subprocess diff --git a/parl/remote/tests/cluster_monitor_test.py b/parl/remote/tests/cluster_monitor_test.py index abf5ea651654e44eeff49817bd993721cd3b21f0..94341700c00ffc8b5e26b818fa9d394d02b0c60a 100644 --- a/parl/remote/tests/cluster_monitor_test.py +++ b/parl/remote/tests/cluster_monitor_test.py @@ -22,7 +22,6 @@ import time import threading from parl.remote.client import disconnect from parl.remote import exceptions -import timeout_decorator import subprocess diff --git a/parl/remote/tests/cluster_test.py b/parl/remote/tests/cluster_test.py index 9025b7b6f295c0ab75b019f03e20b94bf04f3d52..0ac9d0ba4e9b6e023528a91d3ba496aa010755f9 100644 --- a/parl/remote/tests/cluster_test.py +++ b/parl/remote/tests/cluster_test.py @@ -21,8 +21,8 @@ import time import threading from parl.remote.client import disconnect from parl.remote import exceptions -import timeout_decorator import subprocess +from parl.utils import logger @parl.remote_class @@ -63,20 +63,24 @@ class TestCluster(unittest.TestCase): disconnect() def test_actor_exception(self): - master = Master(port=1235) + logger.info("running:test_actor_exception") + master = Master(port=8235) th = threading.Thread(target=master.run) th.start() time.sleep(3) - worker1 = Worker('localhost:1235', 1) + worker1 = 
Worker('localhost:8235', 1) for _ in range(3): if master.cpu_num == 1: break time.sleep(10) self.assertEqual(1, master.cpu_num) - parl.connect('localhost:1235') + logger.info("running:test_actor_exception: 0") + parl.connect('localhost:8235') + logger.info("running:test_actor_exception: 1") with self.assertRaises(exceptions.RemoteError): actor = Actor(abcd='a bug') + logger.info("running:test_actor_exception: 2") actor2 = Actor() for _ in range(3): @@ -89,15 +93,15 @@ class TestCluster(unittest.TestCase): master.exit() worker1.exit() - @timeout_decorator.timeout(seconds=800) - def test_actor_exception(self): - master = Master(port=1236) + def test_actor_exception_2(self): + logger.info("running: test_actor_exception_2") + master = Master(port=8236) th = threading.Thread(target=master.run) th.start() time.sleep(3) - worker1 = Worker('localhost:1236', 1) + worker1 = Worker('localhost:8236', 1) self.assertEqual(1, master.cpu_num) - parl.connect('localhost:1236') + parl.connect('localhost:8236') actor = Actor() try: actor.will_raise_exception_func() @@ -116,14 +120,15 @@ class TestCluster(unittest.TestCase): master.exit() def test_reset_actor(self): + logger.info("running: test_reset_actor") # start the master - master = Master(port=1237) + master = Master(port=8237) th = threading.Thread(target=master.run) th.start() time.sleep(3) - worker1 = Worker('localhost:1237', 4) - parl.connect('localhost:1237') + worker1 = Worker('localhost:8237', 4) + parl.connect('localhost:8237') for _ in range(10): actor = Actor() ret = actor.add_one(1) @@ -140,19 +145,20 @@ class TestCluster(unittest.TestCase): master.exit() def test_add_worker(self): - master = Master(port=1234) + logger.info("running: test_add_worker") + master = Master(port=8234) th = threading.Thread(target=master.run) th.start() time.sleep(1) - worker1 = Worker('localhost:1234', 4) + worker1 = Worker('localhost:8234', 4) for _ in range(3): if master.cpu_num == 4: break time.sleep(10) self.assertEqual(master.cpu_num, 4) - worker2 = Worker('localhost:1234', 4) + worker2 = Worker('localhost:8234', 4) for _ in range(3): if master.cpu_num == 8: break diff --git a/parl/framework/agent_base.py b/parl/remote/tests/local_actor_test.py similarity index 50% rename from parl/framework/agent_base.py rename to parl/remote/tests/local_actor_test.py index 331f93b3730be0ae6c17d19ba24e8dd1f03d9c05..0435ed233153ec9efee548e012eb70ead11e2dd5 100644 --- a/parl/framework/agent_base.py +++ b/parl/remote/tests/local_actor_test.py @@ -1,4 +1,4 @@ -# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. # # Licensed under the Apache License, Version 2.0 (the "License"); # you may not use this file except in compliance with the License. @@ -11,14 +11,28 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. 
+import os +os.environ['XPARL'] = 'True' +import parl +import unittest -import warnings -warnings.simplefilter('default') +@parl.remote_class(max_memory=350) +class Actor(object): + def __init__(self, x=10): + self.x = x + self.data = [] -warnings.warn( - "module `parl.framework.agent_base.Agent` is deprecated since version 1.2 and will be removed in version 1.3, please use `parl.Agent` instead.", - DeprecationWarning, - stacklevel=2) + def add_500mb(self): + self.data.append(os.urandom(500 * 1024**2)) + self.x += 1 + return self.x -from parl.core.fluid.agent import * + +class TestLocalActor(unittest.TestCase): + def test_create_actors_without_pre_connection(self): + actor = Actor() + + +if __name__ == '__main__': + unittest.main() diff --git a/parl/remote/tests/log_server_test.py b/parl/remote/tests/log_server_test.py new file mode 100644 index 0000000000000000000000000000000000000000..931fc29538df1bc1c960c57e2f97a54e4bb8e0aa --- /dev/null +++ b/parl/remote/tests/log_server_test.py @@ -0,0 +1,186 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import json +import multiprocessing +import os +import pickle +import subprocess +import sys +import tempfile +import threading +import time +import unittest + +import requests + +import parl +from parl.remote.client import disconnect, get_global_client +from parl.remote.master import Master +from parl.remote.worker import Worker +from parl.utils import _IS_WINDOWS + + +@parl.remote_class +class Actor(object): + def __init__(self, number=None, arg1=None, arg2=None): + self.number = number + self.arg1 = arg1 + self.arg2 = arg2 + print("Init actor...") + self.init_output = "Init actor...\n" + + def sim_output(self, start, end): + output = "" + print(self.number) + output += str(self.number) + output += "\n" + for i in range(start, end): + print(i) + output += str(i) + output += "\n" + return self.init_output + output + + +class TestLogServer(unittest.TestCase): + def tearDown(self): + disconnect() + + #In windows, multiprocessing.Process cannot run the method of class, but static method is ok. 
+ @staticmethod + def _connect_and_create_actor(cluster_addr): + parl.connect(cluster_addr) + outputs = [] + for i in range(2): + actor = Actor(number=i) + ret = actor.sim_output(1, 4) + assert ret != "" + outputs.append(ret) + return outputs + + def test_log_server(self): + master_port = 8401 + # start the master + master = Master(port=master_port) + th = threading.Thread(target=master.run) + th.start() + time.sleep(1) + + cluster_addr = 'localhost:{}'.format(master_port) + log_server_port = 8402 + worker = Worker(cluster_addr, 4, log_server_port=log_server_port) + outputs = self._connect_and_create_actor(cluster_addr) + + # Get status + status = master._get_status() + client_jobs = pickle.loads(status).get('client_jobs') + self.assertIsNotNone(client_jobs) + + # Get job id + client = get_global_client() + jobs = client_jobs.get(client.client_id) + self.assertIsNotNone(jobs) + + for job_id, log_server_addr in jobs.items(): + log_url = "http://{}/get-log".format(log_server_addr) + # Test response without job_id + r = requests.get(log_url) + self.assertEqual(r.status_code, 400) + # Test normal response + r = requests.get(log_url, params={'job_id': job_id}) + self.assertEqual(r.status_code, 200) + log_content = json.loads(r.text).get('log') + self.assertIsNotNone(log_content) + log_content = log_content.replace('\r\n', '\n') + self.assertIn(log_content, outputs) + + # Test download + download_url = "http://{}/download-log".format(log_server_addr) + r = requests.get(download_url, params={'job_id': job_id}) + self.assertEqual(r.status_code, 200) + log_content = r.text.replace('\r\n', '\n') + self.assertIn(log_content, outputs) + + disconnect() + worker.exit() + master.exit() + + def test_monitor_query_log_server(self): + master_port = 8403 + monitor_port = 8404 + # start the master + master = Master(port=master_port, monitor_port=monitor_port) + th = threading.Thread(target=master.run) + th.start() + time.sleep(1) + # start the cluster monitor + monitor_file = __file__.replace( + os.path.join('tests', 'log_server_test.pyc'), 'monitor.py') + monitor_file = monitor_file.replace( + os.path.join('tests', 'log_server_test.py'), 'monitor.py') + command = [ + sys.executable, monitor_file, "--monitor_port", + str(monitor_port), "--address", "localhost:" + str(master_port) + ] + if _IS_WINDOWS: + FNULL = tempfile.TemporaryFile() + else: + FNULL = open(os.devnull, 'w') + monitor_proc = subprocess.Popen( + command, + stdout=FNULL, + stderr=subprocess.STDOUT, + ) + + # Start worker + cluster_addr = 'localhost:{}'.format(master_port) + log_server_port = 8405 + worker = Worker(cluster_addr, 4, log_server_port=log_server_port) + + # Test monitor API + outputs = self._connect_and_create_actor(cluster_addr) + time.sleep(5) # Wait for the status update + client = get_global_client() + jobs_url = "{}/get-jobs?client_id={}".format(master.monitor_url, + client.client_id) + r = requests.get(jobs_url) + self.assertEqual(r.status_code, 200) + data = json.loads(r.text) + for job in data: + log_url = job.get('log_url') + self.assertIsNotNone(log_url) + r = requests.get(log_url) + self.assertEqual(r.status_code, 200) + log_content = json.loads(r.text).get('log') + self.assertIsNotNone(log_content) + log_content = log_content.replace('\r\n', '\n') + self.assertIn(log_content, outputs) + + # Test download + download_url = job.get('download_url') + r = requests.get(download_url) + self.assertEqual(r.status_code, 200) + log_content = r.text.replace('\r\n', '\n') + self.assertIn(log_content, outputs) + + # Clean context + 
monitor_proc.kill() + monitor_proc.wait() + disconnect() + worker.exit() + master.exit() + + +if __name__ == '__main__': + unittest.main() diff --git a/parl/remote/tests/multiprocessing/cluster_multiprocessing_1_test.py b/parl/remote/tests/multiprocessing/cluster_multiprocessing_1_test.py index 5dd7d9fca6737324ea187258f2f32364e72034b2..c87afad15d5e0e92e44bda1a97259eaaf6b32256 100644 --- a/parl/remote/tests/multiprocessing/cluster_multiprocessing_1_test.py +++ b/parl/remote/tests/multiprocessing/cluster_multiprocessing_1_test.py @@ -16,7 +16,6 @@ import unittest import parl import time import threading -import timeout_decorator import multiprocessing from parl.remote.master import Master @@ -39,12 +38,14 @@ class TestCluster(unittest.TestCase): def tearDown(self): disconnect() - def _connect_and_create_actor(self, cluster_addr): + #In windows, multiprocessing.Process cannot run the method of class, but static method is ok. + @staticmethod + def _connect_and_create_actor(cluster_addr): parl.connect(cluster_addr) for _ in range(2): actor = Actor() ret = actor.add_one(1) - self.assertEqual(ret, 2) + assert ret == 2 disconnect() def _create_actor(self): @@ -53,7 +54,6 @@ class TestCluster(unittest.TestCase): ret = actor.add_one(1) self.assertEqual(ret, 2) - @timeout_decorator.timeout(seconds=300) def test_connect_and_create_actor_in_multiprocessing_with_connected_in_main_process( self): # start the master diff --git a/parl/remote/tests/multiprocessing/cluster_multiprocessing_2_test.py b/parl/remote/tests/multiprocessing/cluster_multiprocessing_2_test.py index 8f4458912dbed9a7bcd408fed4b0937271b21b40..09b8b95002bdd5a45d132b04c39811bf5d32cdd8 100644 --- a/parl/remote/tests/multiprocessing/cluster_multiprocessing_2_test.py +++ b/parl/remote/tests/multiprocessing/cluster_multiprocessing_2_test.py @@ -16,7 +16,6 @@ import unittest import parl import time import threading -import timeout_decorator import multiprocessing from parl.remote.master import Master @@ -39,12 +38,14 @@ class TestCluster(unittest.TestCase): def tearDown(self): disconnect() - def _connect_and_create_actor(self, cluster_addr): + #In windows, multiprocessing.Process cannot run the method of class, but static method is ok. + @staticmethod + def _connect_and_create_actor(cluster_addr): parl.connect(cluster_addr) for _ in range(2): actor = Actor() ret = actor.add_one(1) - self.assertEqual(ret, 2) + assert ret == 2 disconnect() def _create_actor(self): @@ -53,7 +54,6 @@ class TestCluster(unittest.TestCase): ret = actor.add_one(1) self.assertEqual(ret, 2) - @timeout_decorator.timeout(seconds=300) def test_connect_and_create_actor_in_multiprocessing_without_connected_in_main_process( self): # start the master diff --git a/parl/remote/tests/multiprocessing/cluster_multiprocessing_test.py b/parl/remote/tests/multiprocessing/cluster_multiprocessing_test.py index 22625c0e959fdc7511eb1b4cb25cd6eaea31780a..3cdffd71aaf238c460fd31ae3c757d11e3fec18c 100644 --- a/parl/remote/tests/multiprocessing/cluster_multiprocessing_test.py +++ b/parl/remote/tests/multiprocessing/cluster_multiprocessing_test.py @@ -4,8 +4,7 @@ # you may not use this file except in compliance with the License. # You may obtain a copy of the License at # -# http://www.apache.org/licenses/LICENSE-2.0 -# +# http://www.apache.org/licenses/LICENSE-2.0 # # Unless required by applicable law or agreed to in writing, software # distributed under the License is distributed on an "AS IS" BASIS, # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
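The test changes above and below replace bound-method Process targets with staticmethod helpers because, as the in-code comments note, multiprocessing.Process cannot run a method of a class on Windows. A minimal standalone sketch of that pattern (class and address names are illustrative):

    import multiprocessing


    class ClusterTestHelper(object):
        # Windows uses the "spawn" start method, which has to serialize the
        # Process target; a staticmethod taking explicit arguments avoids
        # shipping the whole test-case instance to the child process.
        @staticmethod
        def create_actor(cluster_addr):
            print("would connect to", cluster_addr)


    if __name__ == "__main__":
        p = multiprocessing.Process(
            target=ClusterTestHelper.create_actor, args=("localhost:8240", ))
        p.start()
        p.join()
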
@@ -16,12 +15,12 @@ import unittest import parl import time import threading -import timeout_decorator import multiprocessing from parl.remote.master import Master from parl.remote.worker import Worker from parl.remote.client import disconnect +from parl.utils import _IS_WINDOWS @parl.remote_class @@ -39,21 +38,14 @@ class TestCluster(unittest.TestCase): def tearDown(self): disconnect() - def _connect_and_create_actor(self, cluster_addr): - parl.connect(cluster_addr) - for _ in range(2): - actor = Actor() - ret = actor.add_one(1) - self.assertEqual(ret, 2) - disconnect() - - def _create_actor(self): + #In windows, multiprocessing.Process cannot run the method of class, but static method is ok. + @staticmethod + def _create_actor(): for _ in range(2): actor = Actor() ret = actor.add_one(1) - self.assertEqual(ret, 2) + assert ret == 2 - @timeout_decorator.timeout(seconds=300) def test_create_actor_in_multiprocessing(self): # start the master master = Master(port=8240) @@ -64,14 +56,15 @@ class TestCluster(unittest.TestCase): worker1 = Worker('localhost:8240', 4) parl.connect('localhost:8240') - proc1 = multiprocessing.Process(target=self._create_actor) - proc2 = multiprocessing.Process(target=self._create_actor) - proc1.start() - proc2.start() + if not _IS_WINDOWS: # In windows, fork process cannot access client created in main process. + proc1 = multiprocessing.Process(target=self._create_actor) + proc2 = multiprocessing.Process(target=self._create_actor) + proc1.start() + proc2.start() - proc1.join() - proc2.join() - print("[test_create_actor_in_multiprocessing] Join") + proc1.join() + proc2.join() + print("[test_create_actor_in_multiprocessing] Join") # make sure that the client of the main process still works self._create_actor() diff --git a/parl/remote/tests/recursive_actor_test.py b/parl/remote/tests/recursive_actor_test.py new file mode 100644 index 0000000000000000000000000000000000000000..5e9613b6be23ae64bf2fba10df793cfa57dbeea1 --- /dev/null +++ b/parl/remote/tests/recursive_actor_test.py @@ -0,0 +1,56 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+import unittest +from parl.utils import logger +import parl +from parl.remote.client import disconnect +from parl.remote.master import Master +from parl.remote.worker import Worker +import time +import threading + +c = 10 +port = 3002 +if __name__ == '__main__': + master = Master(port=port) + th = threading.Thread(target=master.run) + th.setDaemon(True) + th.start() +time.sleep(5) +cluster_addr = 'localhost:{}'.format(port) +parl.connect(cluster_addr) +worker = Worker(cluster_addr, 1) + + +@parl.remote_class +class Actor(object): + def add(self, a, b): + return a + b + c + + +actor = Actor() + + +class TestRecursive_actor(unittest.TestCase): + def tearDown(self): + disconnect() + + def test_global_running(self): + self.assertEqual(actor.add(1, 2), 13) + master.exit() + worker.exit() + + +if __name__ == '__main__': + unittest.main() diff --git a/parl/remote/tests/reset_job_test.py b/parl/remote/tests/reset_job_test.py index 85f07184a0b55e7d3dcf285a707e1d7862ec08bf..478da6506821306d997ad788240178d4739c1bf6 100644 --- a/parl/remote/tests/reset_job_test.py +++ b/parl/remote/tests/reset_job_test.py @@ -23,7 +23,6 @@ import time import threading import subprocess import sys -import timeout_decorator @parl.remote_class @@ -63,7 +62,6 @@ class TestJob(unittest.TestCase): def tearDown(self): disconnect() - @timeout_decorator.timeout(seconds=600) def test_acor_exit_exceptionally(self): port = 1337 master = Master(port) diff --git a/parl/remote/tests/reset_job_test_alone.py b/parl/remote/tests/reset_job_test_alone.py index 81cc2fe77a102521c0dc0633d215821a2a5d991c..425f02ad4ee8acca93560d6a4de5e4836112ffb9 100644 --- a/parl/remote/tests/reset_job_test_alone.py +++ b/parl/remote/tests/reset_job_test_alone.py @@ -16,7 +16,8 @@ import parl from parl.remote.master import Master from parl.remote.worker import Worker from parl.remote.client import disconnect -from parl.utils import logger +from parl.utils import logger, _IS_WINDOWS +import os import threading import time import subprocess @@ -70,9 +71,14 @@ class TestJobAlone(unittest.TestCase): time.sleep(1) self.assertEqual(master.cpu_num, 4) print("We are going to kill all the jobs.") - command = ( - "ps aux | grep remote/job.py | awk '{print $2}' | xargs kill -9") - subprocess.call([command], shell=True) + if _IS_WINDOWS: + command = r'''for /F "skip=2 tokens=2 delims=," %a in ('wmic process where "commandline like '%remote\\job.py%'" get processid^,status /format:csv') do taskkill /F /T /pid %a''' + print(os.popen(command).read()) + else: + command = ( + "ps aux | grep remote/job.py | awk '{print $2}' | xargs kill -9" + ) + subprocess.call([command], shell=True) parl.connect('localhost:1334') actor = Actor() self.assertEqual(actor.add_one(1), 2) diff --git a/parl/remote/tests/send_job_test.py b/parl/remote/tests/send_job_test.py index 77ea421fde09e042c6620da8087a683fb4710acf..8ea2d4083c45f66184e8a2287c77e3f0ca840257 100644 --- a/parl/remote/tests/send_job_test.py +++ b/parl/remote/tests/send_job_test.py @@ -21,6 +21,7 @@ import threading from parl.remote.master import Master from parl.remote.worker import Worker from parl.remote.client import disconnect +from parl.utils import _IS_WINDOWS @parl.remote_class @@ -44,12 +45,15 @@ class TestSendFile(unittest.TestCase): worker = Worker('localhost:{}'.format(port), 1) time.sleep(2) - os.system('mkdir ./rom_files') - os.system('touch ./rom_files/pong.bin') - assert os.path.exists('./rom_files/pong.bin') - parl.connect( - 'localhost:{}'.format(port), - distributed_files=['./rom_files/pong.bin']) + tmp_dir = 
'rom_files' + tmp_file = os.path.join(tmp_dir, 'pong.bin') + os.system('mkdir {}'.format(tmp_dir)) + if _IS_WINDOWS: + os.system('type NUL >> {}'.format(tmp_file)) + else: + os.system('touch {}'.format(tmp_file)) + assert os.path.exists(tmp_file) + parl.connect('localhost:{}'.format(port), distributed_files=[tmp_file]) time.sleep(5) actor = Actor() for _ in range(10): @@ -70,8 +74,9 @@ class TestSendFile(unittest.TestCase): worker = Worker('localhost:{}'.format(port), 1) time.sleep(2) + tmp_file = os.path.join('rom_files', 'no_pong.bin') self.assertRaises(Exception, parl.connect, 'localhost:{}'.format(port), - ['./rom_files/no_pong.bin']) + [tmp_file]) worker.exit() master.exit() diff --git a/parl/remote/tests/sync_config_file_test.py b/parl/remote/tests/sync_config_file_test.py index a4d131d5e13111a1c7faaa209aa2acb114e7c7c7..c8be19443e446e1d90819a63c2a64b471fb23e6d 100644 --- a/parl/remote/tests/sync_config_file_test.py +++ b/parl/remote/tests/sync_config_file_test.py @@ -17,12 +17,10 @@ import parl from parl.remote.master import Master from parl.remote.worker import Worker from parl.remote.client import disconnect - +import os import time import threading - import sys - import numpy as np import json @@ -65,7 +63,8 @@ class TestConfigfile(unittest.TestCase): parl.connect('localhost:1335', ['random.npy', 'config.json']) actor = Actor('random.npy', 'config.json') time.sleep(5) - + os.remove('./random.npy') + os.remove('./config.json') remote_sum = actor.random_sum() self.assertEqual(remote_sum, random_sum) time.sleep(10) diff --git a/parl/remote/utils.py b/parl/remote/utils.py new file mode 100644 index 0000000000000000000000000000000000000000..9a2ece8686ff7de73c8164565f34281e412aa4ee --- /dev/null +++ b/parl/remote/utils.py @@ -0,0 +1,96 @@ +# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +import sys +from contextlib import contextmanager + +__all__ = ['load_remote_class', 'redirect_stdout_to_file'] + + +def simplify_code(code, end_of_file): + """ + @parl.remote_actor has to use this function to simplify the code. + To create a remote object, PARL has to import the module that contains the decorated class. + It may run some unnecessary code when importing the module, and this is why we use this function + to simplify the code. + + For example. + @parl.remote_actor + class A(object): + def add(self, a, b): + return a + b + def data_process(): + XXXX + ------------------> + The last two lines of the above code block will be removed as they are not class related. + """ + to_write_lines = [] + for i, line in enumerate(code): + if line.startswith('parl.connect'): + continue + if i < end_of_file - 1: + to_write_lines.append(line) + else: + break + return to_write_lines + + +def load_remote_class(file_name, class_name, end_of_file): + """ + load a class given its file_name and class_name. 
+ + Args: + file_name: specify the file to load the class + class_name: specify the class to be loaded + end_of_file: line ID to indicate the last line that defines the class. + + Return: + cls: the class to load + """ + with open(file_name + '.py') as t_file: + code = t_file.readlines() + code = simplify_code(code, end_of_file) + module_name = 'xparl_' + file_name + tmp_file_name = 'xparl_' + file_name + '.py' + with open(tmp_file_name, 'w') as t_file: + for line in code: + t_file.write(line) + mod = __import__(module_name) + cls = getattr(mod, class_name) + return cls + + +@contextmanager +def redirect_stdout_to_file(file_path): + """Redirect stdout (e.g., `print`) to specified file. + + Example: + >>> print('test') + test + >>> with redirect_stdout_to_file('test.log'): + ... print('test') # Output nothing, `test` is printed to `test.log`. + >>> print('test') + test + + Args: + file_path: Path of the file to output the stdout. + + """ + tmp = sys.stdout + f = open(file_path, 'a') + sys.stdout = f + try: + yield + finally: + sys.stdout = tmp + f.close() diff --git a/parl/remote/worker.py b/parl/remote/worker.py index fae9cd8306463b4d463d90292dbeabbd79b6b492..eec5598c6d081ca054541657c61670ecffc70cee 100644 --- a/parl/remote/worker.py +++ b/parl/remote/worker.py @@ -20,13 +20,14 @@ import signal import socket import subprocess import sys +import tempfile import time import threading import warnings import zmq from datetime import datetime -from parl.utils import get_ip_address, to_byte, to_str, logger +from parl.utils import get_ip_address, to_byte, to_str, logger, _IS_WINDOWS, kill_process from parl.remote import remote_constants from parl.remote.message import InitializedWorker from parl.remote.status import WorkerStatus @@ -63,7 +64,7 @@ class Worker(object): cpu_num (int): Number of cpu to be used on the worker. """ - def __init__(self, master_address, cpu_num=None): + def __init__(self, master_address, cpu_num=None, log_server_port=None): self.lock = threading.Lock() self.heartbeat_socket_initialized = threading.Event() self.ctx = zmq.Context.instance() @@ -75,9 +76,13 @@ class Worker(object): self._set_cpu_num(cpu_num) self.job_buffer = queue.Queue(maxsize=self.cpu_num) self._create_sockets() + # create log server + self.log_server_proc, self.log_server_address = self._create_log_server( + port=log_server_port) # create a thread that waits commands from the job to kill the job. self.kill_job_thread = threading.Thread(target=self._reply_kill_job) + self.kill_job_thread.setDaemon(True) self.kill_job_thread.start() self._create_jobs() @@ -169,6 +174,7 @@ class Worker(object): def _fill_job_buffer(self): """An endless loop that adds initialized job into the job buffer""" + initialized_jobs = [] while self.worker_is_alive: if self.job_buffer.full() is False: job_num = self.cpu_num - self.job_buffer.qsize() @@ -178,13 +184,7 @@ class Worker(object): self.job_buffer.put(job) time.sleep(0.02) - - # release jobs if the worker is not alive - for job in initialized_jobs: - try: - os.kill(job.pid, signal.SIGTERM) - except OSError: - pass + self.exit() def _init_jobs(self, job_num): """Create jobs. 
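The redirect_stdout_to_file helper added above uses the classic save/swap/restore pattern on sys.stdout, presumably kept as a custom helper for Python 2 compatibility. For comparison, on Python 3.4+ the standard library offers contextlib.redirect_stdout, which achieves the same effect when the file handle is managed explicitly; a small sketch (the file name is illustrative):

import contextlib

# Everything printed inside the block is appended to job.log; sys.stdout is
# restored automatically when the block exits, even on exceptions.
with open('job.log', 'a') as f, contextlib.redirect_stdout(f):
    print('this line goes to job.log, not to the terminal')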
@@ -196,7 +196,8 @@ class Worker(object): job_file = job_file.replace('worker.py', 'job.py') command = [ sys.executable, job_file, "--worker_address", - self.reply_job_address + self.reply_job_address, "--log_server_address", + self.log_server_address ] if sys.version_info.major == 3: @@ -223,6 +224,7 @@ class Worker(object): # a thread for sending heartbeat signals to job thread = threading.Thread( target=self._create_job_monitor, args=(initialized_job, )) + thread.setDaemon(True) thread.start() self.lock.release() assert len(new_jobs) > 0, "init jobs failed" @@ -311,7 +313,10 @@ class Worker(object): total_memory = round(virtual_memory[0] / (1024**3), 2) used_memory = round(virtual_memory[3] / (1024**3), 2) vacant_memory = round(total_memory - used_memory, 2) - load_average = round(os.getloadavg()[0], 2) + if _IS_WINDOWS: + load_average = round(psutil.getloadavg()[0], 2) + else: + load_average = round(os.getloadavg()[0], 2) return (vacant_memory, used_memory, now, load_average) def _reply_heartbeat(self, target): @@ -329,7 +334,7 @@ class Worker(object): logger.set_dir( os.path.expanduser('~/.parl_data/worker/{}'.format( - self.master_heartbeat_address))) + self.master_heartbeat_address.replace(':', '_')))) self.heartbeat_socket_initialized.set() logger.info("[Worker] Connect to the master node successfully. " @@ -351,15 +356,47 @@ class Worker(object): break socket.close(0) logger.warning( - "[Worker] lost connection with the master, will exit replying heartbeat for master." + "[Worker] lost connection with the master, will exit reply heartbeat for master." ) self.worker_status.clear() + self.log_server_proc.kill() + self.log_server_proc.wait() # exit the worker self.worker_is_alive = False + self.exit() + + def _create_log_server(self, port): + log_server_file = __file__.replace('worker.pyc', 'log_server.py') + log_server_file = log_server_file.replace('worker.py', 'log_server.py') + + if port is None: + port = "0" # `0` means using a random port in flask + command = [ + sys.executable, log_server_file, "--port", + str(port), "--log_dir", "~/.parl_data/job/", "--line_num", "500" + ] + + if sys.version_info.major == 3: + warnings.simplefilter("ignore", ResourceWarning) + + if _IS_WINDOWS: + FNULL = tempfile.TemporaryFile() + else: + FNULL = open(os.devnull, 'w') + log_server_proc = subprocess.Popen( + command, + stdout=FNULL, + stderr=subprocess.STDOUT, + ) + FNULL.close() + + log_server_address = "{}:{}".format(self.worker_ip, port) + return log_server_proc, log_server_address def exit(self): """close the worker""" self.worker_is_alive = False + kill_process('remote/job.py.*{}'.format(self.reply_job_address)) def run(self): """Keep running until it lost connection with the master. 
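The worker above launches the log server as a detached child process and discards its output by opening os.devnull (or a TemporaryFile on Windows) by hand. On Python 3 the same effect is available through subprocess.DEVNULL without managing a handle; a hedged sketch, with the child command purely illustrative:

import subprocess
import sys

# Discard the child's stdout and stderr on both Windows and POSIX;
# subprocess.DEVNULL is available from Python 3.3 onwards.
proc = subprocess.Popen(
    [sys.executable, '-c', 'print("hello from the child")'],
    stdout=subprocess.DEVNULL,
    stderr=subprocess.STDOUT,
)
proc.wait()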
diff --git a/parl/utils/communication.py b/parl/utils/communication.py index ea201bae16e571ab429ef8f194228fc5b7fa4432..c13c28e93df006fd464b880e4196e22cbacc97cb 100644 --- a/parl/utils/communication.py +++ b/parl/utils/communication.py @@ -14,6 +14,8 @@ import cloudpickle import pyarrow +import subprocess +import os from parl.utils import SerializeError, DeserializeError __all__ = ['dumps_argument', 'loads_argument', 'dumps_return', 'loads_return'] diff --git a/parl/utils/machine_info.py b/parl/utils/machine_info.py index 3ab8e404a442d8ac4eec20e5cbb3bf6c07b0f541..f69319ad02ea0673ef135b1cd914a70fc6a2fea3 100644 --- a/parl/utils/machine_info.py +++ b/parl/utils/machine_info.py @@ -14,40 +14,40 @@ import os import platform +import random +import socket import subprocess -from parl.utils import logger -from parl.utils import utils +from parl.utils import logger, _HAS_FLUID, _IS_WINDOWS -__all__ = ['get_gpu_count', 'get_ip_address', 'is_gpu_available'] +__all__ = [ + 'get_gpu_count', 'get_ip_address', 'is_gpu_available', 'get_free_tcp_port', + 'is_port_available', 'get_port_from_range' +] def get_ip_address(): """ get the IP address of the host. """ - platform_sys = platform.system() - # Only support Linux and MacOS - if platform_sys != 'Linux' and platform_sys != 'Darwin': - logger.warning( - 'get_ip_address only support Linux and MacOS, please set ip address manually.' - ) - return None - - local_ip = None - import socket - try: - # First way, tested in Ubuntu and MacOS - s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) - s.connect(("8.8.8.8", 80)) - local_ip = s.getsockname()[0] - s.close() - except: - # Second way, tested in CentOS + # Windows + if _IS_WINDOWS: + local_ip = socket.gethostbyname(socket.gethostname()) + else: + # Linux and MacOS + local_ip = None try: - local_ip = socket.gethostbyname(socket.gethostname()) + # First way, tested in Ubuntu and MacOS + s = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) + s.connect(("8.8.8.8", 80)) + local_ip = s.getsockname()[0] + s.close() except: - pass + # Second way, tested in CentOS + try: + local_ip = socket.gethostbyname(socket.gethostname()) + except: + pass if local_ip == None or local_ip == '127.0.0.1' or local_ip == '127.0.1.1': logger.warning( @@ -97,10 +97,40 @@ def is_gpu_available(): True if a gpu device can be found. """ ret = get_gpu_count() > 0 - if utils._HAS_FLUID: + if _HAS_FLUID: from paddle import fluid if ret is True and not fluid.is_compiled_with_cuda(): logger.warning("Found non-empty CUDA_VISIBLE_DEVICES. \ - But PARL found that Paddle was not complied with CUDA, which may cause issues." - ) + But PARL found that Paddle was not complied with CUDA, which may cause issues. \ + Thus PARL will not use GPU.") + return False return ret + + +def get_free_tcp_port(): + tcp = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + tcp.bind(('', 0)) + addr, port = tcp.getsockname() + tcp.close() + return str(port) + + +def is_port_available(port): + """ Check if a port is used. + + True if the port is available for connection. 
+ """ + port = int(port) + sock = socket.socket(socket.AF_INET, socket.SOCK_STREAM) + available = sock.connect_ex(('localhost', port)) + sock.close() + return available + + +def get_port_from_range(start, end): + while True: + port = random.randint(start, end) + if is_port_available(port): + break + + return port diff --git a/examples/LiftSim_baseline/__init__.py b/parl/utils/summary.py similarity index 86% rename from examples/LiftSim_baseline/__init__.py rename to parl/utils/summary.py index eca2dce114b069bf9b455d77ce670d73b5047fd2..bc3578ef384222a4e55b7b9af90f36d9a7fccb4c 100644 --- a/examples/LiftSim_baseline/__init__.py +++ b/parl/utils/summary.py @@ -11,3 +11,8 @@ # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. # See the License for the specific language governing permissions and # limitations under the License. + +try: + from parl.utils.visualdl import * +except: + from parl.utils.tensorboard import * diff --git a/parl/utils/tensorboard.py b/parl/utils/tensorboard.py index 575fc6b9976906e43dddeb7da2ea1ef32d4644c1..3fef518196216986f33f187c215b8aa4834003d5 100644 --- a/parl/utils/tensorboard.py +++ b/parl/utils/tensorboard.py @@ -14,6 +14,7 @@ from tensorboardX import SummaryWriter from parl.utils import logger +from parl.utils.machine_info import get_ip_address __all__ = [] @@ -29,8 +30,8 @@ def create_file_after_first_call(func_name): if logdir is None: logdir = logger.auto_set_dir(action='d') logger.warning( - "[tensorboard] logdir is None, will save tensorboard files to {}" - .format(logdir)) + "[tensorboard] logdir is None, will save tensorboard files to {}\nView the data using: tensorboard --logdir=./{} --host={}" + .format(logdir, logdir, get_ip_address())) _writer = SummaryWriter(logdir=logger.get_dir()) func = getattr(_writer, func_name) func(*args, **kwargs) diff --git a/parl/utils/tests/tensorboard_test.py b/parl/utils/tests/summary_test.py similarity index 74% rename from parl/utils/tests/tensorboard_test.py rename to parl/utils/tests/summary_test.py index 65fcb82404adfe461395e594dcf112ea41fd330e..401051c5debd3ed69d3cba54bcdda1c9ef75f12c 100644 --- a/parl/utils/tests/tensorboard_test.py +++ b/parl/utils/tests/summary_test.py @@ -12,7 +12,7 @@ # See the License for the specific language governing permissions and # limitations under the License. import unittest -from parl.utils import tensorboard +from parl.utils import summary import numpy as np from parl.utils import logger import os @@ -20,18 +20,21 @@ import os class TestUtils(unittest.TestCase): def tearDown(self): - tensorboard.flush() + if hasattr(summary, 'flush'): + summary.flush() def test_add_scalar(self): x = range(100) for i in x: - tensorboard.add_scalar('y=2x', i * 2, i) - self.assertTrue(os.path.exists('./train_log/tensorboard_test')) + summary.add_scalar('y=2x', i * 2, i) + self.assertTrue(os.path.exists('./train_log/summary_test')) def test_add_histogram(self): + if not hasattr(summary, 'add_histogram'): + return for i in range(10): x = np.random.random(1000) - tensorboard.add_histogram('distribution centers', x + i, i) + summary.add_histogram('distribution centers', x + i, i) if __name__ == '__main__': diff --git a/parl/utils/utils.py b/parl/utils/utils.py index cb95b4d18c2c7aa38b2822a12b2995380609535d..a29a8c825017f9241ea59f76f5fe5e58de4f7b80 100644 --- a/parl/utils/utils.py +++ b/parl/utils/utils.py @@ -13,10 +13,14 @@ # limitations under the License. 
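# Note on the port helpers above: socket.connect_ex() returns 0 when
# something is already listening on the port and a non-zero error code when
# nothing is, so the truthy return value of is_port_available() means the
# port is free, which is exactly what get_port_from_range() relies on.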
import sys +import os +import subprocess +import numpy as np __all__ = [ 'has_func', 'action_mapping', 'to_str', 'to_byte', 'is_PY2', 'is_PY3', - 'MAX_INT32', '_HAS_FLUID', '_HAS_TORCH' + 'MAX_INT32', '_HAS_FLUID', '_HAS_TORCH', '_IS_WINDOWS', '_IS_MAC', + 'kill_process' ] @@ -45,9 +49,12 @@ def action_mapping(model_output_act, low_bound, high_bound): Returns: action: np.array, which value is in [low_bound, high_bound] """ + assert np.all(((model_output_act<=1.0), (model_output_act>=-1.0))), \ + 'the action should be in range [-1.0, 1.0]' assert high_bound > low_bound action = low_bound + (model_output_act - (-1.0)) * ( (high_bound - low_bound) / 2.0) + action = np.clip(action, low_bound, high_bound) return action @@ -82,7 +89,7 @@ MAX_INT32 = 0x7fffffff try: from paddle import fluid fluid_version = get_fluid_version() - assert fluid_version >= 151, "PARL requires paddle>=1.5.1" + assert fluid_version >= 161 or fluid_version == 0, "PARL requires paddle>=1.6.1" _HAS_FLUID = True except ImportError: _HAS_FLUID = False @@ -92,3 +99,26 @@ try: _HAS_TORCH = True except ImportError: _HAS_TORCH = False + +_IS_WINDOWS = (sys.platform == 'win32') +_IS_MAC = (sys.platform == 'darwin') + + +def kill_process(regex_pattern): + """kill process whose execution commnad is matched by regex pattern + + Args: + regex_pattern(string): regex pattern used to filter the process to be killed + + NOTE: + In windows, we will replace sep `/` with `\\\\` + """ + if _IS_WINDOWS: + regex_pattern = regex_pattern.replace('/', '\\\\') + command = r'''for /F "skip=2 tokens=2 delims=," %a in ('wmic process where "commandline like '%{}%'" get processid^,status /format:csv') do taskkill /F /T /pid %a'''.format( + regex_pattern) + os.popen(command).read() + else: + command = "ps aux | grep {} | awk '{{print $2}}' | xargs kill -9".format( + regex_pattern) + subprocess.call([command], shell=True) diff --git a/parl/utils/visualdl.py b/parl/utils/visualdl.py new file mode 100644 index 0000000000000000000000000000000000000000..bbf1aa08e313440a47a73936d1be61ca9701f166 --- /dev/null +++ b/parl/utils/visualdl.py @@ -0,0 +1,46 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
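# Worked example of the action_mapping change above (numbers illustrative):
# with low_bound=-2.0 and high_bound=2.0, a model output of 0.5 maps to
# -2.0 + (0.5 - (-1.0)) * ((2.0 - (-2.0)) / 2.0) = 1.0, and the added
# np.clip keeps any numerical overshoot inside [-2.0, 2.0].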
+ +from visualdl import LogWriter +from parl.utils import logger +from parl.utils.machine_info import get_ip_address + +__all__ = [] + +_writer = None +_WRITTER_METHOD = ['add_scalar'] + + +def create_file_after_first_call(func_name): + def call(*args, **kwargs): + global _writer + if _writer is None: + logdir = logger.get_dir() + if logdir is None: + logdir = logger.auto_set_dir(action='d') + logger.warning( + "[VisualDL] logdir is None, will save VisualDL files to {}\nView the data using: visualdl --logdir=./{} --host={}" + .format(logdir, logdir, get_ip_address())) + _writer = LogWriter(logdir=logger.get_dir()) + func = getattr(_writer, func_name) + func(*args, **kwargs) + _writer.flush() + + return call + + +# export writter functions +for func_name in _WRITTER_METHOD: + locals()[func_name] = create_file_after_first_call(func_name) + __all__.append(func_name) diff --git a/setup.py b/setup.py index 18f7ac96a8b075a3db3a12bb19f8f0c0905e33c2..56af19b09fe0f7bf658557ee5b8ab4f7e25498ab 100644 --- a/setup.py +++ b/setup.py @@ -31,7 +31,12 @@ def _find_packages(prefix=''): prefix = prefix for root, _, files in os.walk(path): if '__init__.py' in files: - packages.append(re.sub('^[^A-z0-9_]', '', root.replace('/', '.'))) + if sys.platform == 'win32': + packages.append( + re.sub('^[^A-z0-9_]', '', root.replace('\\', '.'))) + else: + packages.append( + re.sub('^[^A-z0-9_]', '', root.replace('/', '.'))) return packages @@ -72,9 +77,11 @@ setup( "cloudpickle==1.2.1", "tensorboardX==1.8", "tb-nightly==1.15.0a20190801", - "flask==1.0.4", + "flask>=1.0.4", "click", - "psutil", + "psutil>=5.6.2", + "flask_cors", + "visualdl>=2.0.0b;python_version>='3' and platform_system=='Linux'", ], classifiers=[ 'Intended Audience :: Developers',