提交 613d7c81 编写于 作者: L liaogang

Fix conflicts with develop branch

[submodule "warp-ctc"]
path = warp-ctc
url = https://github.com/baidu-research/warp-ctc.git
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
sha: c25201a00e6b0514370501050cf2a8538ac12270 sha: c25201a00e6b0514370501050cf2a8538ac12270
hooks: hooks:
- id: remove-crlf - id: remove-crlf
files: (?!.*warp-ctc)^.*$
- repo: https://github.com/reyoung/mirrors-yapf.git - repo: https://github.com/reyoung/mirrors-yapf.git
sha: v0.13.2 sha: v0.13.2
hooks: hooks:
...@@ -13,6 +14,7 @@ ...@@ -13,6 +14,7 @@
- id: check-merge-conflict - id: check-merge-conflict
- id: check-symlinks - id: check-symlinks
- id: detect-private-key - id: detect-private-key
files: (?!.*warp-ctc)^.*$
- id: end-of-file-fixer - id: end-of-file-fixer
- repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git - repo: https://github.com/PaddlePaddle/clang-format-pre-commit-hook.git
sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29 sha: 28c0ea8a67a3e2dbbf4822ef44e85b63a0080a29
......
...@@ -50,7 +50,7 @@ before_install: ...@@ -50,7 +50,7 @@ before_install:
fi fi
- if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi - if [[ "$TRAVIS_OS_NAME" == "linux" ]]; then sudo paddle/scripts/travis/before_install.linux.sh; fi
- if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi - if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then paddle/scripts/travis/before_install.osx.sh; fi
- pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy - pip install wheel protobuf sphinx breathe recommonmark virtualenv numpy sphinx_rtd_theme
script: script:
- paddle/scripts/travis/main.sh - paddle/scripts/travis/main.sh
notifications: notifications:
......
...@@ -71,10 +71,10 @@ find_package(Git REQUIRED) ...@@ -71,10 +71,10 @@ find_package(Git REQUIRED)
include(version) include(version)
add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\") add_definitions(-DPADDLE_VERSION=\"${PADDLE_VERSION}\")
if(NOT WITH_GPU) if(NOT WITH_GPU)
add_definitions(-DPADDLE_ONLY_CPU) add_definitions(-DPADDLE_ONLY_CPU)
add_definitions(-DHPPL_STUB_FUNC) add_definitions(-DHPPL_STUB_FUNC)
list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu) list(APPEND CMAKE_CXX_SOURCE_FILE_EXTENSIONS cu)
else() else()
if(${CUDA_VERSION_MAJOR} VERSION_LESS 7) if(${CUDA_VERSION_MAJOR} VERSION_LESS 7)
...@@ -91,15 +91,15 @@ else() ...@@ -91,15 +91,15 @@ else()
set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}") set(CUDA_NVCC_FLAGS ${CUDA_NVCC_FLAGS} "-Xcompiler ${SSE3_FLAG}")
endif(WITH_AVX) endif(WITH_AVX)
if(WITH_DSO)
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)
# Include cuda and cudnn # Include cuda and cudnn
include_directories(${CUDNN_INCLUDE_DIR}) include_directories(${CUDNN_INCLUDE_DIR})
include_directories(${CUDA_TOOLKIT_INCLUDE}) include_directories(${CUDA_TOOLKIT_INCLUDE})
endif(NOT WITH_GPU) endif(NOT WITH_GPU)
if(WITH_DSO)
add_definitions(-DPADDLE_USE_DSO)
endif(WITH_DSO)
if(WITH_DOUBLE) if(WITH_DOUBLE)
add_definitions(-DPADDLE_TYPE_DOUBLE) add_definitions(-DPADDLE_TYPE_DOUBLE)
set(ACCURACY double) set(ACCURACY double)
......
...@@ -148,6 +148,11 @@ function(link_paddle_exe TARGET_NAME) ...@@ -148,6 +148,11 @@ function(link_paddle_exe TARGET_NAME)
target_link_libraries(${TARGET_NAME} rt) target_link_libraries(${TARGET_NAME} rt)
endif() endif()
endif() endif()
if(NOT WITH_DSO)
target_link_libraries(${TARGET_NAME}
${WARPCTC_LIBRARY})
endif()
endfunction() endfunction()
# link_paddle_test # link_paddle_test
......
...@@ -19,27 +19,43 @@ START = "<s>" ...@@ -19,27 +19,43 @@ START = "<s>"
END = "<e>" END = "<e>"
def hook(settings, src_dict, trg_dict, file_list, **kwargs): def hook(settings, src_dict_path, trg_dict_path, is_generating, file_list,
**kwargs):
# job_mode = 1: training mode # job_mode = 1: training mode
# job_mode = 0: generating mode # job_mode = 0: generating mode
settings.job_mode = trg_dict is not None settings.job_mode = not is_generating
settings.src_dict = src_dict
def fun(dict_path):
out_dict = dict()
with open(dict_path, "r") as fin:
out_dict = {
line.strip(): line_count
for line_count, line in enumerate(fin)
}
return out_dict
settings.src_dict = fun(src_dict_path)
settings.trg_dict = fun(trg_dict_path)
settings.logger.info("src dict len : %d" % (len(settings.src_dict))) settings.logger.info("src dict len : %d" % (len(settings.src_dict)))
settings.sample_count = 0
if settings.job_mode: if settings.job_mode:
settings.trg_dict = trg_dict settings.slots = {
settings.slots = [ 'source_language_word':
integer_value_sequence(len(settings.src_dict)), integer_value_sequence(len(settings.src_dict)),
'target_language_word':
integer_value_sequence(len(settings.trg_dict)), integer_value_sequence(len(settings.trg_dict)),
'target_language_next_word':
integer_value_sequence(len(settings.trg_dict)) integer_value_sequence(len(settings.trg_dict))
] }
settings.logger.info("trg dict len : %d" % (len(settings.trg_dict))) settings.logger.info("trg dict len : %d" % (len(settings.trg_dict)))
else: else:
settings.slots = [ settings.slots = {
'source_language_word':
integer_value_sequence(len(settings.src_dict)), integer_value_sequence(len(settings.src_dict)),
'sent_id':
integer_value_sequence(len(open(file_list[0], "r").readlines())) integer_value_sequence(len(open(file_list[0], "r").readlines()))
] }
def _get_ids(s, dictionary): def _get_ids(s, dictionary):
...@@ -69,6 +85,10 @@ def process(settings, file_name): ...@@ -69,6 +85,10 @@ def process(settings, file_name):
continue continue
trg_ids_next = trg_ids + [settings.trg_dict[END]] trg_ids_next = trg_ids + [settings.trg_dict[END]]
trg_ids = [settings.trg_dict[START]] + trg_ids trg_ids = [settings.trg_dict[START]] + trg_ids
yield src_ids, trg_ids, trg_ids_next yield {
'source_language_word': src_ids,
'target_language_word': trg_ids,
'target_language_next_word': trg_ids_next
}
else: else:
yield src_ids, [line_count] yield {'source_language_word': src_ids, 'sent_id': [line_count]}
...@@ -37,17 +37,10 @@ def seq_to_seq_data(data_dir, ...@@ -37,17 +37,10 @@ def seq_to_seq_data(data_dir,
""" """
src_lang_dict = os.path.join(data_dir, 'src.dict') src_lang_dict = os.path.join(data_dir, 'src.dict')
trg_lang_dict = os.path.join(data_dir, 'trg.dict') trg_lang_dict = os.path.join(data_dir, 'trg.dict')
src_dict = dict()
for line_count, line in enumerate(open(src_lang_dict, "r")):
src_dict[line.strip()] = line_count
trg_dict = dict()
for line_count, line in enumerate(open(trg_lang_dict, "r")):
trg_dict[line.strip()] = line_count
if is_generating: if is_generating:
train_list = None train_list = None
test_list = os.path.join(data_dir, gen_list) test_list = os.path.join(data_dir, gen_list)
trg_dict = None
else: else:
train_list = os.path.join(data_dir, train_list) train_list = os.path.join(data_dir, train_list)
test_list = os.path.join(data_dir, test_list) test_list = os.path.join(data_dir, test_list)
...@@ -57,8 +50,11 @@ def seq_to_seq_data(data_dir, ...@@ -57,8 +50,11 @@ def seq_to_seq_data(data_dir,
test_list, test_list,
module="dataprovider", module="dataprovider",
obj="process", obj="process",
args={"src_dict": src_dict, args={
"trg_dict": trg_dict}) "src_dict_path": src_lang_dict,
"trg_dict_path": trg_lang_dict,
"is_generating": is_generating
})
return { return {
"src_dict_path": src_lang_dict, "src_dict_path": src_lang_dict,
......
...@@ -23,7 +23,7 @@ AutoStructify = transform.AutoStructify ...@@ -23,7 +23,7 @@ AutoStructify = transform.AutoStructify
# documentation root, use os.path.abspath to make it absolute, like shown here. # documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '@PROJ_ROOT@/python') sys.path.insert(0, '@PROJ_ROOT@/python')
templates_path = ["@PROJ_ROOT@/doc/templates"] templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
# -- General configuration ------------------------------------------------ # -- General configuration ------------------------------------------------
...@@ -113,13 +113,12 @@ todo_include_todos = False ...@@ -113,13 +113,12 @@ todo_include_todos = False
# The theme to use for HTML and HTML Help pages. See the documentation for # The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes. # a list of builtin themes.
#html_theme = 'sphinx_rtd_theme' html_theme = 'sphinx_rtd_theme'
html_theme = 'classic'
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static'] html_static_path = ['@PROJ_ROOT@/doc_theme/static']
# Output file base name for HTML help builder. # Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc' htmlhelp_basename = project + 'doc'
......
...@@ -11,6 +11,7 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle ...@@ -11,6 +11,7 @@ You can download PaddlePaddle from the [github source](https://github.com/Paddle
```bash ```bash
git clone https://github.com/PaddlePaddle/Paddle paddle git clone https://github.com/PaddlePaddle/Paddle paddle
cd paddle cd paddle
git submodule update --init --recursive
``` ```
## <span id="requirements">Requirements</span> ## <span id="requirements">Requirements</span>
......
...@@ -19,8 +19,8 @@ automatically runs the following commands: ...@@ -19,8 +19,8 @@ automatically runs the following commands:
.. code-block:: base .. code-block:: base
docker build -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . docker build -t paddle:cpu -f paddle/scripts/docker/Dockerfile .
docker build -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . docker build -t paddle:gpu -f paddle/scripts/docker/Dockerfile.gpu .
To run the CPU-only image as an interactive container: To run the CPU-only image as an interactive container:
...@@ -79,5 +79,28 @@ source code: ...@@ -79,5 +79,28 @@ source code:
cd ~ cd ~
git clone github.com/PaddlePaddle/Paddle git clone github.com/PaddlePaddle/Paddle
cd Paddle cd Paddle
git submodule update --init --recursive
docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile . docker build --build-arg WITH_AVX=OFF -t paddle:cpu-noavx -f paddle/scripts/docker/Dockerfile .
docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu . docker build --build-arg WITH_AVX=OFF -t paddle:gpu-noavx -f paddle/scripts/docker/Dockerfile.gpu .
Documentation
-------------
Paddle Docker images include an HTML version of C++ source code
generated using `woboq code browser
<https://github.com/woboq/woboq_codebrowser>`_. This makes it easy
for users to browse and understand the C++ source code.
As long as we give the Paddle Docker container a name, we can run an
additional nginx Docker container to serve the volume from the Paddle
container:
.. code-block:: bash
docker run -d --name paddle-cpu-doc paddle:cpu
docker run -d --volumes-from paddle-cpu-doc -p 8088:80 nginx
Then we can direct our Web browser to the HTML version of source code
at http://localhost:8088/paddle/
...@@ -143,7 +143,7 @@ It looks like there are a lot of arguments. However, most of them are for develo ...@@ -143,7 +143,7 @@ It looks like there are a lot of arguments. However, most of them are for develo
</tr> </tr>
<tr> <tr>
<td class="left" rowspan = "2">testing during training</td><td class="left">test_all_data_in_one_period</td> <td class="left" rowspan = "2">testing during training</td><td class="left">test_period</td>
<td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td> <td class="left"></td><td class="left"></td><td class="left"></td><td class="left"></td>
</tr> </tr>
......
...@@ -31,7 +31,7 @@ ...@@ -31,7 +31,7 @@
- type: string (default: null). - type: string (default: null).
* `--version` * `--version`
- Whether to print version infomatrion. - Whether to print version information.
- type: bool (default: 0). - type: bool (default: 0).
* `--show_layer_stat` * `--show_layer_stat`
...@@ -110,8 +110,8 @@ ...@@ -110,8 +110,8 @@
- type: int32 (default: -1). - type: int32 (default: -1).
* `--test_period` * `--test_period`
- Run testing every test_period train batches. If not set, run testing each pass. - if equal 0, do test on all test data at the end of each pass. While if equal non-zero, do test on all test data every test_period batches.
- type: int32 (default: 1000). - type: int32 (default: 0).
* `--test_wait` * `--test_wait`
- Whether to wait for parameter per pass if not exist. If set test_data_path in submitting environment of cluster, it will launch one process to perfom testing, so we need to set test_wait=1. Note that in the cluster submitting environment, this argument has been set True by default. - Whether to wait for parameter per pass if not exist. If set test_data_path in submitting environment of cluster, it will launch one process to perfom testing, so we need to set test_wait=1. Note that in the cluster submitting environment, this argument has been set True by default.
...@@ -121,10 +121,6 @@ ...@@ -121,10 +121,6 @@
- File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path. - File that saves the model list when testing. It was set automatically when using cluster submitting environment after setting model_path.
- type: string (default: "", null). - type: string (default: "", null).
* `--test_all_data_in_one_period`
- This argument is usually used in testing period during traning. If true, all data will be tested in one test period. Otherwise (batch_size * log_peroid) data will be tested.
- type: bool (default: 0).
* `--predict_output_dir` * `--predict_output_dir`
- Directory that saves the layer output. It is configured in Outputs() in network config. Default, this argument is null, meaning save nothing. Specify this directory if you want to save feature map of some layers in testing mode. Note that, layer outputs are values after activation function. - Directory that saves the layer output. It is configured in Outputs() in network config. Default, this argument is null, meaning save nothing. Specify this directory if you want to save feature map of some layers in testing mode. Note that, layer outputs are values after activation function.
- type: string (default: "", null). - type: string (default: "", null).
......
...@@ -10,9 +10,8 @@ paddle train \ ...@@ -10,9 +10,8 @@ paddle train \
--config=network_config \ --config=network_config \
--save_dir=output \ --save_dir=output \
--trainer_count=COUNT \ #(default:1) --trainer_count=COUNT \ #(default:1)
--test_period=M \ #(default:1000) --test_period=M \ #(default:0)
--test_all_data_in_one_period=true \ #(default:false) --num_passes=N \ #(defalut:100)
--num_passes=N \ #(defalut:100)
--log_period=K \ #(default:100) --log_period=K \ #(default:100)
--dot_period=1000 \ #(default:1) --dot_period=1000 \ #(default:1)
#[--show_parameter_stats_period=100] \ #(default:0) #[--show_parameter_stats_period=100] \ #(default:0)
......
...@@ -36,8 +36,9 @@ If your repository doesn't contain **develop** branch, just create it by your ow ...@@ -36,8 +36,9 @@ If your repository doesn't contain **develop** branch, just create it by your ow
git clone https://github.com/USERNAME/Paddle.git Paddle git clone https://github.com/USERNAME/Paddle.git Paddle
cd Paddle cd Paddle
git checkout -b develop # create develop branch. git checkout -b develop # create develop branch.
git remote add upstream https://github.com/baidu/Paddle.git # add upstream to baidu/Paddle git remote add upstream https://github.com/PaddlePaddle/Paddle.git # add upstream to baidu/Paddle
git pull upstream develop # update to upstream git pull upstream develop # update to upstream
git submodule update --init --recursive
``` ```
Then you can start to develop by making a local developement branch Then you can start to develop by making a local developement branch
...@@ -69,7 +70,7 @@ To do this, you'll need to add a remote at first: ...@@ -69,7 +70,7 @@ To do this, you'll need to add a remote at first:
# see the current configured remote repository # see the current configured remote repository
git remote -v git remote -v
# add upstream repository # add upstream repository
git remote add upstream https://github.com/baidu/Paddle.git git remote add upstream https://github.com/PaddlePaddle/Paddle.git
# verify the new upstream # verify the new upstream
git remote -v git remote -v
``` ```
......
...@@ -22,7 +22,7 @@ AutoStructify = transform.AutoStructify ...@@ -22,7 +22,7 @@ AutoStructify = transform.AutoStructify
# add these directories to sys.path here. If the directory is relative to the # add these directories to sys.path here. If the directory is relative to the
# documentation root, use os.path.abspath to make it absolute, like shown here. # documentation root, use os.path.abspath to make it absolute, like shown here.
sys.path.insert(0, '@PROJ_ROOT@/python') sys.path.insert(0, '@PROJ_ROOT@/python')
templates_path = ["@PROJ_ROOT@/doc/templates"] templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
# -- General configuration ------------------------------------------------ # -- General configuration ------------------------------------------------
...@@ -112,12 +112,12 @@ todo_include_todos = False ...@@ -112,12 +112,12 @@ todo_include_todos = False
# The theme to use for HTML and HTML Help pages. See the documentation for # The theme to use for HTML and HTML Help pages. See the documentation for
# a list of builtin themes. # a list of builtin themes.
#html_theme = 'sphinx_rtd_theme' # sphinx_rtd_theme will cause table bad style html_theme = 'sphinx_rtd_theme'
html_theme = 'classic'
# Add any paths that contain custom static files (such as style sheets) here, # Add any paths that contain custom static files (such as style sheets) here,
# relative to this directory. They are copied after the builtin static files, # relative to this directory. They are copied after the builtin static files,
# so a file named "default.css" will overwrite the builtin "default.css". # so a file named "default.css" will overwrite the builtin "default.css".
html_static_path = ['_static'] html_static_path = ['@PROJ_ROOT@/doc_theme/static']
# Output file base name for HTML help builder. # Output file base name for HTML help builder.
htmlhelp_basename = project + 'doc' htmlhelp_basename = project + 'doc'
......
...@@ -214,3 +214,41 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字 ...@@ -214,3 +214,41 @@ PaddlePaddle的参数使用名字 :code:`name` 作为参数的ID,相同名字
cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path> cmake .. -DPYTHON_EXECUTABLE=<exc_path> -DPYTHON_LIBRARY=<lib_path> -DPYTHON_INCLUDE_DIR=<inc_path>
用户需要指定本机上Python的路径:``<exc_path>``, ``<lib_path>``, ``<inc_path>`` 用户需要指定本机上Python的路径:``<exc_path>``, ``<lib_path>``, ``<inc_path>``
10. A protocol message was rejected because it was too big
----------------------------------------------------------
如果在训练NLP相关模型时,出现以下错误:
.. code-block:: bash
[libprotobuf ERROR google/protobuf/io/coded_stream.cc:171] A protocol message was rejected because it was too big (more than 67108864 bytes). To increase the limit (or to disable these warnings), see CodedInputStream::SetTotalBytesLimit() in google/protobuf/io/coded_stream.h.
F1205 14:59:50.295174 14703 TrainerConfigHelper.cpp:59] Check failed: m->conf.ParseFromString(configProtoStr)
可能的原因是:传给dataprovider的某一个args过大,一般是由于直接传递大字典导致的。错误的define_py_data_sources2类似:
.. code-block:: python
src_dict = dict()
for line_count, line in enumerate(open(src_dict_path, "r")):
src_dict[line.strip()] = line_count
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={"src_dict": src_dict})
解决方案是:将字典的地址作为args传给dataprovider,然后在dataprovider里面根据该地址加载字典。即define_py_data_sources2应改为:
.. code-block:: python
define_py_data_sources2(
train_list,
test_list,
module="dataprovider",
obj="process",
args={"src_dict_path": src_dict_path})
完整源码可参考 `seqToseq <https://github.com/PaddlePaddle/Paddle/tree/develop/demo/seqToseq>`_ 示例。
\ No newline at end of file
DataProvider的介绍
==================
DataProvider是PaddlePaddle负责提供数据的模块。其作用是将数据传入内存或显存,让神经网络可以进行训练或预测。用户可以通过简单使用Python接口 `PyDataProvider2 <pydataprovider2.html>`_ ,来自定义传数据的过程。如果有更复杂的使用,或者需要更高的效率,用户也可以在C++端自定义一个 ``DataProvider`` 。
PaddlePaddle需要用户在网络配置(trainer_config.py)中定义使用哪种DataProvider,并且在DataProvider中实现如何访问训练文件列表(train.list)或测试文件列表(test.list)。
- train.list和test.list存放在本地(推荐直接存放到训练目录,以相对路径引用)。一般情况下,两者均为纯文本文件,其中每一行对应一个数据文件地址:
- 如果数据文件存于本地磁盘,这个地址则为它的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)。
- 地址也可以为hdfs文件路径,或者数据库连接路径等。
- 由于这个地址会被DataProvider使用,因此,如何解析该地址也是用户自定义DataProvider时需要考虑的地方。
- 如果没有设置test.list,或设置为None,那么在训练过程中不会执行测试操作;否则,会根据命令行参数指定的测试方式,在训练过程中进行测试,从而防止过拟合。
PaddlePaddle的数据提供(DataProvider)介绍
========================================
数据提供(DataProvider)是PaddlePaddle负责提供数据的模块。其作用是将训练数据传入内存或者显存,让神经网络可以进行训练。简单的使用,用户可以使用Python的 :code:`PyDataProvider` 来自定义传数据的过程。如果有更复杂的使用,或者需要更高的效率,用户也可以在C++端自定义一个 :code:`DataProvider` 。
PaddlePaddle需要用户在网络配置(trainer_config.py)中定义使用哪种DataProvider及其参数,训练文件列表(train.list)和测试文件列表(test.list)。
其中,train.list和test.list均为本地的两个文件(推荐直接放置到训练目录,以相对路径引用)。如果test.list不设置,或者设置为None,那么在训练过程中,不会执行测试操作。否则,会根据命令行参数指定的测试方式,在训练过程中进行测试,从而防止过拟合。
一般情况下,train.list和test.list为纯文本文件,一行对应一个数据文件,数据文件存放在本地磁盘中。将文件的绝对路径或相对路径(相对于PaddlePaddle程序运行时的路径)写在train.list和test.list中。当然,train.list和test.list也可以放置hdfs文件路径,或者数据库连接地址等等。
用户在DataProvider中需要实现如何访问其中每一个文件。DataProvider的具体用法和如何实现一个新的DataProvider,请参考下述文章:
.. toctree::
pydataprovider2.rst
write_new_dataprovider.rst
...@@ -5,5 +5,6 @@ define_py_data_sources2( ...@@ -5,5 +5,6 @@ define_py_data_sources2(
test_list=None, test_list=None,
module='mnist_provider', module='mnist_provider',
obj='process') obj='process')
img = data_layer(name='pixel', size=784) img = data_layer(name='pixel', size=784)
label = data_layer(name='label', size=10) label = data_layer(name='label', size=10)
from paddle.trainer.PyDataProvider2 import *
# Define a py data provider
@provider(input_types=[dense_vector(28 * 28), integer_value(10)])
def process(settings, filename): # settings is not used currently.
f = open(filename, 'r') # open one of training file
for line in f: # read each line
label, pixel = line.split(';')
# get features and label
pixels_str = pixel.split(' ')
pixels_float = []
for each_pixel_str in pixels_str:
pixels_float.append(float(each_pixel_str))
# give data to paddle.
yield pixels_float, int(label)
f.close() # close file
PyDataProvider2的使用 PyDataProvider2的使用
===================== =====================
PyDataProvider是PaddlePaddle使用Python提供数据的推荐接口。使用该接口用户可以只关注如何 PyDataProvider2是PaddlePaddle使用Python提供数据的推荐接口。该接口使用多线程读取数据,并提供了简单的Cache功能;同时可以使用户只关注如何从文件中读取每一条数据,而不用关心数据如何传输,如何存储等等。
从文件中读取每一条数据,而不用关心数据如何传输给PaddlePaddle,数据如何存储等等。该数据
接口使用多线程读取数据,并提供了简单的Cache功能。 .. contents::
MNIST的使用场景
简单的使用场景 ---------------
--------------
我们以MNIST手写识别为例,来说明PyDataProvider2的简单使用场景。
这里以MNIST手写识别为例,来说明简单的PyDataProvider如何使用。MNIST是一个包含有
70,000张灰度图片的数字分类数据集。对于MNIST而言,标签是0-9的数字,而特征即为 样例数据
28*28的像素灰度值。这里我们使用简单的文本文件表示MNIST图片,样例数据如下。 ++++++++
.. literalinclude:: mnist_train.txt MNIST是一个包含有70,000张灰度图片的数字分类数据集。样例数据 ``mnist_train.txt`` 如下:
其数据使用;间隔,第一段数据为这张图片的label,第二段数据为这个图片的像素值。 .. literalinclude:: mnist_train.txt
首先我们将这个数据文件(例如文件名是'mnist_train.txt')写入train.list。那么
train.list即为 其中每行数据代表一张图片,行内使用 ``;`` 分成两部分。第一部分是图片的标签,为0-9中的一个数字;第二部分是28*28的图片像素灰度值。 对应的 ``train.list`` 即为这个数据文件的名字:
.. literalinclude:: train.list .. literalinclude:: train.list
那么对应的dataprovider既为 dataprovider的使用
++++++++++++++++++
.. literalinclude:: mnist_provider.py
:linenos: .. literalinclude:: mnist_provider.dict.py
其中第一行是引入PaddlePaddle的PyDataProvider2包。主要函数是process函数。process函数 - 首先,引入PaddlePaddle的PyDataProvider2包。
具有两个参数,第一个参数是 settings 。这个参数在这个样例里没有使用,具 - 其次,定义一个Python的 `Decorator <http://www.learnpython.org/en/Decorators>`_ `@provider`_ 。用于将下一行的数据输入函数标记成一个PyDataProvider2,同时设置它的input_types属性。
体可以参考 settings 。第二个参数是filename,这个参数被PaddlePaddle进程传入,为
train.list中的一行(即train.list若干数据文件路径的某一个路径)。 - `input_types`_:设置这个PyDataProvider2返回什么样的数据。本例根据网络配置中 ``data_layer`` 的名字,显式指定返回的是一个28*28维的稠密浮点数向量和一个[0-9]的10维整数标签。
:code:`@provider` 是一个Python的 `Decorator <http://www.learnpython.org/en/Decorators>`_ .. literalinclude:: mnist_config.py
。这行的作用是设置DataProvider的一些属性,并且标记process函数是一个DataProvider。 :lines: 9-10
如果不了解 `Decorator <http://www.learnpython.org/en/Decorators>`_ 是什么也没关系,
只需要知道这只是一个标记属性的方法就可以了。 - 注意:如果用户不显示指定返回数据的对应关系,那么PaddlePaddle会根据layer的声明顺序,来确定对应关系。但这个关系可能不正确,所以推荐使用显式指定的方式来设置input_types。
- 最后,实现数据输入函数(如本例的 ``process`` 函数)。
属性 `input_types`_ 是设置这个DataProvider返回什么样的数据。这里设置的是返回一个
28*28的稠密向量和一个[0-9],10维的整数值。 `input_types`_ 具体可以设置成什么其他格 - 该函数的功能是:打开文本文件,读取每一行,将行中的数据转换成与input_types一致的格式,然后返回给PaddlePaddle进程。注意,
式,请参考 `input_types`_ 的文档。
- 返回的顺序需要和input_types中定义的顺序一致。
process函数是实现数据输入的主函数,在这个函数中,实现了打开文本文件,从文本文件中读取 - 返回时,必须使用Python关键词 ``yield`` ,相关概念是 ``generator`` 。
每一行,并将每行转换成和 `input_types`_ 一致的特征,并在23行返回给PaddlePaddle进程。需要注意 - 一次yield调用,返回一条完整的样本。如果想为一个数据文件返回多条样本,只需要在函数中调用多次yield即可(本例中使用for循环进行多次调用)。
的是, 返回的顺序需要和 `input_types`_ 中定义的顺序一致。
- 该函数具有两个参数:
同时,返回数据在PaddlePaddle中是仅仅返回一条完整的训练样本,并且使用关键词 :code:`yield` 。
在PyDataProvider中,可以为一个数据文件返回多条训练样本(就像这个样例一样),只需要在 - settings:在本例中没有使用,具体可以参考 `init_hook`_ 中的说明。
process函数调用多次 :code:`yield` 即可。 :code:`yield` 是Python的一个关键词,相关的概 - filename:为 ``train.list`` 或 ``test.list`` 中的一行,即若干数据文件路径的某一个。
念是 :code:`generator` 。使用这个关键词,可以在一个函数里,多次返回变量。
网络配置中的调用
在训练配置里,只需要使用一行代码即可以设置训练引用这个DataProvider。这个设置为 ++++++++++++++++
.. literalinclude:: mnist_config.py 在网络配置里,只需要一行代码就可以调用这个PyDataProvider2,如,
这里说明了训练数据是 'train.list',而没有测试数据。引用的DataProvider是 'mnist_provider' .. literalinclude:: mnist_config.py
这个模块中的 'process' 函数。 :lines: 1-7
同时,根据模型配置文件中 :code:`data_layer` 的名字,用户也可以显式指定返回的数据对应关系。例如: 训练数据是 ``train.list`` ,没有测试数据,调用的PyDataProvider2是 ``mnist_provider`` 模块中的 ``process`` 函数。
.. literalinclude:: mnist_provider.dict.py 小结
:linenos: +++++
如果用户不指定返回数据的对应关系,那么PaddlePaddle会粗略的根据layer的声明顺序, 至此,简单的PyDataProvider2样例就说明完毕了。对用户来说,仅需要知道如何从 **一个文件** 中读取 **一条样本** ,就可以将数据传送给PaddlePaddle了。而PaddlePaddle则会帮用户做以下工作:
来确定对应关系。这个对应关系可能不正确。所以推荐使用显式指定返回值和数据对应关系。
* 将数据组合成Batch进行训练
至此,简单的PyDataProvider样例就说明完毕了。对于用户来说,讲数据发送给PaddlePaddle,仅仅需要 * 对训练数据进行Shuffle
知道如何从 **一个文件** 里面读取 **一条** 样本。而PaddlePaddle进程帮助用户做了 * 多线程的数据读取
* 缓存训练数据到内存(可选)
* 将数据组合成Batch训练 * CPU->GPU双缓存
* Shuffle训练数据
* 多线程数据读取 是不是很简单呢?
* 缓存训练数据到内存(可选)
* CPU->GPU双缓存 时序模型的使用场景
------------------
是不是很简单呢? 样例数据
++++++++
序列模型数据提供
---------------- 时序模型是指数据的某一维度是一个序列形式,即包含时间步信息。所谓时间步信息,不一定和时间有关系,只是说明数据的顺序是重要的。例如,文本信息就是一个序列数据。
序列模型是指数据的某一维度是一个序列形式,即包含时间步信息。所谓时间步信息, 本例采用英文情感分类的数据,即将一段英文文本数据,分类成正面情绪和负面情绪两类(用0和1表示)。样例数据 ``sentimental_train.txt`` 如下:
不一定和时间有关系,只是说明数据的顺序是重要的。例如,文本信息就是一个序列
数据。 .. literalinclude:: sentimental_train.txt
这里举例的数据是英文情感分类的数据。数据是给一段英文文本,分类成正面情绪和 dataprovider的使用
负面情绪两类(用0和1表示)。样例数据为 ++++++++++++++++++
.. literalinclude:: sentimental_train.txt 相对MNIST而言,这个dataprovider较复杂,主要原因是增加了初始化机制 `init_hook`_。本例的 ``on_init`` 函数就是根据该机制配置的,它会在dataprovider创建的时候执行。
这里,DataProvider可以是 - 其中 ``input_types`` 和在 `@provider`_ 中配置的效果一致。本例中的输入特征是词ID的序列,因此使用 ``integer_value_sequence`` 类型来设置。
- 将 ``dictionary`` 存入settings对象,在 ``process`` 函数中使用。 dictionary是从网络配置中传入的dict对象,即一个将单词字符串映射到单词ID的字典。
.. literalinclude:: sentimental_provider.py
.. literalinclude:: sentimental_provider.py
这个序列模型比较复杂。主要是增加了初始化机制。其中 :code:`on_init` 函数是使用
`@provider`_ 中的 `init_hook`_ 配置参数配置给DataProvider的。这个函数会在 网络配置中的调用
DataProvider创建的时候执行。这个初始化函数具有如下参数: ++++++++++++++++
* 第一个参数是 settings 对象。 调用这个PyDataProvider2的方法,基本上和MNIST样例一致,除了
* 其他参数均使用key word argument形式传入。有部分参数是Paddle自动生成的,
参考 `init_hook`_ 。这里的 :code:`dictionary` 是从训练配置传入的dict对象。 * 在配置中需要读取外部字典。
即从单词字符串到单词id的字典。 * 在声明DataProvider的时候传入dictionary作为参数。
传入这个变量的方式为 .. literalinclude:: sentimental_config.py
:emphasize-lines: 12-14
.. literalinclude:: sentimental_config.py
参考(Reference)
这个声明基本上和mnist的样例一致。除了 ---------------
* 在配置中读取了字典 @provider
* 在声明DataProvider的时候传入了dictionary作为参数。 +++++++++
在 :code:`on_init` 函数中,配置了 `input_types` 。这个和在 `@provider`_ 中配置 ``@provider`` 是一个Python的 `Decorator`_ ,可以将某一个函数标记成一个PyDataProvider2。如果不了解 `Decorator`_ 是什么也没关系,只需知道这是一个标记属性的方法就可以了。它包含的属性参数如下:
`input_types` 效果一致,但是在 `on_init` 中配置 `input_types` 是在运行时执行的,所以
可以根据不同的数据配置不同的输入类型。这里的输入特征是词id的序列,所以将 :code:`seq_type` * input_types:数据输入格式。具体的格式说明,请参考 `input_types`_ 。
设置成了序列(同时,也可以使用 :code:`integer_sequence` 类型来设置)。 * should_shuffle:是不是要对数据做Shuffle。训练时默认shuffle,测试时默认不shuffle。
* min_pool_size:设置内存中最小暂存的数据条数,也是PaddlePaddle所能够保证的shuffle粒度。如果为-1,则会预先读取全部数据到内存中。
同时,将字典存入了settings 对象。这个字典可以在 :code:`process` 函数中使用。 :code:`process` * pool_size: 设置内存中暂存的数据条数。如果为-1(默认),则不在乎内存暂存多少条数据。如果设置,则推荐大于训练时batch size的值,并且在内存足够的情况下越大越好。
函数中的 settings 和 :code:`on_init` 中的settings 是同一个对象。 * can_over_batch_size:是否允许暂存略微多余pool_size的数据。由于这样做可以避免很多死锁问题,一般推荐设置成True。
* calc_batch_size:可以传入一个函数,用于自定义每条数据的batch size(默认为1)。
而在 :code:`process` 函数中,基本的处理逻辑也和mnist逻辑一致。依次返回了文件中的每条数据。 * cache: 数据缓存的策略,具体请参考 `cache`_ 。
* init_hook:初始化时调用的函数,具体请参考 `init_hook`_ 。
至此,基本的PyDataProvider使用介绍完毕了。具体DataProvider还具有什么功能,请参考下节reference。 * check:如果为true,会根据input_types检查数据的合法性。
* check_fail_continue:如果为true,那么当check出数据不合法时,会扔到这条数据,继续训练或预测。(对check=false的情况,没有作用)
参考(Reference)
--------------- input_types
+++++++++++
@provider
+++++++++ PaddlePaddle的数据包括四种主要类型,和三种序列模式。
:code:`@provider` 是一个Python的 `Decorator`_ ,他可以将某一个函数标记成一个PyDataProvider。它包含的参数有: 四种数据类型:
* `input_types`_ 是数据输入格式。具体有哪些格式,参考 `input_types`_ 。 * dense_vector:稠密的浮点数向量。
* should_shuffle 是个DataProvider是不是要做shuffle,如果不设置的话,训练的时候默认shuffle, * sparse_binary_vector:稀疏的01向量,即大部分值为0,但有值的地方必须为1。
测试的时候默认不shuffle。 * sparse_float_vector:稀疏的向量,即大部分值为0,但有值的部分可以是任何浮点数。
* min_pool_size 是设置DataProvider在内存中最小暂存的数据条数。这个也是PaddlePaddle所能够保证的shuffle粒度。 * integer:整数标签。
设置成-1的话,会预先读取全部数据到内存中。
* pool_size 是设置DataProvider在内存中暂存的数据条数。设置成-1的话,即不在乎内存暂存多少条数据。 三种序列模式:
* can_over_batch_size 表示是否允许Paddle暂存略微多余pool_size的数据。这样做可以避免很多死锁问题。
一般推荐设置成True * SequenceType.NO_SEQUENCE:不是一条序列
* calc_batch_size 传入的是一个函数,这个函数以一条数据为参数,返回batch_size的大小。默认情况下一条数据 * SequenceType.SEQUENCE:是一条时间序列
是一个batch size,但是有时为了计算均衡性,可以将一条数据设置成多个batch size * SequenceType.SUB_SEQUENCE: 是一条时间序列,且序列的每一个元素还是一个时间序列。
* cache 是数据缓存的策略,参考 `cache`_
* init_hook 是初始化时调用的函数,参考 `init_hook`_ 不同的数据类型和序列模式返回的格式不同,列表如下:
* check 设置成true的话,会根据input_types检查数据的合法性。
* check_fail_continue 如果设置成true的话,即使在check中数据不合法,也会扔到这条数据,继续训练。 如果 +----------------------+---------------------+-----------------------------------+------------------------------------------------+
check是false的话,没有作用。 | | NO_SEQUENCE | SEQUENCE | SUB_SEQUENCE |
+======================+=====================+===================================+================================================+
input_types | dense_vector | [f, f, ...] | [[f, ...], [f, ...], ...] | [[[f, ...], ...], [[f, ...], ...],...] |
+++++++++++ +----------------------+---------------------+-----------------------------------+------------------------------------------------+
| sparse_binary_vector | [i, i, ...] | [[i, ...], [i, ...], ...] | [[[i, ...], ...], [[i, ...], ...],...] |
PaddlePaddle的数据包括四种主要类型,和三种序列模式。其中,四种数据类型是 +----------------------+---------------------+-----------------------------------+------------------------------------------------+
| sparse_float_vector | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] |
* dense_vector 表示稠密的浮点数向量。 +----------------------+---------------------+-----------------------------------+------------------------------------------------+
* sparse_binary_vector 表示稀疏的零一向量,即大部分值为0,有值的位置只能取1 | integer_value | i | [i, i, ...] | [[i, ...], [i, ...], ...] |
* sparse_float_vector 表示稀疏的向量,即大部分值为0,有值的部分可以是任何浮点数 +----------------------+---------------------+-----------------------------------+------------------------------------------------+
* integer 表示整数标签。
其中,f代表一个浮点数,i代表一个整数。
而三种序列模式为
注意:对sparse_binary_vector和sparse_float_vector,PaddlePaddle存的是有值位置的索引。例如,
* SequenceType.NO_SEQUENCE 即不是一条序列
* SequenceType.SEQUENCE 即是一条时间序列 - 对一个5维非序列的稀疏01向量 ``[0, 1, 1, 0, 0]`` ,类型是sparse_binary_vector,返回的是 ``[1, 2]`` 。
* SequenceType.SUB_SEQUENCE 即是一条时间序列,且序列的每一个元素还是一个时间序列。 - 对一个5维非序列的稀疏浮点向量 ``[0, 0.5, 0.7, 0, 0]`` ,类型是sparse_float_vector,返回的是 ``[(1, 0.5), (2, 0.7)]`` 。
不同的数据类型和序列模式返回的格式不同,列表如下 init_hook
+++++++++
+----------------------+---------------------+-----------------------------------+------------------------------------------------+
| | NO_SEQUENCE | SEQUENCE | SUB_SEQUENCE | init_hook可以传入一个函数。该函数在初始化的时候会被调用,其参数如下:
+======================+=====================+===================================+================================================+
| dense_vector | [f, f, ...] | [[f, ...], [f, ...], ...] | [[[f, ...], ...], [[f, ...], ...],...] | * 第一个参数是settings对象,它和数据传入函数的第一个参数(如本例中 ``process`` 函数的 ``settings`` 参数)必须一致。该对象具有以下两个属性:
+----------------------+---------------------+-----------------------------------+------------------------------------------------+ * settings.input_types:数据输入格式,具体请参考 `input_types`_ 。
| sparse_binary_vector | [i, i, ...] | [[i, ...], [i, ...], ...] | [[[i, ...], ...], [[i, ...], ...],...] | * settings.logger:一个logging对象。
+----------------------+---------------------+-----------------------------------+------------------------------------------------+ * 其他参数使用 ``kwargs`` (key word arguments)传入,包括以下两种:
| sparse_float_vector | [(i,f), (i,f), ...] | [[(i,f), ...], [(i,f), ...], ...] | [[[(i,f), ...], ...], [[(i,f), ...], ...],...] | * PaddlePaddle定义的参数: 1)is_train:bool型参数,表示用于训练或预测;2)file_list:所有文件列表。
+----------------------+---------------------+-----------------------------------+------------------------------------------------+ * 用户定义的参数:使用args在网络配置中设置。
| integer_value | i | [i, i, ...] | [[i, ...], [i, ...], ...] |
+----------------------+---------------------+-----------------------------------+------------------------------------------------+ 注意:PaddlePaddle保留添加参数的权力,因此init_hook尽量使用 ``**kwargs`` 来接受未使用的参数以保证兼容性。
其中,f代表一个浮点数,i代表一个整数。 cache
+++++
init_hook
+++++++++ PyDataProvider2提供了两种简单的Cache策略:
init_hook可以传入一个函数。这个函数在初始化的时候会被调用。这个函数的参数是: * CacheType.NO_CACHE:不缓存任何数据,每次都会从python端读取数据
* CacheType.CACHE_PASS_IN_MEM:第一个pass会从python端读取数据,剩下的pass会直接从内存里
* 第一个参数是 settings 对象。这个对象和process的第一个参数一致。具有的属性有 读取数据。
* settings.input_types 设置输入类型。参考 `input_types`_
* settings.logger 一个logging对象
* 其他参数都使用key word argument传入。这些参数包括paddle定义的参数,和用户传入的参数。 注意事项
* Paddle定义的参数包括: --------
* is_train bool参数,表示这个DataProvider是训练用的DataProvider或者测试用的
DataProvider 可能的内存泄露问题
* file_list 所有文件列表。 ++++++++++++++++++
* 用户定义的参数使用args在训练配置中设置。
PaddlePaddle将train.list中的每一行都传递给process函数,从而生成多个generator。当训练数据非常多时,就会生成非常多的generator。
注意,PaddlePaddle保留添加参数的权力,所以init_hook尽量使用 :code:`**kwargs` , 来接受不使用的
函数来保证兼容性。 虽然每个generator在没有调用的时候,是几乎不占内存的;但当调用过一次后,generator便会存下当前的上下文(Context),而这个Context可能会非常大。并且,generator至少需要调用两次才会知道是否停止。所以,即使process函数里面只有一个yield,也需要两次随机选择到相同generator的时候,才会释放该段内存。
cache .. code-block:: python
+++++
def func():
DataProvider提供了两种简单的Cache策略。他们是 yield 0
* CacheType.NO_CACHE 不缓存任何数据,每次都会从python端读取数据 f = func() # 创建generator
* CacheType.CACHE_PASS_IN_MEM 第一个pass会从python端读取数据,剩下的pass会直接从内存里 tmp = next(f) # 调用一次,返回0
读取数据。 tmp = next(f) # 调用第二次的时候,才会Stop Iteration
由于顺序调用这些generator不会出现上述问题,因此有两种解决方案:
注意事项
-------- 1. **最佳推荐**:将样本的地址放入另一个文本文件,train.list写入那个文本文件的地址。即不要将每一个样本都放入train.list。
2. 在generator的上下文中尽量留下非常少的变量引用,例如
可能的内存泄露问题
++++++++++++++++++ .. code-block:: python
PaddlePaddle将train.list中的每一行,都传递给process函数,从而生成多个generator。 def real_process(fn):
即如果train.list中,有100个训练文件,即会生成100个generator。这个本身不是一个很 # ... read from fn
严重的问题。 return result # 当函数返回的时候,python可以解除掉内部变量的引用。
但是,如果在训练时,每一条训练数据都是一个文件,并且,训练数据非常多的情况下,就 def process(fn):
会生成多个generator。每个generator在没有调用的时候,是几乎不占内存的。但是,当调 yield real_process(fn)
用过一次的时候,generator便会存下当前的上下文(Context)。而这个Context可能会非常
大。并且,generator至少调用两次才会知道是否停止。所以,即使在process里面只会有一 注意:这个问题是PyDataProvider读数据时候的逻辑问题,很难整体修正。
个yield,也需要两次随机选择到同样的generator的时候,才会释放该段内存。
内存不够用的情况
.. code-block:: python ++++++++++++++++
def func(): PyDataProvider2会尽可能多的使用内存。因此,对于内存较小的机器,推荐使用 ``pool_size`` 变量来设置内存中暂存的数据条。具体请参考 `@provider`_ 中的说明。
yield 0
f = func() # 创建generator
tmp = next(f) # 调用一次,返回0
tmp = next(f) # 调用第二次的时候,才会Stop Iteration
而如果按顺序调用这些generator就不会出现这个问题。
所以最佳实践推荐不要将每一个样本都放入train.list。而是将样本的地址放入另一个文本
文件,train.list写入那个文本文件的地址。 或者在python generator的上下文中尽量留
下非常少的变量引用。例如
.. code-block:: python
def real_process(fn):
# ... read from fn
return result # 当函数返回的时候,python可以解除掉内部变量的引用。
def process(fn):
yield real_process(fn)
这个问题是PyDataProvider读数据时候的逻辑问题,基本上不能整体修正。
内存不够用的情况
++++++++++++++++
PyDataProvider2会尽量使用内存。所以如果对于内存比较小的机器,推荐设置
:code:`pool_size` 变量,而这个变量推荐大于训练的batch size,并且在内存足够
的情况下越大越好。
...@@ -8,19 +8,16 @@ def on_init(settings, dictionary, **kwargs): ...@@ -8,19 +8,16 @@ def on_init(settings, dictionary, **kwargs):
# set input types in runtime. It will do the same thing as # set input types in runtime. It will do the same thing as
# @provider(input_types) will do, but it is set dynamically during runtime. # @provider(input_types) will do, but it is set dynamically during runtime.
settings.input_types = [ settings.input_types = {
# The text is a sequence of integer values, and each value is a word id. # The text is a sequence of integer values, and each value is a word id.
# The whole sequence is the sentences that we want to predict its # The whole sequence is the sentences that we want to predict its
# sentimental. # sentimental.
integer_value( 'data': integer_value_sequence(len(dictionary)), # text input
len(dictionary), seq_type=SequenceType), # text input 'label': integer_value(2) # label positive/negative
}
# label positive/negative # save dictionary as settings.dictionary.
integer_value(2) # It will be used in process method.
]
# save dictionary as settings.dictionary. It will be used in process
# method.
settings.dictionary = dictionary settings.dictionary = dictionary
......
自定义一个DataProvider
====================
TBD
\ No newline at end of file
...@@ -8,8 +8,8 @@ ...@@ -8,8 +8,8 @@
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
data_provider/index.rst data_provider/dataprovider.rst
data_provider/pydataprovider2.rst
命令及命令行参数 命令及命令行参数
================ ================
...@@ -23,9 +23,8 @@ ...@@ -23,9 +23,8 @@
* `参数分类 <../../doc/ui/cmd_argument/argument_outline.html>`_ * `参数分类 <../../doc/ui/cmd_argument/argument_outline.html>`_
* `参数描述 <../../doc/ui/cmd_argument/detail_introduction.html>`_ * `参数描述 <../../doc/ui/cmd_argument/detail_introduction.html>`_
预测 预测
==== =======
.. toctree:: .. toctree::
:maxdepth: 1 :maxdepth: 1
......
/* Page base: reserve space under the 80px fixed .site-header and
   suppress any theme-supplied background image. */
body {
padding-top: 80px;
background-image: none !important;
font-family: Roboto;
}
/* All link states share the site's primary blue (no visited/hover variation). */
a, a:focus, a:hover, a:visited {
color: #597cf1;
}
.site-header {
position: fixed;
top: 0;
width: 100%;
left: 0;
z-index: 99;
background: #333;
height: 80px;
display: -webkit-flex;
display: -ms-flex;
display: -o-flex;
display: flex;
flex-flow: row nowrap;
justify-content: space-between;
box-shadow: #ccc 0 3px 3px;
}
.site-header > div {
height: 80px;
display: inline-block;
background-color: #2f323a;
padding: 0 30px;
}
.site-header .site-logo {
line-height: 80px;
width: 290px;
flex: 0 1 290px;
}
.site-header .site-logo > a {
display: inline-block;
width: 230px;
}
.site-header .site-nav-links {
flex: 0 1 100%;
}
.site-header .site-nav-links .site-menu {
height: 30px;
line-height: 30px;
font-size: 12px;
background: -webkit-linear-gradient(#282b33, #2f323a);
background: -o-linear-gradient(#282b33, #2f323a);
background: -moz-linear-gradient(#282b33, #2f323a);
background: linear-gradient(to left, #282b33, #2f323a);
margin-right: -30px;
padding-right: 30px;
}
.site-header .site-nav-links .site-menu .site-page-links {
display: inline-block;
float: right;
margin-right: 20px;
}
.site-header .site-nav-links .site-menu .site-page-links> li {
display: inline-block;
float: left;
}
.site-header .site-nav-links .site-menu .site-page-links > li > a {
color: #a7adbd;
display: inline-block;
height: 30px;
padding: 0 20px;
font-size: 12px;
}
.site-header .site-nav-links .site-menu .site-page-links > li:hover > a,
.site-header .site-nav-links .site-menu .site-page-links > li.active > a {
background-color: #2f323a;
color: #bcc1d0;
}
.site-header .site-nav-links .site-menu .site-page-links > li.active > a {
font-weight: bold;
}
.site-header .site-nav-links .site-menu .fork-on-github {
color: #597cf1;
line-height: 30px;
display: inline-block;
padding: 0 0 0 20px;
float: right;
position: relative;
}
.site-header .site-nav-links .site-menu .fork-on-github .fa {
margin-right: 5px;
font-size: 16px;
vertical-align: middle;
}
.site-header .site-nav-links .site-menu .language-switcher {
height: 30px;
display: inline-block;
float: right;
line-height: 30px;
padding: 0 20px;
position: relative;
}
.site-header .site-nav-links .site-menu .language-switcher > a {
color: #a7adbd;
}
.site-header .site-nav-links .site-menu .language-switcher.open > a {
background-color: #24272f;
color: #bcc1d0;
}
.site-header .site-nav-links .site-menu .language-switcher .fa {
margin-left: 5px;
}
.site-header .site-nav-links .site-menu .language-switcher .fa-angle-down {
display: inline;
}
.site-header .site-nav-links .site-menu .language-switcher.open .fa-angle-down {
display: none;
}
.site-header .site-nav-links .site-menu .language-switcher .fa-angle-up {
display: none;
}
.site-header .site-nav-links .site-menu .language-switcher.open .fa-angle-up {
display: inline;
}
.site-header .site-nav-links .site-menu .fork-on-github:before,
.site-header .site-nav-links .site-menu .language-switcher:before {
width: 1px;
height: 12px;
top: 9px;
background-color: #3a3d47;
left: 0;
display: inline-block;
position: absolute;
content: "";
}
.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu {
display: none;
position: absolute;
box-shadow: #ccc 0 0 5px;
background-color: #fff;
width: 100%;
left: 0;
top: 30px;
}
.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li {
line-height: 30px;
padding: 0 20px;
}
.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li:hover {
background-color: #f7f8fe;
}
.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li + li {
border-top: 1px solid #dedfe5;
}
.site-header .site-nav-links .site-menu .language-switcher .dropdown-menu > li > a {
color: #2f323a;
}
.site-header .site-nav-links .site-menu .language-switcher.open .dropdown-menu {
display: inline-block;
}
.site-header .site-nav-links .doc-module {
display: block;
height: 50px;
line-height: 50px;
}
.site-header .site-nav-links .doc-module > ul > li {
display: inline-block;
float: left;
}
.site-header .site-nav-links .doc-module > ul > li > a {
color: #c9cbd0;
font-size: 14px;
display: inline-block;
height: 50px;
line-height: 50px;
border-bottom: 2px solid transparent;
padding: 0 20px;
}
.site-header .site-nav-links .doc-module > ul > li:hover > a {
color: #fff;
}
.site-header .site-nav-links .doc-module > ul > li.current > a {
border-bottom-color: #fff;
color: #fff;
}
.site-header .site-nav-links .doc-module [role="search"]{
float: right;
}
.site-header .site-nav-links .doc-module [role="search"] input {
background-color: #3a3d47;
border-radius: 15px;
color: #a7adbd;
border: 1px solid transparent;
padding: 6px 15px;
width: 180px;
box-shadow: none;
transition: all .2s;
-webkit-transition: all .2s;
-moz-transition: all .2s;
-o-transition: all .2s;
background-repeat: no-repeat;
background-position: 145px center;
background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAKTWlDQ1BQaG90b3Nob3AgSUNDIHByb2ZpbGUAAHjanVN3WJP3Fj7f92UPVkLY8LGXbIEAIiOsCMgQWaIQkgBhhBASQMWFiApWFBURnEhVxILVCkidiOKgKLhnQYqIWotVXDjuH9yntX167+3t+9f7vOec5/zOec8PgBESJpHmomoAOVKFPDrYH49PSMTJvYACFUjgBCAQ5svCZwXFAADwA3l4fnSwP/wBr28AAgBw1S4kEsfh/4O6UCZXACCRAOAiEucLAZBSAMguVMgUAMgYALBTs2QKAJQAAGx5fEIiAKoNAOz0ST4FANipk9wXANiiHKkIAI0BAJkoRyQCQLsAYFWBUiwCwMIAoKxAIi4EwK4BgFm2MkcCgL0FAHaOWJAPQGAAgJlCLMwAIDgCAEMeE80DIEwDoDDSv+CpX3CFuEgBAMDLlc2XS9IzFLiV0Bp38vDg4iHiwmyxQmEXKRBmCeQinJebIxNI5wNMzgwAABr50cH+OD+Q5+bk4eZm52zv9MWi/mvwbyI+IfHf/ryMAgQAEE7P79pf5eXWA3DHAbB1v2upWwDaVgBo3/ldM9sJoFoK0Hr5i3k4/EAenqFQyDwdHAoLC+0lYqG9MOOLPv8z4W/gi372/EAe/tt68ABxmkCZrcCjg/1xYW52rlKO58sEQjFu9+cj/seFf/2OKdHiNLFcLBWK8ViJuFAiTcd5uVKRRCHJleIS6X8y8R+W/QmTdw0ArIZPwE62B7XLbMB+7gECiw5Y0nYAQH7zLYwaC5EAEGc0Mnn3AACTv/mPQCsBAM2XpOMAALzoGFyolBdMxggAAESggSqwQQcMwRSswA6cwR28wBcCYQZEQAwkwDwQQgbkgBwKoRiWQRlUwDrYBLWwAxqgEZrhELTBMTgN5+ASXIHrcBcGYBiewhi8hgkEQcgIE2EhOogRYo7YIs4IF5mOBCJhSDSSgKQg6YgUUSLFyHKkAqlCapFdSCPyLXIUOY1cQPqQ28ggMor8irxHMZSBslED1AJ1QLmoHxqKxqBz0XQ0D12AlqJr0Rq0Hj2AtqKn0UvodXQAfYqOY4DRMQ5mjNlhXIyHRWCJWBomxxZj5Vg1Vo81Yx1YN3YVG8CeYe8IJAKLgBPsCF6EEMJsgpCQR1hMWEOoJewjtBK6CFcJg4Qxwicik6hPtCV6EvnEeGI6sZBYRqwm7iEeIZ4lXicOE1+TSCQOyZLkTgohJZAySQtJa0jbSC2kU6Q+0hBpnEwm65Btyd7kCLKArCCXkbeQD5BPkvvJw+S3FDrFiOJMCaIkUqSUEko1ZT/lBKWfMkKZoKpRzame1AiqiDqfWkltoHZQL1OHqRM0dZolzZsWQ8ukLaPV0JppZ2n3aC/pdLoJ3YMeRZfQl9Jr6Afp5+mD9HcMDYYNg8dIYigZaxl7GacYtxkvmUymBdOXmchUMNcyG5lnmA+Yb1VYKvYqfBWRyhKVOpVWlX6V56pUVXNVP9V5qgtUq1UPq15WfaZGVbNQ46kJ1Bar1akdVbupNq7OUndSj1DPUV+jvl/9gvpjDbKGhUaghkijVGO3xhmNIRbGMmXxWELWclYD6yxrmE1iW7L57Ex2Bfsbdi97TFNDc6pmrGaRZp3mcc0BDsax4PA52ZxKziHODc57LQMtPy2x1mqtZq1+rTfaetq+2mLtcu0W7eva73VwnUCdLJ31Om0693UJuja6UbqFutt1z+o+02PreekJ9cr1Dund0Uf1bfSj9Rfq79bv0R83MDQINpAZbDE4Y/DMkGPoa5hpuNHwhOGoEctoupHEaKPRSaMnuCbuh2fjNXgXPmasbxxirDTeZdxrPGFiaTLbpMSkxeS+Kc2Ua5pmutG003TMzMgs3KzYrMnsjjnVnGueYb7
ZvNv8jYWlRZzFSos2i8eW2pZ8ywWWTZb3rJhWPlZ5VvVW16xJ1lzrLOtt1ldsUBtXmwybOpvLtqitm63Edptt3xTiFI8p0in1U27aMez87ArsmuwG7Tn2YfYl9m32zx3MHBId1jt0O3xydHXMdmxwvOuk4TTDqcSpw+lXZxtnoXOd8zUXpkuQyxKXdpcXU22niqdun3rLleUa7rrStdP1o5u7m9yt2W3U3cw9xX2r+00umxvJXcM970H08PdY4nHM452nm6fC85DnL152Xlle+70eT7OcJp7WMG3I28Rb4L3Le2A6Pj1l+s7pAz7GPgKfep+Hvqa+It89viN+1n6Zfgf8nvs7+sv9j/i/4XnyFvFOBWABwQHlAb2BGoGzA2sDHwSZBKUHNQWNBbsGLww+FUIMCQ1ZH3KTb8AX8hv5YzPcZyya0RXKCJ0VWhv6MMwmTB7WEY6GzwjfEH5vpvlM6cy2CIjgR2yIuB9pGZkX+X0UKSoyqi7qUbRTdHF09yzWrORZ+2e9jvGPqYy5O9tqtnJ2Z6xqbFJsY+ybuIC4qriBeIf4RfGXEnQTJAntieTE2MQ9ieNzAudsmjOc5JpUlnRjruXcorkX5unOy553PFk1WZB8OIWYEpeyP+WDIEJQLxhP5aduTR0T8oSbhU9FvqKNolGxt7hKPJLmnVaV9jjdO31D+miGT0Z1xjMJT1IreZEZkrkj801WRNberM/ZcdktOZSclJyjUg1plrQr1zC3KLdPZisrkw3keeZtyhuTh8r35CP5c/PbFWyFTNGjtFKuUA4WTC+oK3hbGFt4uEi9SFrUM99m/ur5IwuCFny9kLBQuLCz2Lh4WfHgIr9FuxYji1MXdy4xXVK6ZHhp8NJ9y2jLspb9UOJYUlXyannc8o5Sg9KlpUMrglc0lamUycturvRauWMVYZVkVe9ql9VbVn8qF5VfrHCsqK74sEa45uJXTl/VfPV5bdra3kq3yu3rSOuk626s91m/r0q9akHV0IbwDa0b8Y3lG19tSt50oXpq9Y7NtM3KzQM1YTXtW8y2rNvyoTaj9nqdf13LVv2tq7e+2Sba1r/dd3vzDoMdFTve75TsvLUreFdrvUV99W7S7oLdjxpiG7q/5n7duEd3T8Wej3ulewf2Re/ranRvbNyvv7+yCW1SNo0eSDpw5ZuAb9qb7Zp3tXBaKg7CQeXBJ9+mfHvjUOihzsPcw83fmX+39QjrSHkr0jq/dawto22gPaG97+iMo50dXh1Hvrf/fu8x42N1xzWPV56gnSg98fnkgpPjp2Snnp1OPz3Umdx590z8mWtdUV29Z0PPnj8XdO5Mt1/3yfPe549d8Lxw9CL3Ytslt0utPa49R35w/eFIr1tv62X3y+1XPK509E3rO9Hv03/6asDVc9f41y5dn3m978bsG7duJt0cuCW69fh29u0XdwruTNxdeo94r/y+2v3qB/oP6n+0/rFlwG3g+GDAYM/DWQ/vDgmHnv6U/9OH4dJHzEfVI0YjjY+dHx8bDRq98mTOk+GnsqcTz8p+Vv9563Or59/94vtLz1j82PAL+YvPv655qfNy76uprzrHI8cfvM55PfGm/K3O233vuO+638e9H5ko/ED+UPPR+mPHp9BP9z7nfP78L/eE8/sl0p8zAAAAIGNIUk0AAHolAACAgwAA+f8AAIDpAAB1MAAA6mAAADqYAAAXb5JfxUYAAAEpSURBVHjanNO7K8dhFMfx1w8LBqVM5DLxF7hMTGSQpAwmJSkDizAZLSb5Ayi3clsMFgwWISGXkoSyGYRSym15fvr27duvH5/leTqd8+6c83ye1NLatohqMIgWVOEV+5jDAr7ElBO5j+IIH+hBJRqwjDHsoTQOyAvnCPpRi4tYziVmMY2dkPMc7aAG42hPKE7rAwMBNhEfYQgzOJNZ3xhGL4qigGasyk43OEdjFFCGe9nrNtT8Al5Q8AdAMd6jgFPU/QFwiN0oYD4sJzdLwBiuo4A5vGEKqQy
F1ahPcuInOsJrrKMiwWx9OMAWWpOc+BD2MImr4Ik7FIb4AzqRH6zdhU1IxT4TlKAJ5XjCMU6CkaANi2lIXsKsj1jJsIsNdKc7yfE/pSGTPwMABBFCGflm+rsAAAAASUVORK5CYII=");
}
.site-header .site-nav-links .doc-module [role="search"] input:focus {
width: 300px;
}
.site-header .site-nav-links .doc-module [role="search"] input:focus {
background-position: 265px center;
}
.site-header .site-nav-links .doc-module [role="search"] input:hover,
.site-header .site-nav-links .doc-module [role="search"] input:focus {
color: #fff;
border-color: #597cf1;
background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAABAAAAAQCAYAAAAf8/9hAAAACXBIWXMAAA7EAAAOxAGVKw4bAAAKTWlDQ1BQaG90b3Nob3AgSUNDIHByb2ZpbGUAAHjanVN3WJP3Fj7f92UPVkLY8LGXbIEAIiOsCMgQWaIQkgBhhBASQMWFiApWFBURnEhVxILVCkidiOKgKLhnQYqIWotVXDjuH9yntX167+3t+9f7vOec5/zOec8PgBESJpHmomoAOVKFPDrYH49PSMTJvYACFUjgBCAQ5svCZwXFAADwA3l4fnSwP/wBr28AAgBw1S4kEsfh/4O6UCZXACCRAOAiEucLAZBSAMguVMgUAMgYALBTs2QKAJQAAGx5fEIiAKoNAOz0ST4FANipk9wXANiiHKkIAI0BAJkoRyQCQLsAYFWBUiwCwMIAoKxAIi4EwK4BgFm2MkcCgL0FAHaOWJAPQGAAgJlCLMwAIDgCAEMeE80DIEwDoDDSv+CpX3CFuEgBAMDLlc2XS9IzFLiV0Bp38vDg4iHiwmyxQmEXKRBmCeQinJebIxNI5wNMzgwAABr50cH+OD+Q5+bk4eZm52zv9MWi/mvwbyI+IfHf/ryMAgQAEE7P79pf5eXWA3DHAbB1v2upWwDaVgBo3/ldM9sJoFoK0Hr5i3k4/EAenqFQyDwdHAoLC+0lYqG9MOOLPv8z4W/gi372/EAe/tt68ABxmkCZrcCjg/1xYW52rlKO58sEQjFu9+cj/seFf/2OKdHiNLFcLBWK8ViJuFAiTcd5uVKRRCHJleIS6X8y8R+W/QmTdw0ArIZPwE62B7XLbMB+7gECiw5Y0nYAQH7zLYwaC5EAEGc0Mnn3AACTv/mPQCsBAM2XpOMAALzoGFyolBdMxggAAESggSqwQQcMwRSswA6cwR28wBcCYQZEQAwkwDwQQgbkgBwKoRiWQRlUwDrYBLWwAxqgEZrhELTBMTgN5+ASXIHrcBcGYBiewhi8hgkEQcgIE2EhOogRYo7YIs4IF5mOBCJhSDSSgKQg6YgUUSLFyHKkAqlCapFdSCPyLXIUOY1cQPqQ28ggMor8irxHMZSBslED1AJ1QLmoHxqKxqBz0XQ0D12AlqJr0Rq0Hj2AtqKn0UvodXQAfYqOY4DRMQ5mjNlhXIyHRWCJWBomxxZj5Vg1Vo81Yx1YN3YVG8CeYe8IJAKLgBPsCF6EEMJsgpCQR1hMWEOoJewjtBK6CFcJg4Qxwicik6hPtCV6EvnEeGI6sZBYRqwm7iEeIZ4lXicOE1+TSCQOyZLkTgohJZAySQtJa0jbSC2kU6Q+0hBpnEwm65Btyd7kCLKArCCXkbeQD5BPkvvJw+S3FDrFiOJMCaIkUqSUEko1ZT/lBKWfMkKZoKpRzame1AiqiDqfWkltoHZQL1OHqRM0dZolzZsWQ8ukLaPV0JppZ2n3aC/pdLoJ3YMeRZfQl9Jr6Afp5+mD9HcMDYYNg8dIYigZaxl7GacYtxkvmUymBdOXmchUMNcyG5lnmA+Yb1VYKvYqfBWRyhKVOpVWlX6V56pUVXNVP9V5qgtUq1UPq15WfaZGVbNQ46kJ1Bar1akdVbupNq7OUndSj1DPUV+jvl/9gvpjDbKGhUaghkijVGO3xhmNIRbGMmXxWELWclYD6yxrmE1iW7L57Ex2Bfsbdi97TFNDc6pmrGaRZp3mcc0BDsax4PA52ZxKziHODc57LQMtPy2x1mqtZq1+rTfaetq+2mLtcu0W7eva73VwnUCdLJ31Om0693UJuja6UbqFutt1z+o+02PreekJ9cr1Dund0Uf1bfSj9Rfq79bv0R83MDQINpAZbDE4Y/DMkGPoa5hpuNHwhOGoEctoupHEaKPRSaMnuCbuh2fjNXgXPmasbxxirDTeZdxrPGFiaTLbpMSkxeS+Kc2Ua5pmutG003TMzMgs3KzYrMnsjjnVnGueYb7
ZvNv8jYWlRZzFSos2i8eW2pZ8ywWWTZb3rJhWPlZ5VvVW16xJ1lzrLOtt1ldsUBtXmwybOpvLtqitm63Edptt3xTiFI8p0in1U27aMez87ArsmuwG7Tn2YfYl9m32zx3MHBId1jt0O3xydHXMdmxwvOuk4TTDqcSpw+lXZxtnoXOd8zUXpkuQyxKXdpcXU22niqdun3rLleUa7rrStdP1o5u7m9yt2W3U3cw9xX2r+00umxvJXcM970H08PdY4nHM452nm6fC85DnL152Xlle+70eT7OcJp7WMG3I28Rb4L3Le2A6Pj1l+s7pAz7GPgKfep+Hvqa+It89viN+1n6Zfgf8nvs7+sv9j/i/4XnyFvFOBWABwQHlAb2BGoGzA2sDHwSZBKUHNQWNBbsGLww+FUIMCQ1ZH3KTb8AX8hv5YzPcZyya0RXKCJ0VWhv6MMwmTB7WEY6GzwjfEH5vpvlM6cy2CIjgR2yIuB9pGZkX+X0UKSoyqi7qUbRTdHF09yzWrORZ+2e9jvGPqYy5O9tqtnJ2Z6xqbFJsY+ybuIC4qriBeIf4RfGXEnQTJAntieTE2MQ9ieNzAudsmjOc5JpUlnRjruXcorkX5unOy553PFk1WZB8OIWYEpeyP+WDIEJQLxhP5aduTR0T8oSbhU9FvqKNolGxt7hKPJLmnVaV9jjdO31D+miGT0Z1xjMJT1IreZEZkrkj801WRNberM/ZcdktOZSclJyjUg1plrQr1zC3KLdPZisrkw3keeZtyhuTh8r35CP5c/PbFWyFTNGjtFKuUA4WTC+oK3hbGFt4uEi9SFrUM99m/ur5IwuCFny9kLBQuLCz2Lh4WfHgIr9FuxYji1MXdy4xXVK6ZHhp8NJ9y2jLspb9UOJYUlXyannc8o5Sg9KlpUMrglc0lamUycturvRauWMVYZVkVe9ql9VbVn8qF5VfrHCsqK74sEa45uJXTl/VfPV5bdra3kq3yu3rSOuk626s91m/r0q9akHV0IbwDa0b8Y3lG19tSt50oXpq9Y7NtM3KzQM1YTXtW8y2rNvyoTaj9nqdf13LVv2tq7e+2Sba1r/dd3vzDoMdFTve75TsvLUreFdrvUV99W7S7oLdjxpiG7q/5n7duEd3T8Wej3ulewf2Re/ranRvbNyvv7+yCW1SNo0eSDpw5ZuAb9qb7Zp3tXBaKg7CQeXBJ9+mfHvjUOihzsPcw83fmX+39QjrSHkr0jq/dawto22gPaG97+iMo50dXh1Hvrf/fu8x42N1xzWPV56gnSg98fnkgpPjp2Snnp1OPz3Umdx590z8mWtdUV29Z0PPnj8XdO5Mt1/3yfPe549d8Lxw9CL3Ytslt0utPa49R35w/eFIr1tv62X3y+1XPK509E3rO9Hv03/6asDVc9f41y5dn3m978bsG7duJt0cuCW69fh29u0XdwruTNxdeo94r/y+2v3qB/oP6n+0/rFlwG3g+GDAYM/DWQ/vDgmHnv6U/9OH4dJHzEfVI0YjjY+dHx8bDRq98mTOk+GnsqcTz8p+Vv9563Or59/94vtLz1j82PAL+YvPv655qfNy76uprzrHI8cfvM55PfGm/K3O233vuO+638e9H5ko/ED+UPPR+mPHp9BP9z7nfP78L/eE8/sl0p8zAAAAIGNIUk0AAHolAACAgwAA+f8AAIDpAAB1MAAA6mAAADqYAAAXb5JfxUYAAAEpSURBVHjanNO9K4ZhFMfxz4MFg1Im8jJ5/gIvExMZJCnFpCRlYBEGGS0m+QMoLwOyGCwyWISEvJQklM0glFLeluvR3d3d08Nvua5O53w751y/K9Uz+SyiNIbRihq8Yh+LWMaXmPIi93Ec4QN9qEYjVjGBPZTHAQXhHMMg6nARy7nEAuawE3Keox2kMYWOhOKMPjAUYNPxEUYwjzPZ9Y1R9KMkCmjButx0g3M0RQEVuJe7bkPNL+AFRX8AlOI9CjhF/R8Ah9iNApbCcvJzBEzgOgpYxBtmkcp
SWIuGJCd+ojO8xgaqEsw2gANsoy3JiQ9hDzO4Cp64Q3GIP6ALhcHa3diCVOwzQRmaUYknHOMkGAnasZKBFCTM+oi1LLvYRG+mkzz/UwYy8zMAmkpBg3fGpFUAAAAASUVORK5CYII=");
}
.doc-menu-vertical {
display: inline-block;
float: left;
width: 240px;
height: 100%;
background-color: #ecedee;
position: absolute;
left: 0;
top: 0;
overflow: hidden;
padding: 0;
border-right: 1px solid #dddfe3;
}
.doc-menu-vertical > ul {
display: none;
}
.doc-menu-vertical > ul.current{
display: block;
}
.doc-menu-vertical > ul.current > li.toctree-l1 {
display: none;
}
.doc-menu-vertical > ul.current > li.toctree-l1.current {
display: block;
}
.doc-menu-vertical > ul.current > li.toctree-l1.current > a {
display: none;
}
.doc-menu-vertical .toctree-l2 a {
width: 100%;
overflow: hidden;
text-overflow: ellipsis;
white-space: nowrap;
padding-right: 30px;
}
.doc-menu-vertical .toctree-l2 > a {
font-size: 14px;
color: #2f323a;
padding-left: 30px;
line-height: 50px;
display: block;
font-weight: bold;
border-bottom: 1px solid #dddfe3;
}
.doc-menu-vertical .toctree-l2.has-child > a:after {
font-family: "FontAwesome";
display: inline-block;
font-style: normal;
font-weight: normal;
text-decoration: inherit;
content: "";
float: right;
line-height: 50px;
color: #a7adbd;
position: absolute;
right: 15px;
}
.doc-menu-vertical .toctree-l2.has-child.current > a:after {
content: "";
}
.doc-menu-vertical .toctree-l2 > a + ul{
background-color: #e4e6e9;
height: 0;
overflow: hidden;
}
.doc-menu-vertical .toctree-l2.current > a + ul {
border-bottom: 1px solid #dddfe3;
height: auto;
}
.doc-menu-vertical .toctree-l2 li.active > a {
background-color: #597cf1;
color: #fff;
}
.doc-menu-vertical .toctree-l3 > a {
font-size: 12px;
color: #2f323a;
padding-left: 30px;
line-height: 40px;
display: block;
}
.doc-menu-vertical .toctree-l4 > a {
font-size: 12px;
color: #64697b;
padding-left: 50px;
line-height: 30px;
display: block;
}
.doc-menu-vertical .toctree-l5 > a {
font-size: 14px;
color: #ccc;
padding-left: 40px;
display: block;
}
.local-toc {
position: absolute;
height: 100%;
background-color: #f6f7f8;
top: 0;
left: 240px;
padding: 0;
z-index: 9;
}
.local-toc:after {
content: "";
position: absolute;
height: 100%;
width: 1px;
display: inline-block;
right: 0;
background-color: #dddfe3;
top: 0;
z-index: -1;
}
.local-toc:hover a {
width: auto;
}
.local-toc > ul > li a {
position: relative;
font-size: 12px;
overflow: hidden;
display: none;
}
.local-toc > ul > li > ul > li a {
display: block;
border-top: 1px solid transparent;
border-bottom: 1px solid transparent;
padding-right: 20px;
width: 50px;
}
.local-toc > ul > li > ul > li > ul > li > ul a {
display: none;
}
.local-toc > ul > li > ul li > a:after {
content: "";
display: inline-block;
width: 1px;
height: 100%;
background-color: transparent;
position: absolute;
right: 0;
top: 0;
}
.local-toc > ul > li > ul li a:hover{
background-color: #e6eaf7 !important;
}
.local-toc > ul > li > ul li a:hover:after {
background-color: #e6eaf7 !important;
}
.local-toc > ul > li > ul li.active > a {
color: #ff9711;
background-color: #fff;
border-top: 1px solid #dddfe3;
border-bottom: 1px solid #dddfe3;
}
.local-toc > ul > li > ul li.active > a:before {
background-color: #ff9711;
width: 10px;
height: 10px;
margin: 15px 20px;
border-radius: 5px;
}
.local-toc > ul > li > ul li.active > a:after {
background-color: #fff;
}
.local-toc > ul > li > ul > li {
position: relative;
line-height: 40px;
white-space: nowrap;
}
.local-toc > ul > li > ul > li > a {
color: #64697b;
}
.local-toc > ul > li > ul > li > a + ul {
display: none;
}
.local-toc > ul > li > ul > li > a:before {
display: inline-block;
content: "";
width: 6px;
height: 6px;
background-color: #ccc;
border-radius: 3px;
margin: 17px 22px;
float: left;
}
.local-toc > ul > li > ul > li > ul > li > a {
color: #a7adbd;
}
.local-toc > ul > li > ul > li > ul > li > a:before {
display: inline-block;
content: "";
width: 6px;
height: 6px;
background-color: #ccc;
border-radius: 3px;
margin: 17px 22px;
float: left;
}
.main-content-wrap {
position: absolute;
width: 100%;
top: 80px;
bottom: 0;
overflow: auto;
background-color: #f6f7f8;
}
.doc-content-wrap {
margin-left: 290px;
height: 100%;
position: relative;
padding-top: 60px;
background-color: #fff;
}
.doc-content-wrap > div[role='navigation'] {
position: absolute;
top: 0;
width: 100%;
left: 0;
padding: 0 30px;
height: 60px;
}
.wy-breadcrumbs {
line-height: 50px;
height: 60px;
background-image: url("data:image/png;base64,iVBORw0KGgoAAAANSUhEUgAAAAEAAAAUCAYAAABMDlehAAAAGXRFWHRTb2Z0d2FyZQBBZG9iZSBJbWFnZVJlYWR5ccllPAAAA4ZpVFh0WE1MOmNvbS5hZG9iZS54bXAAAAAAADw/eHBhY2tldCBiZWdpbj0i77u/IiBpZD0iVzVNME1wQ2VoaUh6cmVTek5UY3prYzlkIj8+IDx4OnhtcG1ldGEgeG1sbnM6eD0iYWRvYmU6bnM6bWV0YS8iIHg6eG1wdGs9IkFkb2JlIFhNUCBDb3JlIDUuNS1jMDIxIDc5LjE1NTc3MiwgMjAxNC8wMS8xMy0xOTo0NDowMCAgICAgICAgIj4gPHJkZjpSREYgeG1sbnM6cmRmPSJodHRwOi8vd3d3LnczLm9yZy8xOTk5LzAyLzIyLXJkZi1zeW50YXgtbnMjIj4gPHJkZjpEZXNjcmlwdGlvbiByZGY6YWJvdXQ9IiIgeG1sbnM6eG1wTU09Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC9tbS8iIHhtbG5zOnN0UmVmPSJodHRwOi8vbnMuYWRvYmUuY29tL3hhcC8xLjAvc1R5cGUvUmVzb3VyY2VSZWYjIiB4bWxuczp4bXA9Imh0dHA6Ly9ucy5hZG9iZS5jb20veGFwLzEuMC8iIHhtcE1NOk9yaWdpbmFsRG9jdW1lbnRJRD0ieG1wLmRpZDpjMjhmMGQ3ZC0wODU3LTQ0ZTctOGRhZi00NGU3OTc1ZmM2MzkiIHhtcE1NOkRvY3VtZW50SUQ9InhtcC5kaWQ6NzRBN0NEODRBRTM2MTFFNjlGMDI4RUM3M0VDQzY4NTkiIHhtcE1NOkluc3RhbmNlSUQ9InhtcC5paWQ6NzRBN0NEODNBRTM2MTFFNjlGMDI4RUM3M0VDQzY4NTkiIHhtcDpDcmVhdG9yVG9vbD0iQWRvYmUgUGhvdG9zaG9wIENDIDIwMTQgKE1hY2ludG9zaCkiPiA8eG1wTU06RGVyaXZlZEZyb20gc3RSZWY6aW5zdGFuY2VJRD0ieG1wLmlpZDozNWQwMzI1ZC01ZDAyLTQ1YTYtODUxOS1lNWUzNjU5NGFhMzAiIHN0UmVmOmRvY3VtZW50SUQ9ImFkb2JlOmRvY2lkOnBob3Rvc2hvcDozZGVmZmY0OS1mNjA4LTExNzktYTRlZC1kZjJiNGY3N2YwNzMiLz4gPC9yZGY6RGVzY3JpcHRpb24+IDwvcmRmOlJERj4gPC94OnhtcG1ldGE+IDw/eHBhY2tldCBlbmQ9InIiPz7FGmP1AAAAKUlEQVR42mK4/+DpfwY9Q0tBJgYGhv8g4h8uFoKLEGOAc9FYSARAgAEAUgMQYBNmQ7sAAAAASUVORK5CYII=");
background-repeat: repeat no-repeat;
background-position: center 50px;
}
.wy-breadcrumbs > li {
color: #ccc;
}
.wy-breadcrumbs > li a {
color: #ff9711;
padding: 0;
}
.wy-breadcrumbs > li:first-child a {
color: #597cf1;
}
.wy-nav-content{
max-width: none;
overflow: auto;
position: relative;
padding: 30px;
background-color: #fff;
}
.wy-nav-content h1 {
font-size: 24px;
color: #2f323a;
margin-bottom: 30px;
}
.wy-nav-content h2 {
font-size: 20px;
color: #2f323a;
margin-bottom: 30px;
}
.wy-nav-content h3 {
font-size: 18px;
color: #2f323a;
margin-bottom: 30px;
}
.wy-nav-content h4 {
font-size: 16px;
color: #2f323a;
margin-bottom: 30px;
}
.wy-nav-content p + h1,
.wy-nav-content p + h2,
.wy-nav-content p + h3,
.wy-nav-content p + h4 {
margin-top: 20px;
}
.wy-nav-content p{
color: #2f323a;
margin-bottom: 20px;
font-size: 14px;
}
#search-results h2 {
font-size: 24px;
margin: 20px 0 10px 0;
}
#search-results p {
color: #a7adbd;
}
#search-results ul.search > li {
border-bottom: none;
}
#search-results ul.search > li > a {
color: #597cf1;
}
.rst-content .highlighted{
background-color: transparent;
color: #ff9711;
padding: 0;
}
// Doc-page layout bootstrap: wires up the in-page TOC highlight, the
// vertical chapter menu, and removes either panel when it has no content
// (shifting the main content area left to reclaim the space).
// NOTE(review): relies on jQuery, Bootstrap scrollspy, and the
// perfectScrollbar plugin being loaded beforehand — confirm load order.
$(document).ready(function(){
// Clicking a local-TOC link moves the 'active' marker to its parent <li>.
$('.local-toc').on('click' ,'a.reference.internal', function (){
$('.local-toc li.active').removeClass('active');
$(this).parent('li').addClass('active');
});
if ($('.local-toc a:visible').length) {
// Local TOC has entries: turn it into a Bootstrap nav and let scrollspy
// track the scrolling #doc-content container against it.
$('.local-toc > ul').addClass('nav nav-stacked');
$('#doc-content').scrollspy({
target: '.local-toc'
});
$('.local-toc').perfectScrollbar();
} else {
// Empty TOC: drop the panel and pull the content 50px left.
// NOTE(review): '-=50px' relies on jQuery's relative-value syntax in .css().
$('.doc-content-wrap').css('margin-left', '-=50px');
$('.local-toc').remove();
}
if (!$('.doc-menu-vertical > ul > li.current > ul').length) {
// No chapter sub-menu for the current page: remove the 240px side menu
// and let the local TOC (if any) dock at the left edge.
$('.doc-content-wrap').css('margin-left', '-=240px');
$('.doc-menu-vertical').remove();
$('.local-toc').css('left', '0');
}
// Mark menu entries that own a nested list so CSS can draw expand arrows.
$('.doc-menu-vertical .toctree-l2').each(function (i, e){
$(e).toggleClass('has-child', !!$(e).find('ul').length);
});
// Highlight the deepest 'current' item and give the menu a custom scrollbar.
$('.doc-menu-vertical').find('li.current').last().addClass('active');
$('.doc-menu-vertical').perfectScrollbar();
});
\ No newline at end of file
{# Sphinx breadcrumbs partial: renders the parent-chain trail above the page body. #}
{# Support for Sphinx 1.3+ page_source_suffix, but don't break old builds. #}
{% if page_source_suffix %}
{% set suffix = page_source_suffix %}
{% else %}
{% set suffix = source_suffix %}
{% endif %}
{# Per-page metadata may opt in to "edit on GitHub/Bitbucket" links. #}
{% if meta is defined and 'github_url' in meta %}
{% set display_github = True %}
{% endif %}
{% if meta is defined and 'bitbucket_url' in meta %}
{% set display_bitbucket = True %}
{% endif %}
<div role="navigation" aria-label="breadcrumbs navigation">
<ul class="wy-breadcrumbs">
{# One linked entry per ancestor document, outermost first; current page is plain text. #}
{% for doc in parents %}
<li><a href="{{ doc.link|e }}">{{ doc.title }}</a> > </li>
{% endfor %}
<li>{{ title }}</li>
</ul>
</div>
{# TEMPLATE VAR SETTINGS #}
{%- set url_root = pathto('', 1) %}
{%- if url_root == '#' %}{% set url_root = '' %}{% endif %}
{%- if not embedded and docstitle %}
{%- set titlesuffix = " &mdash; "|safe + docstitle|e %}
{%- else %}
{%- set titlesuffix = "" %}
{%- endif %}
<!DOCTYPE html>
<!--[if IE 8]><html class="no-js lt-ie9" lang="en" > <![endif]-->
<!--[if gt IE 8]><!--> <html class="no-js" lang="en" > <!--<![endif]-->
<head>
<meta charset="utf-8">
{{ metatags }}
<meta name="viewport" content="width=device-width, initial-scale=1.0">
{% block htmltitle %}
<title>{{ title|striptags|e }}{{ titlesuffix }}</title>
{% endblock %}
{# FAVICON #}
{% if favicon %}
<link rel="shortcut icon" href="{{ pathto('_static/' + favicon, 1) }}"/>
{% endif %}
{# CSS #}
{# OPENSEARCH #}
{% if not embedded %}
{% if use_opensearch %}
<link rel="search" type="application/opensearchdescription+xml" title="{% trans docstitle=docstitle|e %}Search within {{ docstitle }}{% endtrans %}" href="{{ pathto('_static/opensearch.xml', 1) }}"/>
{% endif %}
{% endif %}
{# RTD hosts this file, so just load on non RTD builds #}
{% if not READTHEDOCS %}
<link rel="stylesheet" href="{{ pathto('_static/' + style, 1) }}" type="text/css" />
{% endif %}
{% for cssfile in css_files %}
<link rel="stylesheet" href="{{ pathto(cssfile, 1) }}" type="text/css" />
{% endfor %}
{% for cssfile in extra_css_files %}
<link rel="stylesheet" href="{{ pathto(cssfile, 1) }}" type="text/css" />
{% endfor %}
{%- block linktags %}
{%- if hasdoc('about') %}
<link rel="author" title="{{ _('About these documents') }}"
href="{{ pathto('about') }}"/>
{%- endif %}
{%- if hasdoc('genindex') %}
<link rel="index" title="{{ _('Index') }}"
href="{{ pathto('genindex') }}"/>
{%- endif %}
{%- if hasdoc('search') %}
<link rel="search" title="{{ _('Search') }}" href="{{ pathto('search') }}"/>
{%- endif %}
{%- if hasdoc('copyright') %}
<link rel="copyright" title="{{ _('Copyright') }}" href="{{ pathto('copyright') }}"/>
{%- endif %}
<link rel="top" title="{{ docstitle|e }}" href="{{ pathto('index') }}"/>
{%- if parents %}
<link rel="up" title="{{ parents[-1].title|striptags|e }}" href="{{ parents[-1].link|e }}"/>
{%- endif %}
{%- if next %}
<link rel="next" title="{{ next.title|striptags|e }}" href="{{ next.link|e }}"/>
{%- endif %}
{%- if prev %}
<link rel="prev" title="{{ prev.title|striptags|e }}" href="{{ prev.link|e }}"/>
{%- endif %}
{%- endblock %}
{%- block extrahead %}
<link rel="stylesheet" href="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/css/perfect-scrollbar.min.css" type="text/css" />
<link rel="stylesheet" href="{{pathto('_static/css/override.css', 1)}}" type="text/css" />
<script>
var _hmt = _hmt || [];
(function() {
var hm = document.createElement("script");
hm.src = "//hm.baidu.com/hm.js?b9a314ab40d04d805655aab1deee08ba";
var s = document.getElementsByTagName("script")[0];
s.parentNode.insertBefore(hm, s);
})();
</script>
{% endblock %}
{# Keep modernizr in head - http://modernizr.com/docs/#installing #}
<script src="{{ pathto('_static/js/modernizr.min.js', 1) }}"></script>
</head>
<body class="wy-body-for-nav" role="document">
{% block extrabody %}
<header class="site-header">
<div class="site-logo">
<a href="/"><img src="{{pathto('_static/images/PP_w.png', 1)}}"></a>
</div>
<div class="site-nav-links">
<div class="site-menu">
<a class="fork-on-github" href="https://github.com/PaddlePaddle/Paddle" target="_blank"><i class="fa fa-github"></i>Fork me on Github</a>
<div class="language-switcher dropdown">
<a type="button" data-toggle="dropdown">
<span>English</span>
<i class="fa fa-angle-up"></i>
<i class="fa fa-angle-down"></i>
</a>
<ul class="dropdown-menu">
<li><a href="/doc_cn">中文</a></li>
<li><a href="/doc">English</a></li>
</ul>
</div>
<ul class="site-page-links">
<li><a>Home</a></li>
<li><a>Get Started</a></li>
<li class="active"><a>Documentation</a></li>
<li><a>About Us</a></li>
</ul>
</div>
<div class="doc-module">
{%set modules = toctree(maxdepth=0, collapse=False, titles_only=True)%}
{{modules}}
{% include "searchbox.html" %}
</div>
</div>
</header>
{% endblock %}
<div class="main-content-wrap">
{# SIDE NAV, TOGGLES ON MOBILE #}
<nav class="doc-menu-vertical" role="navigation">
{% block menu %}
{% set toctree = toctree(maxdepth=-1, collapse=False,titles_only=True, includehidden=True) %}
{{ toctree }}
{% endblock %}
</nav>
{% if toc %}
<nav class="local-toc">{{ toc }}</nav>
{% endif %}
<section class="doc-content-wrap">
{% include "breadcrumbs.html" %}
{# PAGE CONTENT #}
<div class="wy-nav-content" id="doc-content">
<div class="rst-content">
<div role="main" class="document" itemscope="itemscope" itemtype="http://schema.org/Article">
<div itemprop="articleBody">
{% block body %}{% endblock %}
</div>
</div>
{% include "footer.html" %}
</div>
</div>
</section>
</div>
{% include "versions.html" %}
{% if not embedded %}
<script type="text/javascript">
var DOCUMENTATION_OPTIONS = {
URL_ROOT:'{{ url_root }}',
VERSION:'{{ release|e }}',
COLLAPSE_INDEX:false,
FILE_SUFFIX:'{{ '' if no_search_suffix else file_suffix }}',
HAS_SOURCE: {{ has_source|lower }}
};
</script>
{%- for scriptfile in script_files %}
<script type="text/javascript" src="{{ pathto(scriptfile, 1) }}"></script>
{%- endfor %}
{% endif %}
{# RTD hosts this file, so just load on non RTD builds #}
{% if not READTHEDOCS %}
<script type="text/javascript" src="{{ pathto('_static/js/theme.js', 1) }}"></script>
{% endif %}
<script src="https://maxcdn.bootstrapcdn.com/bootstrap/3.3.7/js/bootstrap.min.js" integrity="sha384-Tc5IQib027qvyjSMfHjOMaLkfuWVxZxUPnCJA7l2mCWNIpG9mGCD8wGNIcPD7Txa" crossorigin="anonymous"></script>
<script src="https://cdn.jsdelivr.net/perfect-scrollbar/0.6.14/js/perfect-scrollbar.jquery.min.js"></script>
<script src="{{ pathto('_static/js/paddle_doc_init.js', 1) }}"></script>
{%- block footer %} {% endblock %}
</body>
</html>
{#
basic/search.html
~~~~~~~~~~~~~~~~~
Template for the search page.
:copyright: Copyright 2007-2013 by the Sphinx team, see AUTHORS.
:license: BSD, see LICENSE for details.
#}
{%- extends "layout.html" %}
{% set title = _('Search') %}
{% set script_files = script_files + ['_static/searchtools.js'] %}
{% block footer %}
<script type="text/javascript">
jQuery(function() { Search.loadIndex("{{ pathto('searchindex.js', 1) }}"); });
jQuery('.doc-content-wrap > div[role="navigation"]').remove();
jQuery('.doc-content-wrap').css('padding-top', 0);
</script>
{# this is used when loading the search index using $.ajax fails,
such as on Chrome for documents on localhost #}
<script type="text/javascript" id="searchindexloader"></script>
{{ super() }}
{% endblock %}
{% block body %}
<noscript>
<div id="fallback" class="admonition warning">
<p class="last">
{% trans %}Please activate JavaScript to enable the search
functionality.{% endtrans %}
</p>
</div>
</noscript>
{% if search_performed %}
<h2>{{ _('Search Results') }}</h2>
{% if not search_results %}
<p>{{ _('Your search did not match any documents. Please make sure that all words are spelled correctly and that you\'ve selected enough categories.') }}</p>
{% endif %}
{% endif %}
<div id="search-results">
{% if search_results %}
<ul>
{% for href, caption, context in search_results %}
<li>
<a href="{{ pathto(href) }}">{{ caption }}</a>
<p class="context">{{ context|e }}</p>
</li>
{% endfor %}
</ul>
{% endif %}
</div>
{% endblock %}
...@@ -20,11 +20,7 @@ popd > /dev/null ...@@ -20,11 +20,7 @@ popd > /dev/null
cd $SCRIPTPATH cd $SCRIPTPATH
if [ ! -f ../../dist/*.whl ] ; then # Swig not compiled. rm -rf .test_env
exit 0
fi
rm .test_env -rf
virtualenv .test_env virtualenv .test_env
source .test_env/bin/activate source .test_env/bin/activate
......
...@@ -15,16 +15,24 @@ else() ...@@ -15,16 +15,24 @@ else()
endif() endif()
set(CUDA_CXX_WITH_GPU_SOURCES set(CUDA_CXX_WITH_GPU_SOURCES
src/hl_cudart_wrap.cc
src/hl_cuda_cublas.cc src/hl_cuda_cublas.cc
src/hl_cuda_cudnn.cc src/hl_cuda_cudnn.cc
src/hl_cuda_device.cc) src/hl_cuda_device.cc)
set_source_files_properties(${CUDA_CXX_WITH_GPU_SOURCES} if(WITH_GPU)
PROPERTIES COMPILE_FLAGS "-D__NVCC__") set(CUDA_CXX_SOURCES
src/hl_dso_loader.cc
src/hl_warpctc_wrap.cc
${CUDA_CXX_WITH_GPU_SOURCES})
set(CUDA_DSO_SOURCES set_source_files_properties(${CUDA_CXX_SOURCES}
src/hl_dso_loader.cc PROPERTIES COMPILE_FLAGS "-D__NVCC__")
src/hl_cudart_wrap.cc) else()
set(CUDA_CXX_SOURCES
src/hl_dso_loader.cc
src/hl_warpctc_wrap.cc)
endif()
set(CUDA_CU_SOURCES set(CUDA_CU_SOURCES
src/hl_perturbation_util.cu src/hl_perturbation_util.cu
...@@ -41,6 +49,7 @@ set(CUDA_CU_SOURCES ...@@ -41,6 +49,7 @@ set(CUDA_CU_SOURCES
set(CUDA_HEADERS set(CUDA_HEADERS
include/hl_time.h include/hl_time.h
include/hl_dso_loader.h include/hl_dso_loader.h
include/hl_warpctc_wrap.h
include/hl_sequence.h include/hl_sequence.h
include/hl_cuda_cublas.h include/hl_cuda_cublas.h
include/hl_batch_transpose.h include/hl_batch_transpose.h
...@@ -72,14 +81,14 @@ if(WITH_GPU) ...@@ -72,14 +81,14 @@ if(WITH_GPU)
cuda_add_library(paddle_cuda cuda_add_library(paddle_cuda
${CUDA_SOURCES} ${CUDA_SOURCES}
${CUDA_CU_SOURCES} ${CUDA_CU_SOURCES}
${CUDA_DSO_SOURCES} ${CUDA_CXX_SOURCES})
${CUDA_CXX_WITH_GPU_SOURCES})
else() else()
add_library(paddle_cuda ${CUDA_SOURCES}) add_library(paddle_cuda
${CUDA_SOURCES}
${CUDA_CXX_SOURCES})
endif() endif()
add_style_check_target(paddle_cuda add_style_check_target(paddle_cuda
${CUDA_SOURCES} ${CUDA_SOURCES}
${CUDA_HEADERS} ${CUDA_HEADERS}
${CUDA_DSO_SOURCES} ${CUDA_CXX_SOURCES})
${CUDA_CXX_WITH_GPU_SOURCES})
...@@ -18,10 +18,6 @@ limitations under the License. */ ...@@ -18,10 +18,6 @@ limitations under the License. */
#include <dlfcn.h> #include <dlfcn.h>
#include <string> #include <string>
#include <memory> #include <memory>
#include <cuda_runtime.h>
#include <cublas_v2.h>
#include <curand.h>
#include <cudnn.h>
#include "hl_base.h" #include "hl_base.h"
/** /**
...@@ -56,4 +52,12 @@ void GetCudartDsoHandle(void** dso_handle); ...@@ -56,4 +52,12 @@ void GetCudartDsoHandle(void** dso_handle);
*/ */
void GetCurandDsoHandle(void** dso_handle); void GetCurandDsoHandle(void** dso_handle);
/**
* @brief load the DSO of warp-ctc
*
* @param **dso_handle dso handler
*
*/
void GetWarpCTCDsoHandle(void** dso_handle);
#endif // HL_DSO_LOADER_H_ #endif // HL_DSO_LOADER_H_
...@@ -25,6 +25,7 @@ limitations under the License. */ ...@@ -25,6 +25,7 @@ limitations under the License. */
#include "hl_sparse.h" #include "hl_sparse.h"
#include "hl_lstm.h" #include "hl_lstm.h"
#include "hl_sequence.h" #include "hl_sequence.h"
#include "hl_warpctc_wrap.h"
#ifdef HPPL_STUB_FUNC #ifdef HPPL_STUB_FUNC
#include "stub/hl_cuda_stub.h" #include "stub/hl_cuda_stub.h"
......
...@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,14 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifndef HL_MATRIX_TYPE_CUH_ #ifndef HL_MATRIX_TYPE_CUH_
#define HL_MATRIX_TYPE_CUH_ #define HL_MATRIX_TYPE_CUH_
#include "hl_base.h" #include "hl_base.h"
#ifdef __CUDA_ARCH__ #ifdef __CUDA_ARCH__
// typedef void* vecType;
#include <vector_types.h> #include <vector_types.h>
#ifndef PADDLE_TYPE_DOUBLE #ifndef PADDLE_TYPE_DOUBLE
typedef float4 vecType; typedef float4 vecType;
...@@ -37,4 +35,10 @@ typedef __m128d vecType; ...@@ -37,4 +35,10 @@ typedef __m128d vecType;
#endif #endif
#endif #endif
#endif /* HL_MATRIX_TYPE_CUH_ */ #ifdef __CUDA_ARCH__
#define INLINE __device__ inline
#else
#define INLINE inline
#endif
#endif // HL_MATRIX_TYPE_CUH_
...@@ -172,6 +172,39 @@ extern void hl_sequence2batch_add(real* batch, ...@@ -172,6 +172,39 @@ extern void hl_sequence2batch_add(real* batch,
int batchCount, int batchCount,
bool seq2batch); bool seq2batch);
/**
* @brief Memory copy from sequence to batch,
* while padding all sequences to the same length.
*
* if seq2batch == true
*
* copy from sequence to batch:
* batch[i] = sequence[sequenceStartPositions[i]]
*
* if seq2batch == false
*
* copy from batch to sequence:
* sequence[sequenceStartPositions[i]] = batch[i]
*
* @param[in,out] batch batch matrix.
* @param[in,out] sequence sequence matrix.
* @param[in] sequenceStartPositions index vector.
* @param[in] sequenceWidth width of sequence.
* @param[in] maxSequenceLength maximum length of sequences.
* @param[in] numSequences number of sequences.
* @param[in] normByTimes whether dividing sequence's length.
* @param[in] seq2batch copy direction.
*
*/
extern void hl_sequence2batch_copy_padding(real* batch,
real* sequence,
const int* sequenceStartPositions,
const size_t sequenceWidth,
const size_t maxSequenceLength,
const size_t numSequences,
bool normByTimes,
bool seq2batch);
/** /**
* @brief dst = Op(src), src is sequence. * @brief dst = Op(src), src is sequence.
* *
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_TENSOR_OPS_H_
#define HL_TENSOR_OPS_H_
#include <cmath>
#include "hl_matrix_type.cuh"
namespace hppl {
namespace unary {
template <class T>
class add_scale {
private:
const T p;
public:
INLINE add_scale(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a + p; }
};
template <class T>
class sub_scale {
private:
const T p;
public:
INLINE sub_scale(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a - p; }
};
template <class T>
class mul_scale {
private:
const T p;
public:
INLINE mul_scale(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a * p; }
};
template <class T>
class div_scale {
private:
const T p;
public:
INLINE div_scale(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a / p; }
};
template <class T>
class neg {
public:
INLINE T operator()(const T a) const { return -a; }
};
template <class T>
class exp_op {
public:
INLINE T operator()(const T a) const { return std::exp(a); }
};
template <class T>
class log_op {
public:
INLINE T operator()(const T a) const { return std::log(a); }
};
template <class T>
class sqrt_op {
public:
INLINE T operator()(const T a) const { return std::sqrt(a); }
};
template <class T>
class square {
public:
INLINE T operator()(const T a) const { return a * a; }
};
template <class T>
class reciprocal {
public:
INLINE T operator()(const T a) const { return T(1) / a; }
};
template <class T>
class abs {
public:
INLINE T operator()(const T a) const { return a > 0 ? a : -a; }
};
template <class T>
class sign {
public:
INLINE T operator()(const T a) const { return (a > 0) - (a < 0); }
};
// Unary functor that clamps its argument from above: min(s)(a) == a when
// a <= s, otherwise s. The bound is fixed at construction time.
template <class T>
class min {
private:
  const T bound;

public:
  INLINE min(const T value) : bound(value) {}
  INLINE T operator()(const T a) const { return (bound < a) ? bound : a; }
};
template <class T>
class max {
private:
const T p;
public:
INLINE max(const T s) : p(s) {}
INLINE T operator()(const T a) const { return a < p ? p : a; }
};
template <class T>
class pow_op {
private:
const T p;
public:
INLINE pow_op(const T s) : p(s) {}
INLINE T operator()(const T a) const { return std::pow(a, p); }
};
template <class T>
class constant {
private:
const T p;
public:
INLINE constant(const T s) : p(s) {}
INLINE T operator()(int i) const { return p; }
INLINE T operator()(int i, int j) const { return p; }
};
template <class T>
class cmp_eq {
private:
const T p;
public:
INLINE cmp_eq(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a == p; }
};
template <class T>
class cmp_ne {
private:
const T p;
public:
INLINE cmp_ne(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a != p; }
};
template <class T>
class cmp_le {
private:
const T p;
public:
INLINE cmp_le(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a <= p; }
};
template <class T>
class cmp_lt {
private:
const T p;
public:
INLINE cmp_lt(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a < p; }
};
template <class T>
class cmp_ge {
private:
const T p;
public:
INLINE cmp_ge(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a >= p; }
};
template <class T>
class cmp_gt {
private:
const T p;
public:
INLINE cmp_gt(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a > p; }
};
template <class T>
class and_op {
private:
const T p;
public:
INLINE and_op(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a && p; }
};
template <class T>
class or_op {
private:
const T p;
public:
INLINE or_op(const T s) : p(s) {}
INLINE bool operator()(const T a) const { return a || p; }
};
} // namespace unary
namespace binary {
template <class T>
class add {
public:
INLINE T operator()(const T a, const T b) const { return a + b; }
};
template <class T>
class add_scale {
private:
const T p1;
const T p2;
public:
INLINE add_scale(const T s1, const T s2) : p1(s1), p2(s2) {}
INLINE T operator()(const T a, const T b) const { return p1 * a + p2 * b; }
};
template <class T>
class sub {
public:
INLINE T operator()(const T a, const T b) const { return a - b; }
};
template <class T>
class mul {
public:
INLINE T operator()(const T a, const T b) const { return a * b; }
};
template <class T>
class div {
public:
INLINE T operator()(const T a, const T b) const { return a / b; }
};
template <class T>
class cmp_eq {
public:
INLINE bool operator()(const T a, const T b) const { return a == b; }
};
template <class T>
class cmp_ne {
public:
INLINE bool operator()(const T a, const T b) const { return a != b; }
};
template <class T>
class cmp_le {
public:
INLINE bool operator()(const T a, const T b) const { return a <= b; }
};
template <class T>
class cmp_lt {
public:
INLINE bool operator()(const T a, const T b) const { return a < b; }
};
template <class T>
class cmp_ge {
public:
INLINE bool operator()(const T a, const T b) const { return a >= b; }
};
template <class T>
class cmp_gt {
public:
INLINE bool operator()(const T a, const T b) const { return a > b; }
};
template <class T>
class and_op {
public:
INLINE bool operator()(const T a, const T b) const { return a && b; }
};
template <class T>
class or_op {
public:
INLINE bool operator()(const T a, const T b) const { return a || b; }
};
template <class T>
class min {
public:
INLINE T operator()(const T a, const T b) const { return a > b ? b : a; }
};
// Binary functor returning the larger of its two operands.
template <class T>
class max {
public:
  INLINE T operator()(const T lhs, const T rhs) const {
    return (rhs > lhs) ? rhs : lhs;
  }
};
} // namespace binary
} // namespace hppl
#endif // HL_TENSOR_OPS_H_
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifndef HL_WARPCTC_WRAP_H_
#define HL_WARPCTC_WRAP_H_
#include "hl_base.h"
#include "warp-ctc/include/ctc.h"
typedef ctcStatus_t hl_warpctc_status_t;
typedef ctcOptions hl_warpctc_options_t;
/**
* @brief Init ctc options.
*
* @param[in] blank blank label used in ctc loss function.
* @param[in] useGpu whether use gpu.
* @param[out] options handle to store cpu or gpu informations.
*
*/
extern void hl_warpctc_init(const size_t blank,
bool useGpu,
hl_warpctc_options_t* options);
/**
* @brief Compute the connectionist temporal classification loss,
* and optionally compute the gradient with respect to the inputs.
*
* if batchGrad == nullptr
*
* only compute the ctc loss.
*
* if batchGrad != nullptr
*
* compute both ctc loss and gradient.
*
* @param[in] batchInput batch matrix of input probabilities,
* in maxSequenceLength x numSequence x numClasses
* (row-major) format.
* @param[out] batchGrad batch matrix of gradient.
* @param[in] cpuLabels labels always in CPU memory.
* @param[in] cpuLabelLengths length of all labels in CPU memory.
* @param[in] cpuInputLengths length of all sequences in CPU memory.
* @param[in] numClasses number of possible output symbols.
* @param[in] numSequences number of sequence.
* @param[out] cpuCosts cost of each sequence in CPU memory.
* @param[out] workspace workspace to store some temporary results.
* @param[in] options handle to store cpu or gpu informations.
*
*/
extern void hl_warpctc_compute_loss(const real* batchInput,
real* batchGrad,
const int* cpuLabels,
const int* cpuLabelLengths,
const int* cpuInputLengths,
const size_t numClasses,
const size_t numSequences,
real* cpuCosts,
void* workspace,
hl_warpctc_options_t* options);
/**
* @brief Compute the required workspace size.
* There is no memory allocated operations within warp-ctc.
*
* @param[in] cpuLabelLengths length of all labels in CPU memory.
* @param[in] cpuInputLengths length of all sequences in CPU memory.
* @param[in] numClasses number of possible output symbols.
* @param[in] numSequences number of sequence.
* @param[in] options handle to store cpu or gpu informations.
* @param[out] bytes pointer to a scalar where the memory
* requirement in bytes will be placed.
*
*/
extern void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
const int* cpuInputLengths,
const size_t numClasses,
const size_t numSequences,
hl_warpctc_options_t* options,
size_t* bytes);
#endif // HL_WARPCTC_WRAP_H_
...@@ -70,6 +70,15 @@ inline void hl_sequence2batch_add(real* batch, ...@@ -70,6 +70,15 @@ inline void hl_sequence2batch_add(real* batch,
int batchCount, int batchCount,
bool seq2batch) {} bool seq2batch) {}
inline void hl_sequence2batch_copy_padding(real* batch,
real* sequence,
const int* sequenceStartPositions,
const size_t sequenceWidth,
const size_t maxSequenceLength,
const size_t numSequences,
bool normByTimes,
bool seq2batch) {}
inline void hl_sequence_avg_forward(real* dst, inline void hl_sequence_avg_forward(real* dst,
real* src, real* src,
const int* starts, const int* starts,
......
...@@ -447,6 +447,112 @@ void hl_sequence2batch_add(real *batch, ...@@ -447,6 +447,112 @@ void hl_sequence2batch_add(real *batch,
CHECK_SYNC("hl_sequence2batch_add failed"); CHECK_SYNC("hl_sequence2batch_add failed");
} }
/*
 * CUDA kernel that copies data between a padded "batch" layout and a
 * variable-length "sequence" layout.
 *
 * Thread mapping: blockIdx.y selects the sequence; blockIdx.x * blockDim.y
 * + threadIdx.y selects the time step within that sequence; threadIdx.x
 * strides across the sequenceWidth feature dimension.
 *
 * Template parameters:
 *   normByTimes - if true, scale each copied value by 1 / sequenceLength.
 *   seq2batch   - if true, copy sequence -> batch; otherwise batch -> sequence.
 *
 * Time steps beyond a sequence's own length (but below maxSequenceLength)
 * are zero-filled in the batch in the seq2batch direction, so that padding
 * never contains stale data.
 */
template<bool normByTimes, bool seq2batch>
__global__
void KeSequence2BatchPadding(real* batch,
                             real* sequence,
                             const int* sequenceStartPositions,
                             const size_t sequenceWidth,
                             const size_t maxSequenceLength,
                             const size_t numSequences) {
  int batchIdx = blockIdx.y;
  // Start offset and length of this sequence, from the CSR-style index
  // vector (entry i+1 minus entry i gives the length of sequence i).
  int sequenceStart = sequenceStartPositions[batchIdx];
  int sequenceLength = sequenceStartPositions[batchIdx + 1] - sequenceStart;

  int sequenceIdx = blockIdx.x * blockDim.y + threadIdx.y;
  // batch is laid out as [maxSequenceLength x numSequences x sequenceWidth].
  int batchBaseIdx = (sequenceIdx * numSequences + batchIdx) * sequenceWidth;
  int sequenceBaseIdx = (sequenceStart + sequenceIdx) * sequenceWidth;

  // Optional per-timestep normalization by the sequence length.
  real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;

  if (sequenceIdx < sequenceLength) {
    if (seq2batch) {
      /* sequence -> batch */
      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
        batch[batchBaseIdx + i] = scale * sequence[sequenceBaseIdx + i];
      }
    } else {
      /* batch -> sequence */
      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
        sequence[sequenceBaseIdx + i] = scale * batch[batchBaseIdx + i];
      }
    }
  } else if (sequenceIdx < maxSequenceLength) {
    if (seq2batch) {
      /* sequence -> batch: zero out the padding region */
      for (int i = threadIdx.x; i < sequenceWidth; i += blockDim.x) {
        batch[batchBaseIdx + i] = 0;
      }
    }
  }
}
/*
 * Host-side entry point: copies between sequence and padded-batch layouts,
 * dispatching to the KeSequence2BatchPadding kernel with the
 * (normByTimes, seq2batch) combination resolved at compile time.
 * All pointer arguments must be device memory; see hl_sequence.h for the
 * full parameter contract.
 */
void hl_sequence2batch_copy_padding(real* batch,
                                    real* sequence,
                                    const int* sequenceStartPositions,
                                    const size_t sequenceWidth,
                                    const size_t maxSequenceLength,
                                    const size_t numSequences,
                                    bool normByTimes,
                                    bool seq2batch) {
  CHECK_NOTNULL(batch);
  CHECK_NOTNULL(sequence);
  CHECK_NOTNULL(sequenceStartPositions);

  // Fast path: a single unnormalized sequence is a straight contiguous copy
  // (no padding rearrangement needed), so use device-to-device memcpy.
  if (!normByTimes && numSequences == 1) {
    size_t elementCount = maxSequenceLength * sequenceWidth;
    if (seq2batch) {
      /* sequence -> batch */
      hl_memcpy_device2device(batch, sequence, sizeof(real) * elementCount);
    } else {
      /* batch -> sequence */
      hl_memcpy_device2device(sequence, batch, sizeof(real) * elementCount);
    }
    return;
  }

  const int CUDA_BLOCK_SIZE = 512;

  /* At least use 32 threads to copy sequenceWidth elements,
     and at least 8 elements for each thread. */
  // blockDimX is ceil(sequenceWidth / 8) rounded up to a multiple of 32
  // (warp size), capped at CUDA_BLOCK_SIZE.
  int blockDimX = ((((sequenceWidth + 7) >> 3) + 31) >> 5) << 5;
  blockDimX = (blockDimX < CUDA_BLOCK_SIZE) ? blockDimX : CUDA_BLOCK_SIZE;
  int blockDimY = CUDA_BLOCK_SIZE / blockDimX;
  dim3 threads(blockDimX, blockDimY);

  // Grid: x covers all time steps up to maxSequenceLength, y is one
  // block-row per sequence.
  int gridDimX = (maxSequenceLength * blockDimX + CUDA_BLOCK_SIZE - 1) /
      CUDA_BLOCK_SIZE;
  int gridDimY = numSequences;
  dim3 grid(gridDimX, gridDimY);

  // Select the template instantiation matching the runtime flags.
  if (seq2batch) {
    /* sequence -> batch */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    } else {
      KeSequence2BatchPadding<0, 1><<< grid, threads, 0, STREAM_DEFAULT >>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    }
  } else {
    /* batch -> sequence */
    if (normByTimes) {
      KeSequence2BatchPadding<1, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    } else {
      KeSequence2BatchPadding<0, 0><<< grid, threads, 0, STREAM_DEFAULT >>>(
          batch, sequence, sequenceStartPositions,
          sequenceWidth, maxSequenceLength, numSequences);
    }
  }

  CHECK_SYNC("hl_sequence2batch_copy_padding failed");
}
__device__ inline float my_rsqrt(float x) { __device__ inline float my_rsqrt(float x) {
return rsqrtf(x); return rsqrtf(x);
} }
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef PADDLE_USE_DSO #ifdef PADDLE_USE_DSO
#include <mutex> #include <mutex>
#include <cuda_runtime.h>
#include "hl_dso_loader.h" #include "hl_dso_loader.h"
/** /**
......
...@@ -30,6 +30,8 @@ P_DEFINE_string(cuda_dir, ...@@ -30,6 +30,8 @@ P_DEFINE_string(cuda_dir,
"build-in function in cudart already ran before main entry). " "build-in function in cudart already ran before main entry). "
"If default, dlopen will search cuda from LD_LIBRARY_PATH"); "If default, dlopen will search cuda from LD_LIBRARY_PATH");
P_DEFINE_string(warpctc_dir, "", "Specify path for loading libwarpctc.so.");
static inline std::string join(const std::string& part1, static inline std::string join(const std::string& part1,
const std::string& part2) { const std::string& part2) {
// directory separator // directory separator
...@@ -92,27 +94,28 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, ...@@ -92,27 +94,28 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
*dso_handle = dlopen(dlPath.c_str(), dynload_flags); *dso_handle = dlopen(dlPath.c_str(), dynload_flags);
// if not found, search from default path // if not found, search from default path
if (nullptr == *dso_handle) { if (nullptr == *dso_handle) {
LOG(WARNING) << "Failed to find cuda library: " << dlPath; LOG(WARNING) << "Failed to find dynamic library: " << dlPath << " ("
<< dlerror() << ")";
dlPath = dso_name; dlPath = dso_name;
GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags); GetDsoHandleFromDefaultPath(dlPath, dso_handle, dynload_flags);
} }
} }
CHECK(nullptr != *dso_handle) << "Failed to find cuda library: " << dlPath CHECK(nullptr != *dso_handle) << "Failed to find dynamic library: " << dlPath
<< std::endl << " (" << dlerror() << ") \n"
<< "Please specify its path correctly using " << "Please specify its path correctly using "
"one of the following ways: \n" // NOLINT "one of the following ways: \n"
<< "Method 1. set cuda and cudnn lib path at " << "Method 1. set cuda and cudnn lib path at "
"runtime. " "runtime. "
<< "http://www.paddlepaddle.org/doc/ui/" << "http://www.paddlepaddle.org/doc/ui/"
"cmd_argument/" "cmd_argument/"
"argument_outline.html \n" // NOLINT "argument_outline.html \n"
<< "For instance, issue command: paddle train " << "For instance, issue command: paddle train "
"--use_gpu=1 " "--use_gpu=1 "
<< "--cuda_dir=/usr/local/cuda/lib64 " << "--cuda_dir=/usr/local/cuda/lib64 "
"--cudnn_dir=/usr/local/cudnn/lib " "--cudnn_dir=/usr/local/cudnn/lib "
"...\n" // NOLINT "...\n"
<< "Method 2. set environment variable " << "Method 2. set environment variable "
"LD_LIBRARY_PATH on Linux or " "LD_LIBRARY_PATH on Linux or "
...@@ -124,7 +127,7 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root, ...@@ -124,7 +127,7 @@ static inline void GetDsoHandleFromSearchPath(const std::string& search_root,
"DYLD_LIBRARY_PATH is impossible " "DYLD_LIBRARY_PATH is impossible "
<< "unless System Integrity Protection (SIP) " << "unless System Integrity Protection (SIP) "
"is disabled. However, " "is disabled. However, "
"method 1 " // NOLINT "method 1 "
<< "always work well."; << "always work well.";
} }
...@@ -159,3 +162,11 @@ void GetCurandDsoHandle(void** dso_handle) { ...@@ -159,3 +162,11 @@ void GetCurandDsoHandle(void** dso_handle) {
GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle); GetDsoHandleFromSearchPath(FLAGS_cuda_dir, "libcurand.so", dso_handle);
#endif #endif
} }
// Load the warp-ctc shared library, searching FLAGS_warpctc_dir first and
// falling back to the system default search path (see
// GetDsoHandleFromSearchPath). The handle is returned through *dso_handle.
void GetWarpCTCDsoHandle(void** dso_handle) {
  // Pick the platform-specific shared-library file name.
#if defined(__APPLE__) || defined(__OSX__)
  const char* dsoName = "libwarpctc.dylib";
#else
  const char* dsoName = "libwarpctc.so";
#endif
  GetDsoHandleFromSearchPath(FLAGS_warpctc_dir, dsoName, dso_handle);
}
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <mutex>
#include "hl_warpctc_wrap.h"
#include "hl_dso_loader.h"
#include "paddle/utils/Logging.h"
namespace dynload {

// Guards the one-time lazy load of the warp-ctc shared library.
std::once_flag warpctc_dso_flag;
void* warpctc_dso_handle = nullptr;

/**
 * The following macro definition can generate structs
 * (for each function) to dynamically load warp-ctc routines
 * via operator overloading. When PADDLE_USE_DSO is
 * false, you need to add the path of libwarp-ctc.so to
 * the linked-libs of paddle or to LD_PRELOAD.
 *
 * Fix: the symbol-pointer variable previously used `p_##_name`, which
 * token-pastes the literal token `_name` (yielding `p__name`) instead of
 * the macro parameter `__name`. It worked only by accident because the
 * variable is used immediately in the same scope; it now pastes the
 * parameter as intended.
 */
#ifdef PADDLE_USE_DSO
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                              \
  struct DynLoad__##__name {                                           \
    template <typename... Args>                                        \
    auto operator()(Args... args) -> decltype(__name(args...)) {       \
      using warpctcFunc = decltype(__name(args...)) (*)(Args...);      \
      std::call_once(                                                  \
          warpctc_dso_flag, GetWarpCTCDsoHandle, &warpctc_dso_handle); \
      void* p_##__name = dlsym(warpctc_dso_handle, #__name);           \
      return reinterpret_cast<warpctcFunc>(p_##__name)(args...);       \
    }                                                                  \
  } __name;  // struct DynLoad__##__name
#else
#define DYNAMIC_LOAD_WARPCTC_WRAP(__name)                    \
  struct DynLoad__##__name {                                 \
    template <typename... Args>                              \
    auto operator()(Args... args) -> decltype(__name(args...)) { \
      return __name(args...);                                \
    }                                                        \
  } __name;  // struct DynLoad__##__name
#endif

// include all needed warp-ctc functions
DYNAMIC_LOAD_WARPCTC_WRAP(get_warpctc_version)
DYNAMIC_LOAD_WARPCTC_WRAP(ctcGetStatusString)
DYNAMIC_LOAD_WARPCTC_WRAP(compute_ctc_loss)
DYNAMIC_LOAD_WARPCTC_WRAP(get_workspace_size)

#undef DYNAMIC_LOAD_WARPCTC_WRAP

} /* namespace dynload */
#define WARPCTC_GET_VERSION dynload::get_warpctc_version
#define WARPCTC_GET_STATUS_STRING dynload::ctcGetStatusString
#ifndef PADDLE_TYPE_DOUBLE
#define WARPCTC_COMPUTE_LOSS dynload::compute_ctc_loss
#define WARPCTC_GET_WORKSPACE_SIZE dynload::get_workspace_size
#else
#define WARPCTC_LOG_FATAL \
LOG(FATAL) << "warp-ctc [version " << g_warpctcVersion \
<< "] Error: not support double precision."
#define WARPCTC_COMPUTE_LOSS(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
#define WARPCTC_GET_WORKSPACE_SIZE(...) WARPCTC_LOG_FATAL(__VA_ARGS__)
#endif
/**
* Check build-in warp-ctc function using glog and it also
* support << operator for more details error info.
*/
static int g_warpctcVersion = -1;
#define CHECK_WARPCTC(warpctcStat) \
CHECK_EQ(CTC_STATUS_SUCCESS, warpctcStat) \
<< "warp-ctc [version " << g_warpctcVersion \
<< "] Error: " << WARPCTC_GET_STATUS_STRING(warpctcStat) << " "
/*
 * Initialize a warp-ctc options struct: records the library version in
 * g_warpctcVersion (used by error messages), selects CPU or GPU execution,
 * and sets the blank label index.
 *
 * When useGpu is true the build must have been compiled with NVCC
 * (__NVCC__); otherwise this aborts via LOG(FATAL).
 */
void hl_warpctc_init(const size_t blank,
                     bool useGpu,
                     hl_warpctc_options_t* options) {
  CHECK_NOTNULL(options);

  // Cache the library version for the CHECK_WARPCTC diagnostics.
  g_warpctcVersion = WARPCTC_GET_VERSION();

  if (useGpu) {
#ifdef __NVCC__
    options->loc = CTC_GPU;
    options->stream = STREAM_DEFAULT;
#else
    LOG(FATAL) << "[warpctc init] GPU is not enabled.";
#endif
  } else {
    options->loc = CTC_CPU;
    options->num_threads = 1;
  }

  options->blank_label = blank;
}
/*
 * Compute the CTC loss (and, when batchGrad is non-null, the input
 * gradient) by forwarding to warp-ctc's compute_ctc_loss.
 *
 * Note: batchGrad is deliberately NOT null-checked — per the header
 * contract, a null batchGrad means "loss only". All other pointers are
 * required. Any warp-ctc failure status aborts via CHECK_WARPCTC.
 */
void hl_warpctc_compute_loss(const real* batchInput,
                             real* batchGrad,
                             const int* cpuLabels,
                             const int* cpuLabelLengths,
                             const int* cpuInputLengths,
                             const size_t numClasses,
                             const size_t numSequences,
                             real* cpuCosts,
                             void* workspace,
                             hl_warpctc_options_t* options) {
  CHECK_NOTNULL(batchInput);
  CHECK_NOTNULL(cpuLabels);
  CHECK_NOTNULL(cpuLabelLengths);
  CHECK_NOTNULL(cpuInputLengths);
  CHECK_NOTNULL(cpuCosts);
  CHECK_NOTNULL(workspace);
  CHECK_NOTNULL(options);

  CHECK_WARPCTC(WARPCTC_COMPUTE_LOSS(batchInput,
                                     batchGrad,
                                     cpuLabels,
                                     cpuLabelLengths,
                                     cpuInputLengths,
                                     numClasses,
                                     numSequences,
                                     cpuCosts,
                                     workspace,
                                     *options));
}
/*
 * Query the workspace size (in bytes) required by warp-ctc for the given
 * batch configuration; the result is written to *bytes. warp-ctc performs
 * no allocations itself, so the caller must allocate this buffer before
 * calling hl_warpctc_compute_loss.
 */
void hl_warpctc_get_workspace_size(const int* cpuLabelLengths,
                                   const int* cpuInputLengths,
                                   const size_t numClasses,
                                   const size_t numSequences,
                                   hl_warpctc_options_t* options,
                                   size_t* bytes) {
  CHECK_NOTNULL(cpuLabelLengths);
  CHECK_NOTNULL(cpuInputLengths);
  CHECK_NOTNULL(options);
  CHECK_NOTNULL(bytes);

  CHECK_WARPCTC(WARPCTC_GET_WORKSPACE_SIZE(cpuLabelLengths,
                                           cpuInputLengths,
                                           numClasses,
                                           numSequences,
                                           *options,
                                           bytes));
}
...@@ -289,7 +289,7 @@ void forward(Argument& act) { ...@@ -289,7 +289,7 @@ void forward(Argument& act) {
useGpu(act.deviceId)); useGpu(act.deviceId));
act.in->copyFrom(*act.value); act.in->copyFrom(*act.value);
act.value->abs(*act.value); act.value->abs2(*act.value);
} }
void backward(Argument& act) { act.grad->absDerivative(*act.in); } void backward(Argument& act) { act.grad->absDerivative(*act.in); }
...@@ -311,7 +311,7 @@ void forward(Argument& act) { ...@@ -311,7 +311,7 @@ void forward(Argument& act) {
useGpu(act.deviceId)); useGpu(act.deviceId));
act.in->copyFrom(*act.value); act.in->copyFrom(*act.value);
act.value->square(*act.value); act.value->square2(*act.value);
} }
void backward(Argument& act) { act.grad->squareDerivative(*act.in); } void backward(Argument& act) { act.grad->squareDerivative(*act.in); }
...@@ -324,7 +324,7 @@ END_DEFINE_ACTIVATION(square) ...@@ -324,7 +324,7 @@ END_DEFINE_ACTIVATION(square)
* \f] * \f]
*/ */
BEGIN_DEFINE_ACTIVATION(exponential) BEGIN_DEFINE_ACTIVATION(exponential)
void forward(Argument& act) { act.value->exp(*act.value); } void forward(Argument& act) { act.value->exp2(*act.value); }
void backward(Argument& act) { act.grad->expDerivative(*act.value); } void backward(Argument& act) { act.grad->expDerivative(*act.value); }
END_DEFINE_ACTIVATION(exponential) END_DEFINE_ACTIVATION(exponential)
...@@ -345,7 +345,7 @@ void forward(Argument& act) { ...@@ -345,7 +345,7 @@ void forward(Argument& act) {
useGpu(act.deviceId)); useGpu(act.deviceId));
act.in->copyFrom(*act.value); act.in->copyFrom(*act.value);
act.value->log(*act.value); act.value->log2(*act.value);
} }
void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); } void backward(Argument& act) { act.grad->dotDiv(*act.grad, *act.in); }
......
...@@ -40,7 +40,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) { ...@@ -40,7 +40,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
savedMean_->mulScalar(1.0 / numSamples); // E[x] savedMean_->mulScalar(1.0 / numSamples); // E[x]
tmpMat_->assign(*mat); tmpMat_->assign(*mat);
tmpMat_->square(); tmpMat_->square2();
savedInvVar_->zeroMem(); savedInvVar_->zeroMem();
savedInvVar_->accumulateColSum(*tmpMat_); savedInvVar_->accumulateColSum(*tmpMat_);
savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2] savedInvVar_->mulScalar(1.0 / numSamples); // E[x^2]
...@@ -54,7 +54,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) { ...@@ -54,7 +54,7 @@ void BatchNormalizationLayer::calMeanAndStd(const MatrixPtr& mat) {
calMovingMeanAndVar(); calMovingMeanAndVar();
savedInvVar_->subScalar(-EPS); savedInvVar_->subScalar(-EPS);
savedInvVar_->sqrt(*savedInvVar_); savedInvVar_->sqrt2(*savedInvVar_);
} }
void BatchNormalizationLayer::calMovingMeanAndVar() { void BatchNormalizationLayer::calMovingMeanAndVar() {
...@@ -85,7 +85,7 @@ void BatchNormalizationLayer::setMeanAndStd() { ...@@ -85,7 +85,7 @@ void BatchNormalizationLayer::setMeanAndStd() {
savedInvVar_->downClip(real(0.0)); savedInvVar_->downClip(real(0.0));
savedInvVar_->subScalar(-EPS); savedInvVar_->subScalar(-EPS);
savedInvVar_->sqrt(*savedInvVar_); savedInvVar_->sqrt2(*savedInvVar_);
} }
void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) { void BatchNormalizationLayer::expandMat(const MatrixPtr& in, MatrixPtr& out) {
......
...@@ -115,12 +115,12 @@ void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output, ...@@ -115,12 +115,12 @@ void MultiClassCrossEntropyWithSelfNorm::forwardImp(Matrix& output,
Matrix& target) { Matrix& target) {
Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_); Matrix::resizeOrCreate(sftMaxSum_, output.getHeight(), 1, false, useGpu_);
output.rowSum(*sftMaxSum_); output.rowSum(*sftMaxSum_);
sftMaxSum_->log(); sftMaxSum_->log2();
target.oneHotCrossEntropy(output, *label.ids); target.oneHotCrossEntropy(output, *label.ids);
target.add(*sftMaxSum_); target.add(*sftMaxSum_);
sftMaxSum_->square(); sftMaxSum_->square2();
target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha()); target.add(*sftMaxSum_, config_.softmax_selfnorm_alpha());
} }
...@@ -131,12 +131,12 @@ void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output, ...@@ -131,12 +131,12 @@ void MultiClassCrossEntropyWithSelfNorm::backwardImp(Matrix& output,
output.rowSum(*sftMaxSum_); output.rowSum(*sftMaxSum_);
Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_); Matrix::resizeOrCreate(sumInv_, output.getHeight(), 1, false, useGpu_);
sftMaxSum_->reciprocal(*sumInv_); sftMaxSum_->reciprocal2(*sumInv_);
outputG.oneHotCrossEntropyBp(output, *label.ids); outputG.oneHotCrossEntropyBp(output, *label.ids);
outputG.addColumnVector(*sumInv_); outputG.addColumnVector(*sumInv_);
sftMaxSum_->log(); sftMaxSum_->log2();
sumInv_->dotMul(*sumInv_, *sftMaxSum_); sumInv_->dotMul(*sumInv_, *sftMaxSum_);
sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha()); sumInv_->mulScalar(2 * config_.softmax_selfnorm_alpha());
......
...@@ -316,12 +316,12 @@ void Layer::showOutputStats() { ...@@ -316,12 +316,12 @@ void Layer::showOutputStats() {
auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get()); auto tmpMat = dynamic_cast<CpuSparseMatrix*>(outSquare.get());
min = tmpMat->getMin(); min = tmpMat->getMin();
max = tmpMat->getMax(); max = tmpMat->getMax();
tmpMat->square(); tmpMat->square2();
LOG(INFO) << "show statistics of [none zero values] in sparse matrix"; LOG(INFO) << "show statistics of [none zero values] in sparse matrix";
} else { } else {
min = outSquare->getMin(); min = outSquare->getMin();
max = outSquare->getMax(); max = outSquare->getMax();
outSquare->square(); outSquare->square2();
} }
real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean; real std = (outSquare->getSum() / outSquare->getElementCnt()) - mean * mean;
std = std > 0 ? std : 0; std = std > 0 ? std : 0;
......
...@@ -60,7 +60,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) { ...@@ -60,7 +60,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
expX_->assign(*matX); expX_->assign(*matX);
// subtract max to avoid overflow or underflow // subtract max to avoid overflow or underflow
expX_->mul(maxX_, ones_, (real)-1, (real)1); expX_->mul(maxX_, ones_, (real)-1, (real)1);
expX_->exp(); expX_->exp2();
real* a = a_->getData(); real* a = a_->getData();
real* b = b_->getData(); real* b = b_->getData();
...@@ -69,7 +69,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) { ...@@ -69,7 +69,7 @@ real LinearChainCRF::forward(real* x, int* s, int length) {
real* expX = expX_->getData(); real* expX = expX_->getData();
real* maxX = maxX_->getData(); real* maxX = maxX_->getData();
expW_->exp(*w_); expW_->exp2(*w_);
real* expW = expW_->getData(); real* expW = expW_->getData();
for (int i = 0; i < numClasses_; ++i) { for (int i = 0; i < numClasses_; ++i) {
......
...@@ -99,7 +99,7 @@ void PowerLayer::backward(const UpdateCallback& callback) { ...@@ -99,7 +99,7 @@ void PowerLayer::backward(const UpdateCallback& callback) {
Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_); Matrix::resizeOrCreate(tmpMtx, batchSize, dataDim, false, useGpu_);
if (inG0) { if (inG0) {
tmpMtx->log(*inV1); tmpMtx->log2(*inV1);
tmpMtx->dotMul(*tmpMtx, *outV); tmpMtx->dotMul(*tmpMtx, *outV);
// inG0 += outG .* (log(inV1) * outV) // inG0 += outG .* (log(inV1) * outV)
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "WarpCTCLayer.h"
namespace paddle {
REGISTER_LAYER(warp_ctc, WarpCTCLayer);
// Validates the layer configuration and caches the parameters used by
// forward()/backward(). Requires exactly two inputs:
//   input 0: per-frame class scores (sequence output, no softmax applied),
//            whose size must equal numClasses_;
//   input 1: the label id sequence.
bool WarpCTCLayer::init(const LayerMap& layerMap,
const ParameterMap& parameterMap) {
/* Initialize the base Layer class first. */
Layer::init(layerMap, parameterMap);

CHECK_EQ(inputLayers_.size(), 2UL);

/* The inputLayers_[0] must be sequence output without softmax */
numClasses_ = config_.size();
CHECK_GE(numClasses_, 2UL);
CHECK_EQ(numClasses_, inputLayers_[0]->getSize());

// The blank label must be a valid class id in [0, numClasses_).
blank_ = config_.blank();
CHECK_GE(blank_, 0UL);
CHECK_LT(blank_, numClasses_);

normByTimes_ = config_.norm_by_times();

// We don't need sequenceStartPositions because each sample of output_ is
// for the cost of one sequence.
setNeedSequenceInfo(false);

return true;
}
// Computes the per-sequence CTC cost with warp-ctc.
//
// Steps: (1) derive per-sequence lengths from the input/label start
// positions, (2) repack the sequence-layout input into the batch-padded
// layout warp-ctc expects (batchValue_), (3) query and allocate the
// required workspace, (4) run warp-ctc, which fills cpuCosts_ and the
// batch-padded gradients batchGrad_ consumed later by backward(), and
// (5) copy the costs into this layer's output (one row per sequence).
//
// Fix vs. original: removed the dead store `maxSequenceLength_ = 0;`
// that was immediately overwritten by the std::max_element result.
void WarpCTCLayer::forward(PassType passType) {
  Layer::forward(passType);

  const Argument& output = getInput(0);
  const Argument& labels = getInput(1);

  CHECK(output.sequenceStartPositions);
  CHECK(labels.sequenceStartPositions);
  CHECK(labels.ids);

  size_t numSequences = labels.sequenceStartPositions->getSize() - 1;
  CHECK_EQ(numSequences, output.sequenceStartPositions->getSize() - 1);

  // One cost value per sequence.
  resizeOutput(numSequences, 1);

  const int* cpuLabelStartPositions =
      labels.sequenceStartPositions->getData(false);
  const int* cpuOutputStartPositions =
      output.sequenceStartPositions->getData(false);

  // warp-ctc wants explicit per-sequence lengths, in CPU memory.
  std::vector<int> cpuLabelLengths(numSequences);
  std::vector<int> cpuOutputLengths(numSequences);
  for (size_t i = 0; i < numSequences; i++) {
    cpuLabelLengths[i] =
        cpuLabelStartPositions[i + 1] - cpuLabelStartPositions[i];
    cpuOutputLengths[i] =
        cpuOutputStartPositions[i + 1] - cpuOutputStartPositions[i];
  }

  /* Get the maximum sequence length */
  maxSequenceLength_ = *std::max_element(
      cpuOutputLengths.data(), cpuOutputLengths.data() + numSequences);

  // Batch-padded buffers: maxSequenceLength_ time steps, each holding
  // numSequences frames of numClasses_ scores (shorter sequences padded).
  Matrix::resizeOrCreate(batchValue_,
                         /* height */ numSequences * maxSequenceLength_,
                         /* width */ numClasses_,
                         /* trans */ false,
                         /* useGpu */ useGpu_);
  Matrix::resizeOrCreate(batchGrad_,
                         /* height */ numSequences * maxSequenceLength_,
                         /* width */ numClasses_,
                         /* trans */ false,
                         /* useGpu */ useGpu_);
  batchGrad_->zeroMem();

  seq2batchPadding(output.value, batchValue_, output.sequenceStartPositions);

  /* labels always in CPU memory */
  IVector::resizeOrCreate(cpuLabels_,
                          /* size */ (labels.ids)->getSize(),
                          /* useGpu */ false);
  cpuLabels_->copyFrom(*(labels.ids));

  /* costs always in CPU memory */
  Matrix::resizeOrCreate(cpuCosts_,
                         /* height */ numSequences,
                         /* width */ 1,
                         /* trans */ false,
                         /* useGpu */ false);

  /* Init warp-ctc options */
  hl_warpctc_options_t options;
  hl_warpctc_init(blank_, useGpu_, &options);

  /* Get the needed workspace size */
  size_t workspaceBytes = 0;
  hl_warpctc_get_workspace_size(cpuLabelLengths.data(),
                                cpuOutputLengths.data(),
                                numClasses_,
                                numSequences,
                                &options,
                                &workspaceBytes);
  CHECK_GT(workspaceBytes, 0UL);

  // Round the byte count up to whole `real` elements.
  size_t workspaceLength = workspaceBytes / sizeof(real) + 1;
  Vector::resizeOrCreate(workspace_,
                         /* size */ workspaceLength,
                         /* useGpu */ useGpu_);

  hl_warpctc_compute_loss(batchValue_->getData(),
                          batchGrad_->getData(),
                          cpuLabels_->getData(),
                          cpuLabelLengths.data(),
                          cpuOutputLengths.data(),
                          numClasses_,
                          numSequences,
                          cpuCosts_->getData(),
                          workspace_->getData(),
                          &options);

  /* Copy the costs */
  output_.value->copyFrom(*cpuCosts_);
}
// Copies the gradients warp-ctc produced during forward() (stored in
// batchGrad_, batch-padded layout) back into the first input's grad matrix
// in sequence layout, scaling each sequence by 1/length when normByTimes_
// is set. This layer has no parameters, so the callback is unused.
void WarpCTCLayer::backward(const UpdateCallback& callback) {
(void)callback;

const Argument& output = getInput(0);
CHECK(batchGrad_);  // must have run forward() first

batch2seqPadding(
output.grad, batchGrad_, output.sequenceStartPositions, normByTimes_);
}
void WarpCTCLayer::seq2batchPadding(const MatrixPtr& seqValue,
MatrixPtr& batchValue,
const ICpuGpuVectorPtr& seqStartPositions) {
size_t numSequences = seqStartPositions->getSize() - 1;
const int* seqStartPositionsData = seqStartPositions->getData(useGpu_);
real* seqData = seqValue->getData();
real* batchData = batchValue->getData();
if (useGpu_) {
hl_sequence2batch_copy_padding(batchData,
seqData,
seqStartPositionsData,
numClasses_,
maxSequenceLength_,
numSequences,
false,
true);
} else {
for (size_t i = 0; i < maxSequenceLength_; i++) {
for (size_t j = 0; j < numSequences; j++) {
size_t sequenceStart = seqStartPositionsData[j];
size_t sequenceLength =
seqStartPositionsData[j + 1] - seqStartPositionsData[j];
if (i < sequenceLength) {
memcpy(batchData + (i * numSequences + j) * numClasses_,
seqData + (sequenceStart + i) * numClasses_,
numClasses_ * sizeof(real));
} else {
memset(batchData + (i * numSequences + j) * numClasses_,
0,
numClasses_ * sizeof(real));
}
}
}
}
}
void WarpCTCLayer::batch2seqPadding(const MatrixPtr& seqValue,
MatrixPtr& batchValue,
const ICpuGpuVectorPtr& seqStartPositions,
bool normByTimes) {
size_t numSequences = seqStartPositions->getSize() - 1;
const int* seqStartPositionsData = seqStartPositions->getData(useGpu_);
real* seqData = seqValue->getData();
real* batchData = batchValue->getData();
if (useGpu_) {
hl_sequence2batch_copy_padding(batchData,
seqData,
seqStartPositionsData,
numClasses_,
maxSequenceLength_,
numSequences,
normByTimes,
false);
} else {
for (size_t i = 0; i < numSequences; i++) {
int sequenceStart = seqStartPositionsData[i];
int sequenceLength =
seqStartPositionsData[i + 1] - seqStartPositionsData[i];
real scale = normByTimes ? (1.0f / (real)sequenceLength) : 1.0f;
for (int j = 0; j < sequenceLength; j++) {
for (size_t k = 0; k < numClasses_; k++) {
seqData[(sequenceStart + j) * numClasses_ + k] =
batchData[(j * numSequences + i) * numClasses_ + k] * scale;
}
}
}
}
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "Layer.h"
namespace paddle {
/**
 * @brief A layer integrating the open-source warp-ctc library
 * <https://github.com/baidu-research/warp-ctc> to compute connectionist
 * temporal classification cost.
 *
 * Takes two inputs: per-frame class scores (no softmax applied) and the
 * label id sequence; outputs one cost value per sequence.
 *
 * The config file api is warp_ctc_layer.
 */
class WarpCTCLayer : public Layer {
public:
explicit WarpCTCLayer(const LayerConfig& config) : Layer(config) {}
~WarpCTCLayer() {}

/// Validates the two-input configuration and caches blank_/normByTimes_.
virtual bool init(const LayerMap& layerMap, const ParameterMap& parameterMap);
/// Computes per-sequence CTC costs via warp-ctc; also fills batchGrad_.
virtual void forward(PassType passType);
/// Scatters the warp-ctc gradients back to the first input's grad.
virtual void backward(const UpdateCallback& callback);

protected:
/**
* sequence matrix and batch matrix copy:
* sequence (s0, s0, s0, s0; s1, s1; s2, s2, s2; s3)
* batch (s0, s1, s2, s3; s0, s1, s2, 0; s0, 0, s2, 0; s0, 0, 0, 0)
*/
void seq2batchPadding(const MatrixPtr& seqValue,
MatrixPtr& batchValue,
const ICpuGpuVectorPtr& seqStartPositions);
// Inverse copy, optionally scaling each sequence by 1/length.
void batch2seqPadding(const MatrixPtr& seqValue,
MatrixPtr& batchValue,
const ICpuGpuVectorPtr& seqStartPositions,
bool normByTimes);

protected:
size_t numClasses_;         // number of output classes (blank included)
size_t blank_;              // class id used as the CTC blank label
size_t maxSequenceLength_;  // longest input sequence in the current batch
bool normByTimes_;          // scale gradients by 1/sequenceLength if true
MatrixPtr batchValue_;      // batch-padded copy of the input values
MatrixPtr batchGrad_;       // batch-padded gradients produced by warp-ctc
VectorPtr workspace_;       // scratch memory required by warp-ctc
IVectorPtr cpuLabels_;      // labels, always kept in CPU memory
MatrixPtr cpuCosts_;        // per-sequence costs, always in CPU memory
};
} // namespace paddle
...@@ -77,6 +77,17 @@ add_unittest(test_RecurrentLayer ...@@ -77,6 +77,17 @@ add_unittest(test_RecurrentLayer
test_RecurrentLayer.cpp test_RecurrentLayer.cpp
TestUtil.cpp) TestUtil.cpp)
############### test_WarpCTCLayer #######################
# NOTE(review): guarded on WITH_DOUBLE — presumably because warp-ctc only
# provides a single-precision interface; confirm against the warp-ctc build.
if(NOT WITH_DOUBLE)
    add_unittest_without_exec(test_WarpCTCLayer
        test_WarpCTCLayer.cpp
        TestUtil.cpp)

    # The test locates the warp-ctc shared library at runtime via
    # --warpctc_dir and runs from the paddle source directory.
    add_test(NAME test_WarpCTCLayer
        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${PROJ_ROOT}/warp-ctc/build
        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
endif()
############### test_RecurrentGradientMachine ############### ############### test_RecurrentGradientMachine ###############
# TODO(yuyang18): There is some bug in test_RecurrentGradientMachine # TODO(yuyang18): There is some bug in test_RecurrentGradientMachine
# I will fix it. # I will fix it.
......
...@@ -15,16 +15,16 @@ limitations under the License. */ ...@@ -15,16 +15,16 @@ limitations under the License. */
#ifndef PADDLE_NO_PYTHON #ifndef PADDLE_NO_PYTHON
#include <gtest/gtest.h> #include <gtest/gtest.h>
#include <fstream> #include <fstream>
#include "paddle/utils/Util.h"
#include "paddle/utils/PythonUtil.h"
#include "paddle/gserver/dataproviders/DataProvider.h" #include "paddle/gserver/dataproviders/DataProvider.h"
#include "paddle/utils/PythonUtil.h"
#include "paddle/utils/Util.h"
P_DEFINE_string(train_list, "unittest.list", "file list for unittest"); P_DEFINE_string(train_list, "unittest.list", "file list for unittest");
namespace paddle { namespace paddle {
namespace unittest { namespace unittest {
namespace pydp2 { namespace pydp2 {
extern void setOnPoolFilledHook(const std::function<void(size_t)>& func); extern void setOnPoolFilledHook(const std::function<void(size_t)> &func);
extern void clearOnPoolFilledHook(); extern void clearOnPoolFilledHook();
} // namespace pydp2 } // namespace pydp2
...@@ -33,8 +33,8 @@ extern void clearOnPoolFilledHook(); ...@@ -33,8 +33,8 @@ extern void clearOnPoolFilledHook();
const paddle::real epsilon = 1e-5; const paddle::real epsilon = 1e-5;
static inline int64_t readDataBatch(paddle::DataBatch* batch, static inline int64_t readDataBatch(paddle::DataBatch *batch,
const std::string& funcName, const std::string &funcName,
int64_t batchSize = 65535) { int64_t batchSize = 65535) {
paddle::DataConfig config; paddle::DataConfig config;
config.set_type("py2"); config.set_type("py2");
...@@ -143,7 +143,7 @@ TEST(PyDataProvider2, init_hook) { ...@@ -143,7 +143,7 @@ TEST(PyDataProvider2, init_hook) {
paddle::DataBatch batch; paddle::DataBatch batch;
int64_t num = provider->getNextBatchInternal(100000, &batch); int64_t num = provider->getNextBatchInternal(100000, &batch);
ASSERT_EQ(num, 200); ASSERT_EQ(num, 200);
auto& mat = batch.getStreams()[0].value; auto &mat = batch.getStreams()[0].value;
ASSERT_EQ((size_t)mat->getWidth(), (size_t)20); ASSERT_EQ((size_t)mat->getWidth(), (size_t)20);
for (size_t i = 0; i < 200; ++i) { for (size_t i = 0; i < 200; ++i) {
for (size_t j = 0; j < 20; ++j) { for (size_t j = 0; j < 20; ++j) {
...@@ -170,7 +170,7 @@ TEST(PyDataProvider2, sparse_no_value_no_seq) { ...@@ -170,7 +170,7 @@ TEST(PyDataProvider2, sparse_no_value_no_seq) {
CHECK(csm != nullptr); CHECK(csm != nullptr);
for (int i = 0; i < 200; ++i) { for (int i = 0; i < 200; ++i) {
CHECK_EQ(csm->getColNum(i), (size_t)10); CHECK_EQ(csm->getColNum(i), (size_t)10);
int* cols = csm->getRowCols(i); int *cols = csm->getRowCols(i);
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
CHECK_EQ(cols[j], (i + 1) * (j + 1)); CHECK_EQ(cols[j], (i + 1) * (j + 1));
} }
...@@ -185,8 +185,8 @@ TEST(PyDataProvider2, sparse_value_no_seq) { ...@@ -185,8 +185,8 @@ TEST(PyDataProvider2, sparse_value_no_seq) {
CHECK(csm != nullptr); CHECK(csm != nullptr);
for (int i = 0; i < 200; ++i) { for (int i = 0; i < 200; ++i) {
CHECK_EQ(csm->getColNum(i), (size_t)10); CHECK_EQ(csm->getColNum(i), (size_t)10);
int* cols = csm->getRowCols(i); int *cols = csm->getRowCols(i);
real* dat = csm->getRowValues(i); real *dat = csm->getRowValues(i);
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
EXPECT_EQ(cols[j], (i + 1) * (j + 1)); EXPECT_EQ(cols[j], (i + 1) * (j + 1));
EXPECT_EQ(dat[j], real(j) / real(i + 1)); EXPECT_EQ(dat[j], real(j) / real(i + 1));
...@@ -197,7 +197,7 @@ TEST(PyDataProvider2, sparse_value_no_seq) { ...@@ -197,7 +197,7 @@ TEST(PyDataProvider2, sparse_value_no_seq) {
TEST(PyDataProvider2, index_seq) { TEST(PyDataProvider2, index_seq) {
paddle::DataBatch batch; paddle::DataBatch batch;
CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200); CHECK_EQ(readDataBatch(&batch, "test_index_seq"), 200);
auto& arg = batch.getStreams()[0]; auto &arg = batch.getStreams()[0];
CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2); CHECK_EQ((int)arg.ids->getSize(), (200 + 1) * 200 / 2);
size_t tmp = 0; size_t tmp = 0;
for (size_t i = 0; i < 200; ++i) { // CHECK DATA CORRECT for (size_t i = 0; i < 200; ++i) { // CHECK DATA CORRECT
...@@ -219,7 +219,7 @@ TEST(PyDataProvider2, index_seq) { ...@@ -219,7 +219,7 @@ TEST(PyDataProvider2, index_seq) {
TEST(PyDataProvider2, index_sub_seq) { TEST(PyDataProvider2, index_sub_seq) {
paddle::DataBatch batch; paddle::DataBatch batch;
ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200); ASSERT_EQ(readDataBatch(&batch, "test_index_sub_seq"), 200);
auto& arg = batch.getStreams()[0]; auto &arg = batch.getStreams()[0];
size_t tmp = 0; size_t tmp = 0;
for (size_t i = 0; i < 200; ++i) { for (size_t i = 0; i < 200; ++i) {
for (size_t j = 0; j < i + 1; ++j) { for (size_t j = 0; j < i + 1; ++j) {
...@@ -268,7 +268,7 @@ TEST(PyDataProvider2, min_pool_size) { ...@@ -268,7 +268,7 @@ TEST(PyDataProvider2, min_pool_size) {
} }
}); });
while (true) { while (true) {
size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
if (realBatchSize) { if (realBatchSize) {
totalData -= realBatchSize; totalData -= realBatchSize;
} else { } else {
...@@ -291,7 +291,7 @@ TEST(PyDataProvider2, can_over_batch_size) { ...@@ -291,7 +291,7 @@ TEST(PyDataProvider2, can_over_batch_size) {
provider->reset(); provider->reset();
constexpr size_t batchSize = 100; constexpr size_t batchSize = 100;
while (true) { while (true) {
size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
if (realBatchSize) { if (realBatchSize) {
CHECK_LE(realBatchSize, batchSize); CHECK_LE(realBatchSize, batchSize);
} else { } else {
...@@ -317,12 +317,12 @@ TEST(PyDataProvider2, input_order) { ...@@ -317,12 +317,12 @@ TEST(PyDataProvider2, input_order) {
provider->reset(); provider->reset();
constexpr size_t batchSize = 100; constexpr size_t batchSize = 100;
while (true) { while (true) {
size_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch); int64_t realBatchSize = provider->getNextBatchInternal(batchSize, &batch);
if (!realBatchSize) { if (!realBatchSize) {
break; break;
} }
ASSERT_EQ(batch.getStreams().size(), (size_t)2); ASSERT_EQ(batch.getStreams().size(), static_cast<size_t>(2));
for (size_t i = 0; i < realBatchSize; ++i) { for (int64_t i = 0; i < realBatchSize; ++i) {
ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0); ASSERT_EQ(batch.getStream(0).ids->getData()[i], 0);
ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1); ASSERT_EQ(batch.getStream(1).ids->getData()[i], 1);
} }
...@@ -341,11 +341,11 @@ TEST(PyDataProvider2, test_check) { ...@@ -341,11 +341,11 @@ TEST(PyDataProvider2, test_check) {
paddle::DataProvider::create(config, false)); paddle::DataProvider::create(config, false));
provider->reset(); provider->reset();
while (true) { while (true) {
size_t realBatchSize = provider->getNextBatchInternal(100, &batch); int64_t realBatchSize = provider->getNextBatchInternal(100, &batch);
if (!realBatchSize) { if (!realBatchSize) {
break; break;
} else { } else {
auto& ivec = batch.getStream(0).ids; auto &ivec = batch.getStream(0).ids;
for (size_t i = 0; i < ivec->getSize(); ++i) { for (size_t i = 0; i < ivec->getSize(); ++i) {
CHECK_LT(ivec->getData()[i], 10); CHECK_LT(ivec->getData()[i], 10);
} }
...@@ -370,7 +370,30 @@ TEST(PyDataProvider2, multiThread) { ...@@ -370,7 +370,30 @@ TEST(PyDataProvider2, multiThread) {
provider.reset(); provider.reset();
} }
int main(int argc, char** argv) { TEST(PyDataProvider2, minPoolSizeWithCache) {
paddle::DataConfig config;
config.set_type("py2");
config.set_files(FLAGS_train_list.c_str());
config.set_load_data_module("test_PyDataProvider2");
config.set_load_data_object("test_min_pool_size_with_cache");
config.set_async_load_data(true);
std::unique_ptr<paddle::DataProvider> provider(
paddle::DataProvider::create(config, false));
paddle::DataBatch batch;
for (int i = 0; i < 10; ++i) {
provider->reset();
int64_t sum = 0;
while (int64_t actualNum = provider->getNextBatch(100, &batch)) {
sum += actualNum;
}
ASSERT_EQ(1 << 20, sum);
}
}
int main(int argc, char **argv) {
testing::InitGoogleTest(&argc, argv); testing::InitGoogleTest(&argc, argv);
paddle::initMain(argc, argv); paddle::initMain(argc, argv);
paddle::initPython(argc, argv); paddle::initPython(argc, argv);
......
...@@ -111,3 +111,13 @@ def test_check(settings, filename): ...@@ -111,3 +111,13 @@ def test_check(settings, filename):
if i < 10: if i < 10:
yield_good_value = True yield_good_value = True
yield i yield i
@provider(
    input_types=[index_slot(10)],
    min_pool_size=1000,
    cache=CacheType.CACHE_PASS_IN_MEM, )
def test_min_pool_size_with_cache(settings, filename):
    """Yield 2**20 random integer labels in [0, 9].

    Exercises a provider configured with both ``min_pool_size`` and
    ``CACHE_PASS_IN_MEM``; the companion C++ test iterates multiple passes
    and asserts each pass still delivers all 2**20 samples.
    """
    import random
    for _ in xrange(2**20):
        yield random.randint(0, 9)
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include <paddle/utils/Version.h>

#include <vector>

#include "ModelConfig.pb.h"
#include "TestUtil.h"
#include "paddle/gserver/layers/CTCLayer.h"
#include "paddle/gserver/layers/DataLayer.h"
#include "paddle/gserver/layers/Layer.h"
#include "paddle/gserver/layers/WarpCTCLayer.h"
using namespace paddle; // NOLINT
using namespace std; // NOLINT
P_DECLARE_bool(use_gpu);
/**
 * Return a host-readable pointer to the matrix's data. CPU matrices are
 * returned directly; GPU matrices are copied to a CPU matrix first.
 *
 * Fix vs. original: the original returned cpuMatrix->getData() from a local
 * MatrixPtr that was destroyed when the function returned, handing the
 * caller a dangling pointer (use-after-free). The CPU copies are now kept
 * alive in a function-local static cache for the remainder of the test run.
 * Test-only helper: not thread-safe, and the cache grows per GPU call.
 */
const real* getData(const Matrix& matrix) {
  if (matrix.useGpu()) {
    // Owns every CPU copy so returned pointers stay valid.
    static std::vector<MatrixPtr> cpuCopies;
    MatrixPtr cpuMatrix = Matrix::create(
        matrix.getHeight(), matrix.getWidth(), matrix.isTransposed(), false);
    cpuMatrix->copyFrom(matrix);
    cpuCopies.push_back(cpuMatrix);
    return cpuMatrix->getData();
  } else {
    return matrix.getData();
  }
}
/**
 * Count the elements of matrix1 and matrix2 that differ by more than a
 * precision-dependent tolerance; EXPECT zero mismatches and return the
 * mismatch count. Both matrices must have identical shape and transpose
 * flag (fatal CHECK otherwise).
 */
int checkError(const Matrix& matrix1, const Matrix& matrix2) {
  CHECK_EQ(matrix1.getHeight(), matrix2.getHeight());
  CHECK_EQ(matrix1.getWidth(), matrix2.getWidth());
  CHECK_EQ(matrix1.isTransposed(), matrix2.isTransposed());
#ifndef PADDLE_TYPE_DOUBLE
  real err = 1e-3;
#else
  real err = 1e-10;
#endif

  const real* data1 = getData(matrix1);
  const real* data2 = getData(matrix2);

  // Data is contiguous row-major, so a single flat loop covers every
  // (row, col) pair of the original nested loops.
  size_t numElements = matrix1.getHeight() * matrix1.getWidth();
  int count = 0;
  for (size_t idx = 0; idx < numElements; idx++) {
    if (fabs(data1[idx] - data2[idx]) > err) {
      ++count;
    }
  }

  EXPECT_EQ(count, 0) << "There are " << count << " different element.";
  return count;
}
// Fills `data` with a random value matrix, a zeroed gradient matrix of the
// same shape, and randomly generated sequence start positions.
void initArgument(size_t batchSize,
int layerSize,
bool useGpu,
Argument& data) {
data.value = Matrix::create(batchSize, layerSize, false, useGpu);
data.grad = Matrix::create(batchSize, layerSize, false, useGpu);
// randomizeUniform() then shift by -0.5 — presumably centering uniform
// [0, 1) samples around zero; TODO confirm randomizeUniform's range.
data.value->randomizeUniform();
data.value->add(-0.5);
data.grad->zeroMem();

generateSequenceStartPositions(batchSize, data.sequenceStartPositions);
}
// Builds a "data" layer of the given width, feeds it `data`, and runs one
// forward pass so downstream layers can consume its output immediately.
LayerPtr createDataLayer(
string name, size_t batchSize, int layerSize, bool useGpu, Argument& data) {
LayerConfig layerConfig;
layerConfig.set_name(name);
layerConfig.set_type("data");
layerConfig.set_size(layerSize);
LayerPtr layer = LayerPtr(new DataLayer(layerConfig));

DataLayerPtr dataLayer = std::dynamic_pointer_cast<DataLayer>(layer);
dataLayer->setData(data);
dataLayer->forward(PASS_GC);

return layer;
}
// Builds a "data" layer carrying random label ids plus random sequence
// boundaries, and runs one forward pass.
LayerPtr createLabelLayer(string name,
size_t batchSize,
size_t numClasses,
bool useGpu) {
LayerConfig layerConfig;
layerConfig.set_name(name);
layerConfig.set_type("data");
layerConfig.set_size(1);  // one label id per position
LayerPtr layer = LayerPtr(new DataLayer(layerConfig));

Argument data;
data.ids = IVector::create(batchSize, useGpu);
// rand(numClasses - 1): presumably draws ids below numClasses - 1, keeping
// the last id free for the blank label set by createWarpCTCLayer — confirm.
data.ids->rand(numClasses - 1);

generateSequenceStartPositions(batchSize, data.sequenceStartPositions);

DataLayerPtr labelLayer = std::dynamic_pointer_cast<DataLayer>(layer);
labelLayer->setData(data);
labelLayer->forward(PASS_GC);

return layer;
}
// Builds the reference "ctc" layer over the given data and label layers,
// applies softmax to the data layer's output (the ctc layer expects
// probabilities), and runs one forward + backward pass.
LayerPtr createCTCLayer(string name,
size_t numClasses,
bool useGpu,
bool normByTimes,
LayerPtr dataLayer,
LayerPtr labelLayer) {
LayerMap layerMap;
layerMap[dataLayer->getName()] = dataLayer;
layerMap[labelLayer->getName()] = labelLayer;

ParameterMap parameterMap;

LayerConfig layerConfig;
layerConfig.set_name(name);
layerConfig.set_type("ctc");
layerConfig.set_size(numClasses);
layerConfig.set_norm_by_times(normByTimes);

// Wire input 0 (scores) and input 1 (labels).
layerConfig.add_inputs();
LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
input0.set_input_layer_name(dataLayer->getName());
layerConfig.add_inputs();
LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
input1.set_input_layer_name(labelLayer->getName());

LayerPtr layer = LayerPtr(new CTCLayer(layerConfig));
layerMap[layer->getName()] = layer;
layer->init(layerMap, parameterMap);

// NOTE(review): raw pointer from ActivationFunction::create() is never
// deleted — looks like a (test-only) leak; confirm ownership semantics.
ActivationFunction* softmaxActivation = ActivationFunction::create("softmax");

// Softmax forward before the ctc layer, softmax backward after it, so the
// data layer's grad ends up w.r.t. the pre-softmax scores.
softmaxActivation->forward(dataLayer->getOutput());
layer->forward(PASS_GC);

layer->backward();
softmaxActivation->backward(dataLayer->getOutput());

return layer;
}
// Builds a "warp_ctc" layer over the given data and label layers and runs
// one forward + backward pass. Unlike createCTCLayer, no explicit softmax
// is applied, and the blank label is pinned to numClasses - 1.
LayerPtr createWarpCTCLayer(string name,
size_t numClasses,
bool useGpu,
bool normByTimes,
LayerPtr dataLayer,
LayerPtr labelLayer) {
LayerMap layerMap;
layerMap[dataLayer->getName()] = dataLayer;
layerMap[labelLayer->getName()] = labelLayer;

ParameterMap parameterMap;

LayerConfig layerConfig;
layerConfig.set_name(name);
layerConfig.set_type("warp_ctc");
layerConfig.set_size(numClasses);
layerConfig.set_blank(numClasses - 1);
layerConfig.set_norm_by_times(normByTimes);

// Wire input 0 (scores) and input 1 (labels).
layerConfig.add_inputs();
LayerInputConfig& input0 = *(layerConfig.mutable_inputs(0));
input0.set_input_layer_name(dataLayer->getName());
layerConfig.add_inputs();
LayerInputConfig& input1 = *(layerConfig.mutable_inputs(1));
input1.set_input_layer_name(labelLayer->getName());

LayerPtr layer = LayerPtr(new WarpCTCLayer(layerConfig));
layerMap[layer->getName()] = layer;
layer->init(layerMap, parameterMap);

layer->forward(PASS_GC);
layer->backward();

return layer;
}
// Cross-checks WarpCTCLayer against the reference CTCLayer over a grid of
// layer sizes, batch sizes, normalization modes, and devices. Both layers
// are fed identical inputs (data1 is a copy of data0); their per-sequence
// costs and input gradients must agree within checkError's tolerance.
TEST(Layer, WarpCTCLayer) {
for (auto layerSize : {10, 64}) {
for (auto batchSize : {1, 10, 32}) {
for (auto normByTimes : {false, true}) {
for (auto useGpu : {false, true}) {
#ifdef PADDLE_ONLY_CPU
// Skip GPU configurations in CPU-only builds.
if (useGpu) continue;
#endif
LOG(INFO) << "layerSize=" << layerSize << " batchSize=" << batchSize
<< " normByTimes = " << normByTimes << " useGpu=" << useGpu;
FLAGS_use_gpu = useGpu;

Argument data0;
initArgument(batchSize, layerSize, useGpu, data0);

// Independent copy so each layer accumulates its own gradients.
Argument data1;
data1.resizeAndCopyFrom(data0);

LayerPtr dataLayer0 =
createDataLayer("data", batchSize, layerSize, useGpu, data0);
LayerPtr dataLayer1 =
createDataLayer("data", batchSize, layerSize, useGpu, data1);

LayerPtr labelLayer =
createLabelLayer("label", batchSize, layerSize, useGpu);

LayerPtr warpctcLayer = createWarpCTCLayer(
"cost", layerSize, useGpu, normByTimes, dataLayer0, labelLayer);
LayerPtr ctcLayer = createCTCLayer(
"cost", layerSize, useGpu, normByTimes, dataLayer1, labelLayer);

/// Check cost
LOG(INFO) << "Check cost: "
<< checkError(*(warpctcLayer->getOutput().value),
*(ctcLayer->getOutput().value))
<< " different elements.";

/// Check gradients
LOG(INFO) << "Check gradients: "
<< checkError(*(dataLayer0->getOutput().grad),
*(dataLayer1->getOutput().grad))
<< " different elements";
}
}
}
}
}
// Standard gtest entry point: initialize gtest and PaddlePaddle runtime
// (flag parsing, logging) before running all registered tests.
int main(int argc, char** argv) {
testing::InitGoogleTest(&argc, argv);
initMain(argc, argv);
return RUN_ALL_TESTS();
}
...@@ -355,11 +355,11 @@ void BaseMatrixT<T>::neg() { applyUnary(unary::Neg<T>()); } ...@@ -355,11 +355,11 @@ void BaseMatrixT<T>::neg() { applyUnary(unary::Neg<T>()); }
DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a)); DEFINE_MATRIX_UNARY_OP(Exp, a = exp(a));
template<> template<>
void BaseMatrixT<real>::exp() { applyUnary(unary::Exp<real>()); } void BaseMatrixT<real>::exp2() { applyUnary(unary::Exp<real>()); }
DEFINE_MATRIX_UNARY_OP(Log, a = log(a)); DEFINE_MATRIX_UNARY_OP(Log, a = log(a));
template<> template<>
void BaseMatrixT<real>::log() { void BaseMatrixT<real>::log2() {
if (useGpu_) { if (useGpu_) {
applyUnary(unary::Log<real>()); applyUnary(unary::Log<real>());
} else { } else {
...@@ -369,23 +369,23 @@ void BaseMatrixT<real>::log() { ...@@ -369,23 +369,23 @@ void BaseMatrixT<real>::log() {
DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a)); DEFINE_MATRIX_UNARY_OP(Sqrt, a = sqrt(a));
template<> template<>
void BaseMatrixT<real>::sqrt() { applyUnary(unary::Sqrt<real>()); } void BaseMatrixT<real>::sqrt2() { applyUnary(unary::Sqrt<real>()); }
DEFINE_MATRIX_UNARY_OP(Square, a = a * a); DEFINE_MATRIX_UNARY_OP(Square, a = a * a);
template<class T> template<class T>
void BaseMatrixT<T>::square() { applyUnary(unary::Square<T>()); } void BaseMatrixT<T>::square2() { applyUnary(unary::Square<T>()); }
DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a); DEFINE_MATRIX_UNARY_OP(Reciprocal, a = 1.0f / a);
template<class T> template<class T>
void BaseMatrixT<T>::reciprocal() { applyUnary(unary::Reciprocal<T>()); } void BaseMatrixT<T>::reciprocal2() { applyUnary(unary::Reciprocal<T>()); }
DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a); DEFINE_MATRIX_UNARY_OP(Abs, a = a > 0 ? a : -a);
template<class T> template<class T>
void BaseMatrixT<T>::abs() { applyUnary(unary::Abs<T>()); } void BaseMatrixT<T>::abs2() { applyUnary(unary::Abs<T>()); }
DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0)); DEFINE_MATRIX_UNARY_OP(Sign, a = (a > 0) - (a < 0));
template<class T> template<class T>
void BaseMatrixT<T>::sign() { applyUnary(unary::Sign<T>()); } void BaseMatrixT<T>::sign2() { applyUnary(unary::Sign<T>()); }
DEFINE_MATRIX_UNARY_OP(Zero, a = 0); DEFINE_MATRIX_UNARY_OP(Zero, a = 0);
template<class T> template<class T>
...@@ -405,7 +405,7 @@ void BaseMatrixT<T>::one() { applyUnary(unary::One<T>()); } ...@@ -405,7 +405,7 @@ void BaseMatrixT<T>::one() { applyUnary(unary::One<T>()); }
DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p)); DEFINE_MATRIX_UNARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(a, p));
template<> template<>
void BaseMatrixT<real>::pow(real p) { void BaseMatrixT<real>::pow2(real p) {
if (useGpu_) { if (useGpu_) {
applyUnary(unary::Pow<real>(p)); applyUnary(unary::Pow<real>(p));
} else { } else {
...@@ -534,7 +534,7 @@ void BaseMatrixT<T>::add(BaseMatrixT& b, T p) { ...@@ -534,7 +534,7 @@ void BaseMatrixT<T>::add(BaseMatrixT& b, T p) {
DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p)); DEFINE_MATRIX_BINARY_PARAMETER_OP(Pow, ONE_PARAMETER, a = pow(b, p));
template<> template<>
void BaseMatrixT<real>::pow(BaseMatrixT& b, real p) { void BaseMatrixT<real>::pow2(BaseMatrixT& b, real p) {
if (useGpu_) { if (useGpu_) {
applyBinary(binary::Pow<real>(p), b); applyBinary(binary::Pow<real>(p), b);
} else { } else {
...@@ -615,7 +615,7 @@ void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) { ...@@ -615,7 +615,7 @@ void BaseMatrixT<T>::breluDerivative(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Square, b = a * a); DEFINE_MATRIX_BINARY_OP(Square, b = a * a);
template<class T> template<class T>
void BaseMatrixT<T>::square(BaseMatrixT& b) { void BaseMatrixT<T>::square2(BaseMatrixT& b) {
applyBinary(binary::Square<T>(), b); applyBinary(binary::Square<T>(), b);
} }
...@@ -657,7 +657,7 @@ void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) { ...@@ -657,7 +657,7 @@ void BaseMatrixT<T>::scaledTanhDerivative(BaseMatrixT& b, T p1, T p2) {
DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a); DEFINE_MATRIX_BINARY_OP(Reciprocal, b = 1.0f / a);
template<class T> template<class T>
void BaseMatrixT<T>::reciprocal(BaseMatrixT& b) { void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b) {
applyBinary(binary::Reciprocal<T>(), b); applyBinary(binary::Reciprocal<T>(), b);
} }
...@@ -669,7 +669,7 @@ void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) { ...@@ -669,7 +669,7 @@ void BaseMatrixT<T>::reciprocalDerivative(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a); DEFINE_MATRIX_BINARY_OP(Abs, b = a > 0.0f ? a : -a);
template<class T> template<class T>
void BaseMatrixT<T>::abs(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); } void BaseMatrixT<T>::abs2(BaseMatrixT& b) { applyBinary(binary::Abs<T>(), b); }
DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0); DEFINE_MATRIX_BINARY_OP(AbsDerivative, a = (b > 0) ? a : (b < 0) ? -a : 0);
template<class T> template<class T>
...@@ -729,17 +729,19 @@ void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) { ...@@ -729,17 +729,19 @@ void BaseMatrixT<T>::expDerivative(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f); DEFINE_MATRIX_BINARY_OP(Sign, b = a > 0.0f ? 1.0f : -1.0f);
template<class T> template<class T>
void BaseMatrixT<T>::sign(BaseMatrixT& b) { applyBinary(binary::Sign<T>(), b); } void BaseMatrixT<T>::sign2(BaseMatrixT& b) {
applyBinary(binary::Sign<T>(), b);
}
DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b)); DEFINE_MATRIX_BINARY_OP(Exp, a = exp(b));
template<> template<>
void BaseMatrixT<real>::exp(BaseMatrixT& b) { void BaseMatrixT<real>::exp2(BaseMatrixT& b) {
applyBinary(binary::Exp<real>(), b); applyBinary(binary::Exp<real>(), b);
} }
DEFINE_MATRIX_BINARY_OP(Log, a = log(b)); DEFINE_MATRIX_BINARY_OP(Log, a = log(b));
template<> template<>
void BaseMatrixT<real>::log(BaseMatrixT& b) { void BaseMatrixT<real>::log2(BaseMatrixT& b) {
if (useGpu_) { if (useGpu_) {
applyBinary(binary::Log<real>(), b); applyBinary(binary::Log<real>(), b);
} else { } else {
...@@ -749,7 +751,7 @@ void BaseMatrixT<real>::log(BaseMatrixT& b) { ...@@ -749,7 +751,7 @@ void BaseMatrixT<real>::log(BaseMatrixT& b) {
DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b)); DEFINE_MATRIX_BINARY_OP(Sqrt, a = sqrt(b));
template<> template<>
void BaseMatrixT<real>::sqrt(BaseMatrixT& b) { void BaseMatrixT<real>::sqrt2(BaseMatrixT& b) {
applyBinary(binary::Sqrt<real>(), b); applyBinary(binary::Sqrt<real>(), b);
} }
...@@ -1065,7 +1067,7 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b, ...@@ -1065,7 +1067,7 @@ void BaseMatrixT<T>::biggerThan(BaseMatrixT& b,
DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c); DEFINE_MATRIX_TERNARY_OP(Max, a = (b > c) ? b : c);
template<class T> template<class T>
void BaseMatrixT<T>::max(BaseMatrixT& b, BaseMatrixT& c) { // NOLINT void BaseMatrixT<T>::max2(BaseMatrixT& b, BaseMatrixT& c) {
applyTernary(ternary::Max<T>(), b, c); applyTernary(ternary::Max<T>(), b, c);
} }
...@@ -1168,7 +1170,7 @@ void BaseMatrixT<T>::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2, ...@@ -1168,7 +1170,7 @@ void BaseMatrixT<T>::reciprocalSum(BaseMatrixT& b, BaseMatrixT& c, T p1, T p2,
DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER, DEFINE_MATRIX_BINARY_PARAMETER_OP(Reciprocal2, TWO_PARAMETER,
a = 1 / (p1 * b + p2)); a = 1 / (p1 * b + p2));
template<class T> template<class T>
void BaseMatrixT<T>::reciprocal(BaseMatrixT& b, T p1, T p2) { void BaseMatrixT<T>::reciprocal2(BaseMatrixT& b, T p1, T p2) {
applyBinary(binary::Reciprocal2<T>(p1, p2), b); applyBinary(binary::Reciprocal2<T>(p1, p2), b);
} }
......
...@@ -16,6 +16,7 @@ limitations under the License. */ ...@@ -16,6 +16,7 @@ limitations under the License. */
#include <cstddef> #include <cstddef>
#include <stdint.h> #include <stdint.h>
#include "paddle/utils/TypeDefs.h" #include "paddle/utils/TypeDefs.h"
#include "TensorExpression.h"
namespace paddle { namespace paddle {
...@@ -70,7 +71,7 @@ public: ...@@ -70,7 +71,7 @@ public:
}; };
template <class T> template <class T>
class BaseMatrixT { class BaseMatrixT : public TensorExpression<BaseMatrixT<T>, T> {
public: public:
size_t height_, width_; size_t height_, width_;
size_t stride_; size_t stride_;
...@@ -427,14 +428,14 @@ public: ...@@ -427,14 +428,14 @@ public:
* *
*/ */
void neg(); void neg();
void exp(); void exp2();
void pow(T p); void pow2(T p);
void log(); void log2();
void sqrt(); void sqrt2();
void square(); void square2();
void reciprocal(); void reciprocal2();
void abs(); void abs2();
void sign(); void sign2();
void zero(); void zero();
/** /**
...@@ -603,7 +604,7 @@ public: ...@@ -603,7 +604,7 @@ public:
* b = this * this * b = this * this
* @endcode * @endcode
*/ */
void square(BaseMatrixT& b); void square2(BaseMatrixT& b);
void squareDerivative(BaseMatrixT& b); void squareDerivative(BaseMatrixT& b);
/** /**
...@@ -627,7 +628,7 @@ public: ...@@ -627,7 +628,7 @@ public:
* b = 1.0f / this * b = 1.0f / this
* @endcode * @endcode
*/ */
void reciprocal(BaseMatrixT& b); void reciprocal2(BaseMatrixT& b);
void reciprocalDerivative(BaseMatrixT& b); void reciprocalDerivative(BaseMatrixT& b);
/** /**
...@@ -635,7 +636,7 @@ public: ...@@ -635,7 +636,7 @@ public:
* b = this > 0.0f ? this : -this * b = this > 0.0f ? this : -this
* @endcode * @endcode
*/ */
void abs(BaseMatrixT& b); void abs2(BaseMatrixT& b);
void absDerivative(BaseMatrixT& b); void absDerivative(BaseMatrixT& b);
/** /**
...@@ -653,12 +654,12 @@ public: ...@@ -653,12 +654,12 @@ public:
*/ */
void expDerivative(BaseMatrixT& b); void expDerivative(BaseMatrixT& b);
void sign(BaseMatrixT& b); void sign2(BaseMatrixT& b);
void exp(BaseMatrixT& b); void exp2(BaseMatrixT& b);
void pow(BaseMatrixT& b, T p); void pow2(BaseMatrixT& b, T p);
void log(BaseMatrixT& b); void log2(BaseMatrixT& b);
void sqrt(BaseMatrixT& b); void sqrt2(BaseMatrixT& b);
void addScalar(BaseMatrixT& b, T p); void addScalar(BaseMatrixT& b, T p);
void subScalar(BaseMatrixT& b, T p); void subScalar(BaseMatrixT& b, T p);
void mulScalar(BaseMatrixT& b, T p); void mulScalar(BaseMatrixT& b, T p);
...@@ -828,7 +829,7 @@ public: ...@@ -828,7 +829,7 @@ public:
* this = b>c ? b : c * this = b>c ? b : c
* @endcode * @endcode
*/ */
void max(BaseMatrixT& b, BaseMatrixT& c); // NOLINT void max2(BaseMatrixT& b, BaseMatrixT& c);
/** /**
* @code * @code
...@@ -927,7 +928,7 @@ public: ...@@ -927,7 +928,7 @@ public:
* this = 1 / (p1 * b + p2) * this = 1 / (p1 * b + p2)
* @endcode * @endcode
*/ */
void reciprocal(BaseMatrixT& b, T p1, T p2); void reciprocal2(BaseMatrixT& b, T p1, T p2);
/** /**
* @code * @code
...@@ -1050,6 +1051,32 @@ public: ...@@ -1050,6 +1051,32 @@ public:
void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c); void rowPow(size_t cCol, BaseMatrixT& b, BaseMatrixT& c);
virtual bool isSparse() const { return false; } virtual bool isSparse() const { return false; }
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
if (useGpu_) {
TensorGpuApply<T>(*this, expr);
} else {
TensorCpuApply<T>(*this, expr);
}
}
template <typename ExpressionType>
void operator+=(const ExpressionType& expr) {
(*this) = (*this) + expr;
}
template <typename ExpressionType>
void operator-=(const ExpressionType& expr) {
(*this) = (*this) - expr;
}
template <typename ExpressionType>
void operator*=(const ExpressionType& expr) {
(*this) = (*this) * expr;
}
template <typename ExpressionType>
void operator/=(const ExpressionType& expr) {
(*this) = (*this) / expr;
}
}; };
typedef BaseMatrixT<real> BaseMatrix; typedef BaseMatrixT<real> BaseMatrix;
......
...@@ -16,10 +16,12 @@ file(GLOB MATH_HEADERS . *.h) ...@@ -16,10 +16,12 @@ file(GLOB MATH_HEADERS . *.h)
file(GLOB MATH_SOURCES . *.cpp) file(GLOB MATH_SOURCES . *.cpp)
set(MATH_SOURCES set(MATH_SOURCES
"${PROJ_ROOT}/paddle/math/BaseMatrix.cu" "${PROJ_ROOT}/paddle/math/BaseMatrix.cu"
"${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu"
${MATH_SOURCES}) ${MATH_SOURCES})
if(NOT WITH_GPU) if(NOT WITH_GPU)
# then compile BaseMatrix.cu as c++ file # then compile BaseMatrix.cu as c++ file
compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu") compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu")
compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu")
add_library(paddle_math STATIC add_library(paddle_math STATIC
${MATH_SOURCES}) ${MATH_SOURCES})
else() else()
......
...@@ -136,7 +136,7 @@ public: ...@@ -136,7 +136,7 @@ public:
return sum; return sum;
} }
virtual void square() { virtual void square2() {
CHECK(isContiguous()); CHECK(isContiguous());
if (valueType_ == NO_VALUE) { if (valueType_ == NO_VALUE) {
return; return;
......
...@@ -1122,6 +1122,7 @@ public: ...@@ -1122,6 +1122,7 @@ public:
virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) { virtual void paramReluBackwardDiff(Matrix& oGrad, Matrix& data, Matrix& W) {
LOG(FATAL) << "Not implemented"; LOG(FATAL) << "Not implemented";
} }
virtual void bilinearForward(const Matrix& in, virtual void bilinearForward(const Matrix& in,
const size_t inImgH, const size_t inImgH,
const size_t inImgW, const size_t inImgW,
...@@ -1142,6 +1143,15 @@ public: ...@@ -1142,6 +1143,15 @@ public:
const real ratioW) { const real ratioW) {
LOG(FATAL) << "Not implemented"; LOG(FATAL) << "Not implemented";
} }
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
if (useGpu_) {
TensorGpuApply<real>(*this, expr);
} else {
TensorCpuApply<real>(*this, expr);
}
}
}; };
inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) { inline std::ostream& operator<<(std::ostream& os, const Matrix& mat) {
...@@ -1518,6 +1528,11 @@ public: ...@@ -1518,6 +1528,11 @@ public:
void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label); void multiBinaryLabelCrossEntropy(Matrix& output, Matrix& label);
void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label); void multiBinaryLabelCrossEntropyBp(Matrix& output, Matrix& label);
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorGpuApply<real>(*this, expr);
}
}; };
class CpuMatrix : public Matrix { class CpuMatrix : public Matrix {
...@@ -1917,6 +1932,11 @@ public: ...@@ -1917,6 +1932,11 @@ public:
const size_t numChannels, const size_t numChannels,
const real ratioH, const real ratioH,
const real ratioW); const real ratioW);
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorCpuApply<real>(*this, expr);
}
}; };
class SharedCpuMatrix : public CpuMatrix { class SharedCpuMatrix : public CpuMatrix {
...@@ -1957,6 +1977,7 @@ public: ...@@ -1957,6 +1977,7 @@ public:
void add(real p1, real p2); void add(real p1, real p2);
private: private:
using Matrix::mul;
void initShared(int blockNum); void initShared(int blockNum);
void initBlock(int blockNum); void initBlock(int blockNum);
......
...@@ -15,15 +15,14 @@ limitations under the License. */ ...@@ -15,15 +15,14 @@ limitations under the License. */
#include "SparseRowMatrix.h" #include "SparseRowMatrix.h"
#include "CpuSparseMatrix.h" #include "CpuSparseMatrix.h"
#include <cmath>
#include <algorithm> #include <algorithm>
#include "paddle/utils/Logging.h" #include "paddle/utils/Logging.h"
#include "SIMDFunctions.h" #include "SIMDFunctions.h"
#include "paddle/utils/Util.h"
#include "paddle/utils/Thread.h" #include "paddle/utils/Thread.h"
#include "paddle/utils/Util.h"
P_DEFINE_bool(allow_inefficient_sparse_update, P_DEFINE_bool(allow_inefficient_sparse_update,
false, false,
...@@ -34,8 +33,6 @@ namespace paddle { ...@@ -34,8 +33,6 @@ namespace paddle {
const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U; const unsigned int SparseRowCpuMatrix::kUnusedId_ = -1U;
void SparseRowCpuMatrix::init(size_t height, size_t width) { void SparseRowCpuMatrix::init(size_t height, size_t width) {
// @TODO(yuyang18) Just remove this limit
CHECK(simd::vec_check(width)) << width;
height_ = height; height_ = height;
if (!indexDictHandle_) { if (!indexDictHandle_) {
indexDictHandle_.reset(new IndexDict); indexDictHandle_.reset(new IndexDict);
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle {
/**
 * \brief The tensor evaluator classes.
 *
 * Primary template: evaluator for an lvalue matrix-like object. The wrapped
 * type must expose data_, stride_, height_, width_ and useGpu_ members
 * (e.g. BaseMatrixT). Provides both read (apply) and write (applyRef)
 * element access for expression evaluation.
 */
template <typename Derived, class T>
class TensorApply {
public:
  explicit INLINE TensorApply(const Derived& p)
      : data_(p.data_),
        stride_(p.stride_),
        height_(p.height_),
        width_(p.width_),
        useGpu_(p.useGpu_) {}

  // Read element at row i, column j (row-major layout with row stride).
  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
  // Read element by flat index; only valid when the storage is contiguous.
  INLINE T apply(int index) const { return data_[index]; }
  // Writable access, used on the left-hand side of an assignment.
  INLINE T& applyRef(int i, int j) { return data_[i * stride_ + j]; }
  INLINE T& applyRef(int index) { return data_[index]; }

  INLINE size_t getWidth() const { return width_; }
  INLINE size_t getHeight() const { return height_; }
  // Contiguous when rows are packed back to back, or there is a single row;
  // this enables the faster flat-index evaluation path.
  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
  INLINE bool useGpu() const { return useGpu_; }

  T* data_;
  size_t stride_;
  size_t height_;
  size_t width_;
  bool useGpu_;
};
/**
 * \brief The tensor evaluator classes.
 * evaluator for rvalues
 *
 * Partial specialization for const operands: identical to the primary
 * template except that the data pointer is const and only read access
 * (apply) is offered, so it is used for right-hand-side operands.
 */
template <typename Derived, class T>
class TensorApply<const Derived, T> {
public:
  explicit INLINE TensorApply(const Derived& p)
      : data_(p.data_),
        stride_(p.stride_),
        height_(p.height_),
        width_(p.width_),
        useGpu_(p.useGpu_) {}

  // Read element at row i, column j (row-major layout with row stride).
  INLINE T apply(int i, int j) const { return data_[i * stride_ + j]; }
  // Read element by flat index; only valid when the storage is contiguous.
  INLINE T apply(int index) const { return data_[index]; }

  INLINE size_t getWidth() const { return width_; }
  INLINE size_t getHeight() const { return height_; }
  INLINE bool isContiguous() const { return stride_ == width_ || height_ == 1; }
  INLINE bool useGpu() const { return useGpu_; }

  const T* data_;
  size_t stride_;
  size_t height_;
  size_t width_;
  bool useGpu_;
};
/**
 * \brief Evaluator that unwraps a TensorExpression CRTP wrapper and
 * delegates element access to the evaluator of the concrete derived
 * expression type.
 */
template <typename Derived, class T>
class TensorApply<const TensorExpression<Derived, T>, T> {
public:
  // NOTE(review): added INLINE for consistency — every other TensorApply
  // constructor in this file carries it; presumably the INLINE macro adds
  // host/device qualifiers needed when evaluators are built in kernels —
  // confirm against the macro's definition.
  explicit INLINE TensorApply(const TensorExpression<Derived, T>& expr)
      : expr_(expr.derived()) {}

  INLINE T apply(int i, int j) const { return expr_.apply(i, j); }
  INLINE T apply(int index) const { return expr_.apply(index); }

  INLINE size_t getWidth() const { return expr_.getWidth(); }
  INLINE size_t getHeight() const { return expr_.getHeight(); }
  INLINE bool isContiguous() const { return expr_.isContiguous(); }
  INLINE bool useGpu() const { return expr_.useGpu(); }

  TensorApply<const Derived, T> expr_;
};
/**
 * \brief The unary expression evaluator classes.
 *
 * Lazily computes op_(x) for every element x of the wrapped
 * sub-expression; shape, contiguity and device placement are all
 * forwarded from the operand.
 */
template <class OP, typename ArgType, class T>
class TensorApply<const TensorUnaryOp<OP, ArgType, T>, T> {
public:
  explicit INLINE TensorApply(const TensorUnaryOp<OP, ArgType, T>& expr)
      : op_(expr.op_), expr_(expr.expr_) {}

  INLINE T apply(int i, int j) const { return op_(expr_.apply(i, j)); }
  INLINE T apply(int index) const { return op_(expr_.apply(index)); }

  INLINE size_t getWidth() const { return expr_.getWidth(); }
  INLINE size_t getHeight() const { return expr_.getHeight(); }
  INLINE bool isContiguous() const { return expr_.isContiguous(); }
  INLINE bool useGpu() const { return expr_.useGpu(); }

  const OP op_;
  TensorApply<ArgType, T> expr_;
};
/**
 * \brief The binary expression evaluator classes.
 *
 * Lazily computes op_(lhs, rhs) element-wise. The constructor verifies
 * (host side only) that both operands agree in shape and device
 * placement; the CHECKs are compiled out under __CUDA_ARCH__.
 */
template <class OP, typename LhsType, typename RhsType, class T>
class TensorApply<const TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
public:
  explicit INLINE TensorApply(
      const TensorBinaryOp<OP, LhsType, RhsType, T>& expr)
      : op_(expr.op_), lhs_(expr.lhs_), rhs_(expr.rhs_) {
#ifndef __CUDA_ARCH__
    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
#endif
  }

  INLINE T apply(int i, int j) const {
    return op_(lhs_.apply(i, j), rhs_.apply(i, j));
  }
  INLINE T apply(int index) const {
    return op_(lhs_.apply(index), rhs_.apply(index));
  }

  INLINE size_t getWidth() const { return lhs_.getWidth(); }
  // Fixed: report the height of lhs_ rather than rhs_, consistent with
  // getWidth(). The operands are checked equal on the host, but that check
  // is compiled out in device code, so answering from a single operand is
  // the safer convention.
  INLINE size_t getHeight() const { return lhs_.getHeight(); }
  INLINE bool isContiguous() const {
    return lhs_.isContiguous() && rhs_.isContiguous();
  }
  INLINE bool useGpu() const { return lhs_.useGpu(); }

  const OP op_;
  TensorApply<LhsType, T> lhs_;
  TensorApply<RhsType, T> rhs_;
};
/**
 * \brief The ternary expression evaluator classes.
 *
 * Element-wise conditional: yields expr2's value where expr1 is non-zero
 * and expr3's value elsewhere (only the selected branch is evaluated for
 * each element). Shape and device agreement of all three operands is
 * verified on the host; the CHECKs are compiled out under __CUDA_ARCH__.
 */
template <typename ArgType1, typename ArgType2, typename ArgType3, class T>
class TensorApply<const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>, T> {
public:
  explicit INLINE TensorApply(
      const TensorTernaryOp<ArgType1, ArgType2, ArgType3, T>& expr)
      : expr1_(expr.expr1_), expr2_(expr.expr2_), expr3_(expr.expr3_) {
#ifndef __CUDA_ARCH__
    CHECK_EQ(expr1_.getWidth(), expr2_.getWidth());
    CHECK_EQ(expr1_.getWidth(), expr3_.getWidth());
    CHECK_EQ(expr1_.getHeight(), expr2_.getHeight());
    CHECK_EQ(expr1_.getHeight(), expr3_.getHeight());
    CHECK_EQ(expr1_.useGpu(), expr2_.useGpu());
    CHECK_EQ(expr1_.useGpu(), expr3_.useGpu());
#endif
  }

  INLINE T apply(int i, int j) const {
    return expr1_.apply(i, j) ? expr2_.apply(i, j) : expr3_.apply(i, j);
  }
  INLINE T apply(int index) const {
    return expr1_.apply(index) ? expr2_.apply(index) : expr3_.apply(index);
  }

  INLINE size_t getWidth() const { return expr1_.getWidth(); }
  INLINE size_t getHeight() const { return expr1_.getHeight(); }
  INLINE bool isContiguous() const {
    return expr1_.isContiguous() && expr2_.isContiguous() &&
           expr3_.isContiguous();
  }
  INLINE bool useGpu() const { return expr1_.useGpu(); }

  TensorApply<ArgType1, T> expr1_;
  TensorApply<ArgType2, T> expr2_;
  TensorApply<ArgType3, T> expr3_;
};
/**
 * \brief The const expression evaluator classes.
 *
 * Generates each element from op_(i, j) / op_(index) alone; the wrapped
 * sub-expression expr_ is consulted only for shape and device placement.
 * Always reports itself as contiguous since no storage is read.
 */
template <class OP, typename ArgType, class T>
class TensorApply<const TensorConstant<OP, ArgType, T>, T> {
public:
  explicit INLINE TensorApply(const TensorConstant<OP, ArgType, T>& expr)
      : op_(expr.op_), expr_(expr.expr_) {}

  INLINE T apply(int i, int j) const { return op_(i, j); }
  INLINE T apply(int index) const { return op_(index); }

  INLINE size_t getWidth() const { return expr_.getWidth(); }
  INLINE size_t getHeight() const { return expr_.getHeight(); }
  INLINE bool isContiguous() const { return true; }
  INLINE bool useGpu() const { return expr_.useGpu(); }

  const OP op_;
  TensorApply<ArgType, T> expr_;
};
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/utils/Logging.h"
namespace paddle {
/**
 * \brief Tensor Assign Expression(return by lazyAssign,
 * and evaluated by AssignEvaluate)
 *
 * Pairs a writable left-hand side with a right-hand-side expression; each
 * apply() call copies one element. Shape and device agreement is verified
 * on the host; the CHECKs are compiled out under __CUDA_ARCH__.
 */
template <typename LhsType, typename RhsType, class T>
class TensorAssignOp {
public:
  explicit TensorAssignOp(const LhsType& lhs, const RhsType& rhs)
      : lhs_(lhs), rhs_(rhs) {
#ifndef __CUDA_ARCH__
    CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
    CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
    CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
#endif
  }

  // Assign one element, addressed by (row, column) coordinates.
  INLINE void apply(const int i, const int j) {
    lhs_.applyRef(i, j) = rhs_.apply(i, j);
  }
  // Assign one element by flat index (contiguous storage only).
  INLINE void apply(const int index) {
    lhs_.applyRef(index) = rhs_.apply(index);
  }

  INLINE size_t getWidth() const { return lhs_.getWidth(); }
  // Fixed: report the height of lhs_ rather than rhs_, consistent with
  // getWidth(); the host-side CHECK guarantees they match, but that check
  // is absent in device code.
  INLINE size_t getHeight() const { return lhs_.getHeight(); }
  INLINE bool isContiguous() const {
    return lhs_.isContiguous() && rhs_.isContiguous();
  }
  INLINE bool useGpu() const { return lhs_.useGpu(); }

private:
  TensorApply<LhsType, T> lhs_;
  TensorApply<const RhsType, T> rhs_;
};
/**
 * \brief Run one or more assignment expressions on the CPU.
 *
 * \param height        number of rows shared by every assignment.
 * \param width         number of columns shared by every assignment.
 * \param isContiguous  true when every operand is densely packed, enabling
 *                      the flat-index fast path.
 * \param assign        the first (mandatory) assignment object.
 * \param args          zero or more additional assignment objects, applied
 *                      at the same position in the same iteration.
 */
template <typename Assign, typename... AssignOp>
void AssignCpuEvaluate(int height,
                       int width,
                       bool isContiguous,
                       Assign&& assign,
                       AssignOp&&... args) {
  if (!isContiguous) {
    // Strided storage: visit each element by (row, column) coordinates.
    for (int row = 0; row < height; row++) {
      for (int col = 0; col < width; col++) {
        assign.apply(row, col);
        // Pack expansion applies every extra assignment at this position.
        __attribute__((unused)) int dummy[] = {(((args)).apply(row, col), 0)...};
      }
    }
    return;
  }
  // Contiguous storage: one flat loop over all elements.
  const int total = height * width;
  for (int idx = 0; idx < total; idx++) {
    assign.apply(idx);
    __attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
  }
}
#ifdef __NVCC__
template <typename Assign, typename... AssignOp>
__global__ void AssignGpuEvaluate1(const int border,
                                   Assign assign,
                                   AssignOp... args) {
  // 1-D kernel for contiguous data: one thread per flat element index,
  // guarded against the tail beyond `border` elements.
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < border) {
    assign.apply(idx);
    // Pack expansion applies every extra assignment at this index.
    __attribute__((unused)) int dummy[] = {(((args)).apply(idx), 0)...};
  }
}
template <typename Assign, typename... AssignOp>
__global__ void AssignGpuEvaluate2(const int height,
                                   const int width,
                                   Assign assign,
                                   AssignOp... args) {
  // 2-D kernel for strided data: each thread steps through rows/columns in
  // strides of the full launch extent, so any (height x width) shape is
  // covered regardless of grid size.
  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
  for (int i = rowIdx; i < height; i += gridDim.y * blockDim.y) {
    for (int j = colIdx; j < width; j += gridDim.x * blockDim.x) {
      assign.apply(i, j);
      __attribute__((unused)) int dummy[] = {(((args)).apply(i, j), 0)...};
    }
  }
}
#endif
/**
 * \brief Evaluate one or more TensorAssignOp objects.
 *
 * All assignments must agree on device placement and shape; this is
 * CHECKed before dispatch. The batch takes the fast contiguous path only
 * when every operand is contiguous. GPU dispatch requires compilation
 * with nvcc; otherwise only the CPU path is available.
 */
template <typename Assign, typename... AssignOp>
void AssignEvaluate(Assign&& assign, AssignOp&&... args) {
  const bool useGpu_ = assign.useGpu();
  bool isContiguous_ = assign.isContiguous();
  const size_t height = assign.getHeight();
  const size_t width = assign.getWidth();

  const int packSize = sizeof...(args);
  // Fixed: seed each array with assign's own value so the arrays are never
  // zero-sized when the parameter pack is empty — zero-sized arrays are a
  // non-standard extension and ill-formed on conforming compilers.
  const bool packUseGpu[] = {useGpu_, ((args)).useGpu()...};
  const bool packIsContiguous[] = {isContiguous_, ((args)).isContiguous()...};
  const size_t packHeight[] = {height, ((args)).getHeight()...};
  const size_t packWidth[] = {width, ((args)).getWidth()...};
  // Element 0 compares assign against itself, which is trivially true.
  for (int i = 0; i < packSize + 1; i++) {
    CHECK_EQ(useGpu_, packUseGpu[i]);
    CHECK_EQ(height, packHeight[i]);
    CHECK_EQ(width, packWidth[i]);
    isContiguous_ = isContiguous_ && packIsContiguous[i];
  }

  if (useGpu_) {
#ifdef __NVCC__
    if (isContiguous_) {
      // Flat 1-D launch: one thread per element, up to 1024 per block.
      int size = height * width;
      int blockSize = size <= 1024 ? size : 1024;
      int gridSize = (size + 1024 - 1) / 1024;
      AssignGpuEvaluate1<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
          size, assign, args...);
    } else {
      // 2-D launch; kernel threads stride over rows and columns.
      int blockSizeY = std::min(32, (int)height);
      int blockSizeX = (32 / blockSizeY) * 32;
      int gridSizeX = std::min(32, (int)(width + blockSizeX - 1) / blockSizeX);
      int gridSizeY = std::min(32, (int)(height + blockSizeY - 1) / blockSizeY);
      dim3 threads(blockSizeX, blockSizeY);
      dim3 grid(gridSizeX, gridSizeY);
      AssignGpuEvaluate2<<<grid, threads, 0, STREAM_DEFAULT>>>(
          height, width, assign, args...);
    }
    CHECK_SYNC("AssignEvaluate failed");
#endif
  } else {
    AssignCpuEvaluate(height, width, isContiguous_, assign, args...);
  }
}
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <algorithm>
#include "paddle/utils/Logging.h"
#include "hl_base.h"
namespace paddle {
/**
* \brief The tensor cpu evaluate api.
*/
template <class T, typename LeftType, typename RightType>
inline void TensorCpuApply(LeftType& lhs, const RightType& rhs) {
TensorApply<LeftType, T> lhs_(lhs);
TensorApply<const RightType, T> rhs_(rhs);
CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());
int height = lhs_.getHeight();
int width = lhs_.getWidth();
if (lhs_.isContiguous() && rhs_.isContiguous()) {
int size = height * width;
for (int index = 0; index < size; index++) {
lhs_.applyRef(index) = rhs_.apply(index);
}
} else {
for (int i = 0; i < height; i++) {
for (int j = 0; j < width; j++) {
lhs_.applyRef(i, j) = rhs_.apply(i, j);
}
}
}
}
#ifdef __NVCC__
template <typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs,
                                    RightType rhs,
                                    const int border) {
  // 1-D kernel for contiguous operands: one thread per flat element index,
  // guarded against the tail beyond `border` elements.
  const int idx = blockIdx.x * blockDim.x + threadIdx.x;
  if (idx < border) {
    lhs.applyRef(idx) = rhs.apply(idx);
  }
}
template <typename LeftType, typename RightType>
__global__ void TensorElementWiseOp(LeftType lhs, RightType rhs) {
  // 2-D kernel for strided operands: each thread strides over rows and
  // columns by the full launch extent, so any matrix shape is covered
  // regardless of grid size.
  const int colIdx = blockIdx.x * blockDim.x + threadIdx.x;
  const int rowIdx = blockIdx.y * blockDim.y + threadIdx.y;
  for (int i = rowIdx; i < lhs.getHeight(); i += gridDim.y * blockDim.y) {
    for (int j = colIdx; j < lhs.getWidth(); j += gridDim.x * blockDim.x) {
      lhs.applyRef(i, j) = rhs.apply(i, j);
    }
  }
}
/**
 * \brief The tensor gpu evaluate api.
 *
 * Evaluates rhs into lhs on the GPU. Contiguous operands use a flat 1-D
 * launch (up to 1024 threads per block); otherwise a 2-D launch with
 * grid-stride loops covers the (height x width) index space.
 */
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {
  TensorApply<LeftType, T> lhs_(lhs);
  TensorApply<const RightType, T> rhs_(rhs);
  CHECK_EQ(lhs_.getWidth(), rhs_.getWidth());
  CHECK_EQ(lhs_.getHeight(), rhs_.getHeight());
  CHECK_EQ(lhs_.useGpu(), rhs_.useGpu());

  int dimM = lhs_.getHeight();
  int dimN = lhs_.getWidth();
  if (lhs_.isContiguous() && rhs_.isContiguous()) {
    // Fast path: treat the data as one flat array.
    int size = dimM * dimN;
    int blockSize = size <= 1024 ? size : 1024;
    int gridSize = (size + 1024 - 1) / 1024;
    TensorElementWiseOp<<<gridSize, blockSize, 0, STREAM_DEFAULT>>>(
        lhs_, rhs_, size);
  } else {
    // Strided path: 2-D launch capped at a 32x32 grid; the kernel's
    // grid-stride loops handle anything larger.
    int blockSizeY = std::min(32, dimM);
    int blockSizeX = (32 / blockSizeY) * 32;
    int gridSizeX = std::min(32, (dimN + blockSizeX - 1) / blockSizeX);
    int gridSizeY = std::min(32, (dimM + blockSizeY - 1) / blockSizeY);
    dim3 threads(blockSizeX, blockSizeY);
    dim3 grid(gridSizeX, gridSizeY);
    TensorElementWiseOp<<<grid, threads, 0, STREAM_DEFAULT>>>(lhs_, rhs_);
  }
  CHECK_SYNC("TensorGpuApply failed");
}
#else
// Host-only fallback when not compiling with nvcc: GPU evaluation is
// unavailable, so this is a no-op. Fixed: rhs is taken by const reference
// so the signature mirrors the __NVCC__ overload.
template <class T, typename LeftType, typename RightType>
inline void TensorGpuApply(LeftType& lhs, const RightType& rhs) {}
#endif
} // namespace paddle
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <cstddef>
#include <stdint.h>
#include "paddle/utils/TypeDefs.h"
#include "paddle/utils/Logging.h"
#include "hl_tensor_ops.h"
namespace paddle {
template <class OP, typename ExprType, class T>
class TensorConstant;
template <class OP, typename ExprType, class T>
class TensorUnaryOp;
template <class OP, typename LhsType, typename RhsType, class T>
class TensorBinaryOp;
template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
class TensorTernaryOp;
template <typename LhsType, typename RhsType, class T>
class TensorAssignOp;
/**
* \brief Tensor base class.
*
* This is the base class of all Tensor and Expression class.
*/
template <typename Derived, class T>
class TensorExpression {
public:
  /**
   * Element wise unary expression.
   *
   * Wraps this expression (the CRTP Derived) and a unary functor into a
   * TensorUnaryOp node; nothing is evaluated until the tree is assigned.
   */
  template <typename UnaryOp>
  const TensorUnaryOp<UnaryOp, const Derived, T> unaryExpression(
      const UnaryOp& op) const {
    return TensorUnaryOp<UnaryOp, const Derived, T>(op, derived());
  }
  // Scalar arithmetic: the scalar p is captured inside the hppl functor.
  const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
      T p) const {
    return unaryExpression(hppl::unary::add_scale<T>(p));
  }
  const TensorUnaryOp<hppl::unary::sub_scale<T>, const Derived, T> operator-(
      T p) const {
    return unaryExpression(hppl::unary::sub_scale<T>(p));
  }
  const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
      T p) const {
    return unaryExpression(hppl::unary::mul_scale<T>(p));
  }
  const TensorUnaryOp<hppl::unary::div_scale<T>, const Derived, T> operator/(
      T p) const {
    return unaryExpression(hppl::unary::div_scale<T>(p));
  }
  // Unary negation.
  const TensorUnaryOp<hppl::unary::neg<T>, const Derived, T> operator-() const {
    return unaryExpression(hppl::unary::neg<T>());
  }
  // Element-wise math functions.
  const TensorUnaryOp<hppl::unary::exp_op<T>, const Derived, T> exp() const {
    return unaryExpression(hppl::unary::exp_op<T>());
  }
  const TensorUnaryOp<hppl::unary::log_op<T>, const Derived, T> log() const {
    return unaryExpression(hppl::unary::log_op<T>());
  }
  const TensorUnaryOp<hppl::unary::sqrt_op<T>, const Derived, T> sqrt() const {
    return unaryExpression(hppl::unary::sqrt_op<T>());
  }
  const TensorUnaryOp<hppl::unary::square<T>, const Derived, T> square() const {
    return unaryExpression(hppl::unary::square<T>());
  }
  const TensorUnaryOp<hppl::unary::reciprocal<T>, const Derived, T> reciprocal()
      const {
    return unaryExpression(hppl::unary::reciprocal<T>());
  }
  const TensorUnaryOp<hppl::unary::abs<T>, const Derived, T> abs() const {
    return unaryExpression(hppl::unary::abs<T>());
  }
  const TensorUnaryOp<hppl::unary::sign<T>, const Derived, T> sign() const {
    return unaryExpression(hppl::unary::sign<T>());
  }
  const TensorUnaryOp<hppl::unary::pow_op<T>, const Derived, T> pow(T p) const {
    return unaryExpression(hppl::unary::pow_op<T>(p));
  }
  // Element-wise min/max against a scalar.
  const TensorUnaryOp<hppl::unary::min<T>, const Derived, T> min(T p) const {
    return unaryExpression(hppl::unary::min<T>(p));
  }
  const TensorUnaryOp<hppl::unary::max<T>, const Derived, T> max(T p) const {
    return unaryExpression(hppl::unary::max<T>(p));
  }
  // Element-wise comparison against a scalar; result elements are 0/1 masks.
  const TensorUnaryOp<hppl::unary::cmp_eq<T>, const Derived, T> operator==(
      T p) const {
    return unaryExpression(hppl::unary::cmp_eq<T>(p));
  }
  const TensorUnaryOp<hppl::unary::cmp_ne<T>, const Derived, T> operator!=(
      T p) const {
    return unaryExpression(hppl::unary::cmp_ne<T>(p));
  }
  const TensorUnaryOp<hppl::unary::cmp_le<T>, const Derived, T> operator<=(
      T p) const {
    return unaryExpression(hppl::unary::cmp_le<T>(p));
  }
  const TensorUnaryOp<hppl::unary::cmp_lt<T>, const Derived, T> operator<(
      T p) const {
    return unaryExpression(hppl::unary::cmp_lt<T>(p));
  }
  const TensorUnaryOp<hppl::unary::cmp_ge<T>, const Derived, T> operator>=(
      T p) const {
    return unaryExpression(hppl::unary::cmp_ge<T>(p));
  }
  const TensorUnaryOp<hppl::unary::cmp_gt<T>, const Derived, T> operator>(
      T p) const {
    return unaryExpression(hppl::unary::cmp_gt<T>(p));
  }
  // Element-wise logic against a scalar. NOTE: as overloaded operators these
  // do NOT short-circuit; both sides are built into the expression tree.
  const TensorUnaryOp<hppl::unary::and_op<T>, const Derived, T> operator&&(
      T p) const {
    return unaryExpression(hppl::unary::and_op<T>(p));
  }
  const TensorUnaryOp<hppl::unary::or_op<T>, const Derived, T> operator||(
      T p) const {
    return unaryExpression(hppl::unary::or_op<T>(p));
  }
  /**
   * Element wise binary expression.
   *
   * Combines this expression (lhs) with another expression (rhs) under a
   * binary functor; evaluation is deferred.
   */
  template <typename BinaryOp, typename ExpressionType>
  const TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>
  binaryExpression(const BinaryOp& op, const ExpressionType& expr) const {
    return TensorBinaryOp<BinaryOp, const Derived, const ExpressionType, T>(
        op, derived(), expr);
  }
  // Element-wise comparison between two expressions (0/1 mask result).
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::cmp_eq<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator==(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::cmp_eq<T>(), expr);
  }
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::cmp_ne<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator!=(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::cmp_ne<T>(), expr);
  }
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::cmp_le<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator<=(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::cmp_le<T>(), expr);
  }
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::cmp_lt<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator<(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::cmp_lt<T>(), expr);
  }
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::cmp_ge<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator>=(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::cmp_ge<T>(), expr);
  }
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::cmp_gt<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator>(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::cmp_gt<T>(), expr);
  }
  // Element-wise logic between two expressions (no short-circuiting).
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::and_op<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator&&(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::and_op<T>(), expr);
  }
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::or_op<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator||(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::or_op<T>(), expr);
  }
  // Element-wise arithmetic between two expressions.
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::add<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator+(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::add<T>(), expr);
  }
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::sub<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator-(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::sub<T>(), expr);
  }
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::mul<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator*(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::mul<T>(), expr);
  }
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::div<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  operator/(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::div<T>(), expr);
  }
  // Element-wise min/max between two expressions.
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::min<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  min(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::min<T>(), expr);
  }
  template <typename ExpressionType>
  const TensorBinaryOp<hppl::binary::max<T>,
                       const Derived,
                       const ExpressionType,
                       T>
  max(const ExpressionType& expr) const {
    return binaryExpression(hppl::binary::max<T>(), expr);
  }
  /**
   * Element wise ternary expression.
   *
   * ternary conditional operator(?: operator).
   * The conditional expression returns one of two values depending on
   * the result of derived expression.
   * If derived expression evaluates to true, then expression1 is evaluated.
   * If derived expression evaluates to false, then expression2 is evaluated.
   */
  template <typename ExprType1, typename ExprType2>
  const TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>
  condition(const ExprType1& expr1, const ExprType2& expr2) const {
    return TensorTernaryOp<const Derived, const ExprType1, const ExprType2, T>(
        derived(), expr1, expr2);
  }
  // condition() overloads that lift scalar branches into TensorConstant nodes.
  template <typename ExprType>
  const TensorTernaryOp<
      const Derived,
      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
      const ExprType,
      T>
  condition(T p, const ExprType& expr) const {
    return condition(constant(p), expr);
  }
  template <typename ExprType>
  const TensorTernaryOp<
      const Derived,
      const ExprType,
      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
      T>
  condition(const ExprType& expr, T p) const {
    return condition(expr, constant(p));
  }
  const TensorTernaryOp<
      const Derived,
      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
      const TensorConstant<hppl::unary::constant<T>, const Derived, T>,
      T>
  condition(T p1, T p2) const {
    return condition(constant(p1), constant(p2));
  }
  /**
   * return a TensorConstant. A TensorConstant object hold a constant value.
   */
  const TensorConstant<hppl::unary::constant<T>, const Derived, T> constant(
      T p) const {
    return TensorConstant<hppl::unary::constant<T>, const Derived, T>(
        hppl::unary::constant<T>(p), derived());
  }
  /**
   * return a TensorAssignOp, and use AssignEvaluate to evaluate one or more
   * TensorAssignOp objects.
   */
  template <typename ExpressionType>
  TensorAssignOp<Derived, ExpressionType, T> lazyAssign(
      const ExpressionType& expr) const {
    return TensorAssignOp<Derived, ExpressionType, T>(derived(), expr);
  }

protected:
  // CRTP downcast: this object is always a Derived, by construction.
  const Derived& derived() const { return *static_cast<const Derived*>(this); }
};
/**
* \brief Unary Operator Expression
*/
template <class OP, typename ExprType, class T>
class TensorUnaryOp
    : public TensorUnaryOp_is_not_here_see_below<OP, ExprType, T> {
};
/**
* \brief Binary Operator Expression
*/
template <class OP, typename LhsType, typename RhsType, class T>
class TensorBinaryOp
    : public TensorExpression<TensorBinaryOp<OP, LhsType, RhsType, T>, T> {
public:
  // Expression node: holds the binary functor and copies of both operand
  // expression trees (cheap to copy -- expressions only hold references to
  // the underlying tensors).
  explicit TensorBinaryOp(const OP op, const LhsType& lhs, const RhsType& rhs)
      : op_(op), lhs_(lhs), rhs_(rhs) {}

  const OP op_;        // element-wise binary functor
  const LhsType lhs_;  // left operand expression
  const RhsType rhs_;  // right operand expression
};
/**
* \brief Ternary Operator Expression
*/
template <typename ExprType1, typename ExprType2, typename ExprType3, class T>
class TensorTernaryOp : public TensorExpression<
                            TensorTernaryOp<ExprType1, ExprType2, ExprType3, T>,
                            T> {
public:
  // Expression node for condition(expr1 ? expr2 : expr3): expr1_ is the
  // element-wise predicate, expr2_/expr3_ the two branches.
  explicit TensorTernaryOp(const ExprType1& expr1,
                           const ExprType2& expr2,
                           const ExprType3& expr3)
      : expr1_(expr1), expr2_(expr2), expr3_(expr3) {}

  const ExprType1 expr1_;  // predicate expression
  const ExprType2 expr2_;  // value when predicate is true
  const ExprType3 expr3_;  // value when predicate is false
};
/**
* \brief Constant Expression
*/
template <class OP, typename ExprType, class T>
class TensorConstant
    : public TensorExpression<TensorConstant<OP, ExprType, T>, T> {
public:
  // Expression node holding a constant value (inside op_); expr_ is kept
  // only so the node knows the shape/device of the tensor it broadcasts to.
  explicit TensorConstant(const OP op, const ExprType& expr)
      : op_(op), expr_(expr) {}

  const OP op_;          // constant functor carrying the value
  const ExprType expr_;  // reference expression supplying shape context
};
/**
* \brief operator+ overload
* \return a unary operator expression
*/
template <typename Derived, class T>
const TensorUnaryOp<hppl::unary::add_scale<T>, const Derived, T> operator+(
    T p, const TensorExpression<Derived, T>& expr) {
  // scalar + expr: addition commutes, forward to the member operator.
  return expr + p;
}
/**
* \brief operator* overload
* \return a unary operator expression
*/
template <typename Derived, class T>
const TensorUnaryOp<hppl::unary::mul_scale<T>, const Derived, T> operator*(
    T p, const TensorExpression<Derived, T>& expr) {
  // scalar * expr: multiplication commutes, forward to the member operator.
  return expr * p;
}
} // namespace paddle
#include "TensorApply.h"
#include "TensorEvaluate.h"
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/utils/Logging.h"
#include "BaseMatrix.h"
#include "TrainingAlgorithmOp.h"
#if __cplusplus > 199711L
#include "TensorAssign.h"
namespace paddle {
// Sparse-momentum update, fused via lazy assignment (one pass over the data):
//   u_t = u_{t-1} - \alpha \gamma \eta g_t
//   v_t = v_{t-1} + \tau \alpha \gamma \eta g_t
//   \theta_t = (\tau/\beta + 1/\alpha) u_t + (1/\beta) v_t
void sparseMomentumApply(BaseMatrix& value,
                         BaseMatrix& grad,
                         BaseMatrix& momU,
                         BaseMatrix& momV,
                         real alpha,
                         real beta,
                         real gamma,
                         real tau,
                         real learningRate) {
  auto expr1 = momU.lazyAssign(momU - (alpha * gamma * learningRate) * grad);
  auto expr2 = momV.lazyAssign(
      momV + (tau * alpha * gamma * learningRate) * grad);
  auto expr3 = value.lazyAssign(
      (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV);
  // Evaluate all three assignments in a single fused kernel pass.
  AssignEvaluate(expr1, expr2, expr3);
}
// AdaDelta update, fused via lazy assignment. Mirrors the eager variant:
//   E(g_t^2)  = \rou * E(g_{t-1}^2)  + (1-\rou) * g^2               (accum)
//   lr        = sqrt((E(dx^2)+eps) / (E(g^2)+eps))
//   E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (g*lr)^2          (accum_update)
void adadeltaApply(BaseMatrix& value,
                   BaseMatrix& grad,
                   BaseMatrix& mom,
                   BaseMatrix& accum,
                   BaseMatrix& accum_update,
                   BaseMatrix& lr,
                   real rou,
                   real epsilon,
                   real learningRate,
                   real momentum,
                   real decayRate) {
  auto expr1 = accum.lazyAssign(rou * accum + ((real)1 - rou) * grad.square());
  auto expr2 = lr.lazyAssign(
      ((accum_update + epsilon) / (accum + epsilon)).sqrt());
  auto expr3 = accum_update.lazyAssign(
      rou * accum_update + ((real)1 - rou) * (grad * lr).square());
  auto expr4 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr5 = value.lazyAssign(value + mom);
  // Evaluate all assignments in one fused pass; order of exprs matters.
  AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
}
// AdaGrad update, fused via lazy assignment:
//   accum += g^2
//   lr     = 1 / sqrt(accum_buffer + accum + eps)
//   mom    = mom*momentum - eta*lr*(g + value*decayRate);  value += mom
void adagradApply(BaseMatrix& value,
                  BaseMatrix& grad,
                  BaseMatrix& mom,
                  BaseMatrix& accum_buffer,
                  BaseMatrix& accum,
                  BaseMatrix& lr,
                  real epsilon,
                  real learningRate,
                  real momentum,
                  real decayRate) {
  auto expr1 = accum.lazyAssign(accum + grad.square());
  auto expr2 = lr.lazyAssign(
      (accum_buffer + accum + epsilon).sqrt().reciprocal());
  auto expr3 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr4 = value.lazyAssign(value + mom);
  AssignEvaluate(expr1, expr2, expr3, expr4);
}
// RMSProp update, fused via lazy assignment:
//   g  = accumulatedRou*g + (1-rou)*grad^2   (first call: full grad^2, so the
//        initial estimate of E(g^2) is not too small)
//   f  = accumulatedRou*f + (1-rou)*grad
//   lr = 1 / sqrt(g - f^2 + eps)
void rmspropApply(BaseMatrix& value,
                  BaseMatrix& grad,
                  BaseMatrix& mom,
                  BaseMatrix& g,
                  BaseMatrix& f,
                  BaseMatrix& lr,
                  real accumulatedRou,
                  real rou,
                  real epsilon,
                  real learningRate,
                  real momentum,
                  real decayRate,
                  bool firstTime) {
  auto expr2 = f.lazyAssign(accumulatedRou * f + ((real)1 - rou) * grad);
  auto expr3 = lr.lazyAssign((g - f.square() + epsilon).sqrt().reciprocal());
  auto expr4 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr5 = value.lazyAssign(value + mom);
  if (firstTime) {
    auto expr1 = g.lazyAssign(accumulatedRou * g + grad.square());
    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
  } else {
    auto expr1 = g.lazyAssign(
        accumulatedRou * g + ((real)1 - rou) * grad.square());
    AssignEvaluate(expr1, expr2, expr3, expr4, expr5);
  }
}
// Decayed AdaGrad update, fused via lazy assignment:
//   accum = accumulatedRou*accum + (1-rou)*grad^2  (first call: full grad^2)
//   lr    = 1 / sqrt(accum + eps)
void decayedAdagradApply(BaseMatrix& value,
                         BaseMatrix& grad,
                         BaseMatrix& mom,
                         BaseMatrix& accum,
                         BaseMatrix& lr,
                         real accumulatedRou,
                         real rou,
                         real epsilon,
                         real learningRate,
                         real momentum,
                         real decayRate,
                         bool firstTime) {
  auto expr2 = lr.lazyAssign((accum + epsilon).sqrt().reciprocal());
  auto expr3 = mom.lazyAssign(
      mom * momentum - learningRate * lr * (grad + value * decayRate));
  auto expr4 = value.lazyAssign(value + mom);
  if (firstTime) {
    auto expr1 = accum.lazyAssign(accumulatedRou * accum + grad.square());
    AssignEvaluate(expr1, expr2, expr3, expr4);
  } else {
    auto expr1 = accum.lazyAssign(
        accumulatedRou * accum + ((real)1 - rou) * grad.square());
    AssignEvaluate(expr1, expr2, expr3, expr4);
  }
}
// Adam update, fused via lazy assignment:
//   m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t
//   v_t = \beta_2 v_{t-1} + (1-\beta_2) g_t^2
//   \theta_t = \theta_{t-1} - \alpha m_t / (sqrt(v_t) + eps),
// where alpha folds in the bias corrections for both moments.
void adamApply(BaseMatrix& value,
               BaseMatrix& grad,
               BaseMatrix& mom,  // first moment
               BaseMatrix& v,    // second moment
               real beta1,
               real beta2,
               real beta1_power,
               real beta2_power,
               real epsilon,
               real learningRate) {
  // Bias-corrected step size.
  real alpha = learningRate *
      std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
  auto expr2 = v.lazyAssign(beta2 * v + ((real)1 - beta2) * grad.square());
  auto expr3 = value.lazyAssign(
      value - (mom * alpha) / (v.sqrt() + epsilon));
  AssignEvaluate(expr1, expr2, expr3);
}
// AdaMax update, fused via lazy assignment:
//   m_t = \beta_1 m_{t-1} + (1-\beta_1) g_t
//   u_t = max(\beta_2 u_{t-1}, |g_t|)
//   \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t)) m_t / u_t
void adamaxApply(BaseMatrix& value,
                 BaseMatrix& grad,
                 BaseMatrix& mom,  // first moment
                 BaseMatrix& u,    // weighted infinity norm
                 real beta1,
                 real beta2,
                 int64_t step,
                 real alpha) {
  auto expr1 = mom.lazyAssign(beta1 * mom + ((real)1 - beta1) * grad);
  auto expr2 = u.lazyAssign(
      (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs()));
  auto expr3 = value.lazyAssign(
      value - (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u));
  AssignEvaluate(expr1, expr2, expr3);
}
} // namespace paddle
#else
namespace paddle {
// Eager (pre-C++11) fallback: same math as the lazy variant above, but each
// statement is evaluated immediately.
void sparseMomentumApply(BaseMatrix& value,
                         BaseMatrix& grad,
                         BaseMatrix& momU,
                         BaseMatrix& momV,
                         real alpha,
                         real beta,
                         real gamma,
                         real tau,
                         real learningRate) {
  /**
   * \alpha_t = \alpha_{t-1} / k
   * \beta_t = \beta_{t-1} / (1 + \lambda\gamma_t)
   * u_t = u_{t-1} - \alpha_t \gamma_t g_t
   * v_t = v_{t-1} + \tau_{t-1} \alpha_t \gamma_t g_t
   * \tau_t = \tau_{t-1} + \beta_t / \alpha_t
   */
  momU -= (alpha * gamma * learningRate) * grad;
  momV += (tau * alpha * gamma * learningRate) * grad;
  value = (tau / beta + (real)1 / alpha) * momU + ((real)1 / beta) * momV;
}
// Eager (pre-C++11) AdaDelta fallback; same math as the lazy variant.
void adadeltaApply(BaseMatrix& value,
                   BaseMatrix& grad,
                   BaseMatrix& mom,
                   BaseMatrix& accum,
                   BaseMatrix& accum_update,
                   BaseMatrix& lr,
                   real rou,
                   real epsilon,
                   real learningRate,
                   real momentum,
                   real decayRate) {
  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
  accum = rou * accum + ((real)1 - rou) * grad.square();
  // learn_rate: sqrt(( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ))
  lr = ((accum_update + epsilon) / (accum + epsilon)).sqrt();
  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
  accum_update = rou * accum_update + ((real)1 - rou) * (grad * lr).square();
  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
  value += mom;
}
// Eager (pre-C++11) AdaGrad fallback; same math as the lazy variant.
void adagradApply(BaseMatrix& value,
                  BaseMatrix& grad,
                  BaseMatrix& mom,
                  BaseMatrix& accum_buffer,
                  BaseMatrix& accum,
                  BaseMatrix& lr,
                  real epsilon,
                  real learningRate,
                  real momentum,
                  real decayRate) {
  // accumulate squared gradient; lr = 1/sqrt(total accumulation + eps)
  accum += grad.square();
  lr = (accum_buffer + accum + epsilon).sqrt().reciprocal();
  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
  value += mom;
}
// Eager (pre-C++11) RMSProp fallback; same math as the lazy variant.
void rmspropApply(BaseMatrix& value,
                  BaseMatrix& grad,
                  BaseMatrix& mom,
                  BaseMatrix& g,
                  BaseMatrix& f,
                  BaseMatrix& lr,
                  real accumulatedRou,
                  real rou,
                  real epsilon,
                  real learningRate,
                  real momentum,
                  real decayRate,
                  bool firstTime) {
  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
  // For the first time update, make the sum be the current square
  // so that the initial estimation of E(g_t^2) will not be too small.
  if (firstTime) {
    g = accumulatedRou * g + grad.square();
  } else {
    g = accumulatedRou * g + ((real)1 - rou) * grad.square();
  }
  // E(f_t) = \rou * E(f_{t-1}) + (1-\rou) * g
  f = accumulatedRou * f + ((real)1 - rou) * grad;
  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(f_t))^2 + epsilon )
  // Basically if the sign of the gradient changes more often,
  // the learning rate will be decreased.
  lr = (g - f.square() + epsilon).sqrt().reciprocal();
  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
  value += mom;
}
// Eager (pre-C++11) decayed-AdaGrad fallback; same math as the lazy variant.
void decayedAdagradApply(BaseMatrix& value,
                         BaseMatrix& grad,
                         BaseMatrix& mom,
                         BaseMatrix& accum,
                         BaseMatrix& lr,
                         real accumulatedRou,
                         real rou,
                         real epsilon,
                         real learningRate,
                         real momentum,
                         real decayRate,
                         bool firstTime) {
  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
  // For the first time update, make the sum be the current square
  // so that the initial estimation of E(g_t^2) will not be too small.
  if (firstTime) {
    accum = accumulatedRou * accum + grad.square();
  } else {
    accum = accumulatedRou * accum + ((real)1 - rou) * grad.square();
  }
  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
  // Basically if the bigger the magnitude gradient is,
  // the smaller the learning rate will be.
  lr = (accum + epsilon).sqrt().reciprocal();
  mom = mom * momentum - learningRate * lr * (grad + value * decayRate);
  value += mom;
}
// Eager (pre-C++11) Adam fallback; same math as the lazy variant.
void adamApply(BaseMatrix& value,
               BaseMatrix& grad,
               BaseMatrix& mom,  // first moment
               BaseMatrix& v,    // second moment
               real beta1,
               real beta2,
               real beta1_power,
               real beta2_power,
               real epsilon,
               real learningRate) {
  // Bias-corrected step size.
  real alpha = learningRate *
      std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
  mom = beta1 * mom + ((real)1 - beta1) * grad;
  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
  v = beta2 * v + ((real)1 - beta2) * grad.square();
  value -= (mom * alpha) / (v.sqrt() + epsilon);
}
// Eager (pre-C++11) AdaMax fallback; same math as the lazy variant.
void adamaxApply(BaseMatrix& value,
                 BaseMatrix& grad,
                 BaseMatrix& mom,  // first moment
                 BaseMatrix& u,    // weighted infinity norm
                 real beta1,
                 real beta2,
                 int64_t step,
                 real alpha) {
  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
  mom = beta1 * mom + ((real)1 - beta1) * grad;
  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
  u = (beta2 * u > grad.abs()).condition(beta2 * u, grad.abs());
  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
  value -= (alpha / ((real)1 - (real)std::pow(beta1, step))) * (mom / u);
}
} // namespace paddle
#endif
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/utils/Logging.h"
#include "BaseMatrix.h"
namespace paddle {
/**
* \brief Sparse Momentum optimizer.
*/
// Updates value/momU/momV in place; see TrainingAlgorithmOp.cpp for formulas.
extern void sparseMomentumApply(BaseMatrix& value,
                                BaseMatrix& grad,
                                BaseMatrix& momU,
                                BaseMatrix& momV,
                                real alpha,
                                real beta,
                                real gamma,
                                real tau,
                                real learningRate);
/**
* \brief AdaDelta optimizer.
*/
extern void adadeltaApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& sum,
BaseMatrix& sum1,
BaseMatrix& mom,
BaseMatrix& lr,
real rou,
real epsilon,
real learningRate,
real momentum,
real decayRate);
/**
* \brief AdaGrad optimizer.
*/
extern void adagradApply(BaseMatrix& value,
BaseMatrix& grad,
BaseMatrix& sum,
BaseMatrix& sum1,
BaseMatrix& mom,
BaseMatrix& lr,
real epsilon,
real learningRate,
real momentum,
real decayRate);
/**
 * \brief RMSProp optimizer.
 *
 * Parameter names follow the definition in TrainingAlgorithmOp.cpp:
 * g accumulates E(g^2), f accumulates E(g). (The previous declaration
 * listed g/f/mom in an order that did not match the definition's
 * positional meaning.)
 */
extern void rmspropApply(BaseMatrix& value,
                         BaseMatrix& grad,
                         BaseMatrix& mom,
                         BaseMatrix& g,
                         BaseMatrix& f,
                         BaseMatrix& lr,
                         real accumulatedRou,
                         real rou,
                         real epsilon,
                         real learningRate,
                         real momentum,
                         real decayRate,
                         bool firstTime);
/**
* \brief Decayed AdaGrad optimizer.
*/
// accum holds the decayed squared-gradient sum; firstTime seeds it with the
// full grad^2. See TrainingAlgorithmOp.cpp for the update formulas.
extern void decayedAdagradApply(BaseMatrix& value,
                                BaseMatrix& grad,
                                BaseMatrix& mom,
                                BaseMatrix& accum,
                                BaseMatrix& lr,
                                real accumulatedRou,
                                real rou,
                                real epsilon,
                                real learningRate,
                                real momentum,
                                real decayRate,
                                bool firstTime);
/**
* \brief Adam optimizer.
*/
// mom is the first moment, v the second; beta1_power/beta2_power are the
// accumulated \beta^t terms used for bias correction.
extern void adamApply(BaseMatrix& value,
                      BaseMatrix& grad,
                      BaseMatrix& mom,
                      BaseMatrix& v,
                      real beta1,
                      real beta2,
                      real beta1_power,
                      real beta2_power,
                      real epsilon,
                      real learningRate);
/**
* \brief AdaMax optimizer.
*/
extern void adamaxApply(BaseMatrix& value,
                        BaseMatrix& grad,
                        BaseMatrix& mom,  // first moment
                        BaseMatrix& u,    // weighted infinity norm
                        real beta1,
                        real beta2,
                        int64_t step,
                        real alpha);
} // namespace paddle
...@@ -265,6 +265,15 @@ public: ...@@ -265,6 +265,15 @@ public:
/// print the "idx" element of the Vector /// print the "idx" element of the Vector
virtual void printOneElement(std::ostream& os, size_t idx) const = 0; virtual void printOneElement(std::ostream& os, size_t idx) const = 0;
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
if (BaseVector<T>::useGpu_) {
TensorGpuApply<T>(*this, expr);
} else {
TensorCpuApply<T>(*this, expr);
}
}
protected: protected:
friend class GpuVectorT<T>; friend class GpuVectorT<T>;
friend class CpuVectorT<T>; friend class CpuVectorT<T>;
...@@ -322,6 +331,11 @@ public: ...@@ -322,6 +331,11 @@ public:
virtual void print(std::ostream& os, size_t num) const; virtual void print(std::ostream& os, size_t num) const;
virtual void printOneElement(std::ostream& os, size_t idx) const; virtual void printOneElement(std::ostream& os, size_t idx) const;
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorGpuApply<T>(*this, expr);
}
protected: protected:
virtual void copyTo(CpuVectorT<T>* dest) const; virtual void copyTo(CpuVectorT<T>* dest) const;
virtual void copyTo(GpuVectorT<T>* dest) const; virtual void copyTo(GpuVectorT<T>* dest) const;
...@@ -385,6 +399,11 @@ public: ...@@ -385,6 +399,11 @@ public:
virtual T get(size_t pos); virtual T get(size_t pos);
virtual void print(std::ostream& os, size_t num) const; virtual void print(std::ostream& os, size_t num) const;
virtual void printOneElement(std::ostream& os, size_t idx) const; virtual void printOneElement(std::ostream& os, size_t idx) const;
template <typename ExpressionType>
void operator=(const ExpressionType& expr) {
TensorCpuApply<T>(*this, expr);
}
}; };
template <class T> template <class T>
......
...@@ -2,6 +2,7 @@ ...@@ -2,6 +2,7 @@
add_simple_unittest(test_ExecViaCpu) add_simple_unittest(test_ExecViaCpu)
add_simple_unittest(test_SIMDFunctions) add_simple_unittest(test_SIMDFunctions)
add_simple_unittest(test_TrainingAlgorithm)
add_simple_unittest(test_SparseMatrix) add_simple_unittest(test_SparseMatrix)
# TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference. # TODO(yuyang18): Refactor TestUtil.cpp. Remove this cross module reference.
...@@ -13,6 +14,21 @@ add_simple_unittest(test_sparseMatrixCompare) ...@@ -13,6 +14,21 @@ add_simple_unittest(test_sparseMatrixCompare)
add_simple_unittest(test_perturbation) add_simple_unittest(test_perturbation)
add_simple_unittest(test_CpuGpuVector) add_simple_unittest(test_CpuGpuVector)
add_simple_unittest(test_Allocator) add_simple_unittest(test_Allocator)
if(WITH_GPU)
if(COMPILER_SUPPORT_CXX11)
CUDA_ADD_EXECUTABLE(test_Tensor test_Tensor.cu)
link_paddle_test(test_Tensor)
CUDA_ADD_EXECUTABLE(test_lazyAssign test_lazyAssign.cu)
link_paddle_test(test_lazyAssign)
endif()
else()
compile_cu_as_cpp(test_Tensor.cu)
add_unittest(test_Tensor test_Tensor.cu)
compile_cu_as_cpp(test_lazyAssign.cu)
add_unittest(test_lazyAssign test_lazyAssign.cu)
endif(WITH_GPU)
add_simple_unittest(test_FPException) add_simple_unittest(test_FPException)
add_simple_unittest(test_GpuProfiler) add_simple_unittest(test_GpuProfiler)
add_simple_unittest(test_BaseMatrix) add_simple_unittest(test_BaseMatrix)
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "paddle/utils/GlobalConstants.h"
#include "paddle/math/Vector.h"
using namespace paddle; // NOLINT
// Reference (non-fused) sparse-momentum update used to cross-check the
// BaseMatrix implementations:
//   u_t = u_{t-1} - \alpha \gamma \eta g_t
//   v_t = v_{t-1} + \tau \alpha \gamma \eta g_t
//   \theta_t = (\tau/\beta + 1/\alpha) u_t + (1/\beta) v_t
void SparseMomentumParameterOptimizer(const VectorPtr vecs[],
                                      real alpha,
                                      real beta,
                                      real gamma,
                                      real tau,
                                      real learningRate) {
  vecs[PARAMETER_MOMENTUM_UT]->add(*vecs[PARAMETER_GRADIENT],
                                   -alpha * gamma * learningRate);
  vecs[PARAMETER_MOMENTUM_VT]->add(*vecs[PARAMETER_GRADIENT],
                                   tau * alpha * gamma * learningRate);
  vecs[PARAMETER_VALUE]->add(*vecs[PARAMETER_MOMENTUM_UT],
                             tau / beta + 1.0 / alpha,
                             *vecs[PARAMETER_MOMENTUM_VT],
                             1.0 / beta);
}
// Reference (non-fused) AdaGrad update used to cross-check the BaseMatrix
// implementation: accumulate g^2, derive lr = 1/sqrt(sum + eps), then SGD.
void AdagradParameterOptimizer(const VectorPtr vecs[],
                               real epsilon,
                               real learningRate,
                               real momentum,
                               real decayRate) {
  // current-epoch accumulation of squared gradients
  vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT],
                                                1.0f);
  // lr = 1 / sqrt(historical + current + epsilon)
  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM],
                                     *vecs[PARAMETER_GRADIENT_SQURESUM1]);
  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
                                   *vecs[PARAMETER_MOMENTUM],
                                   *vecs[PARAMETER_LEARNING_RATE],
                                   learningRate,
                                   momentum,
                                   decayRate);
}
// Reference (non-fused) AdaDelta update used to cross-check the BaseMatrix
// implementation.
void AdaDeltaParameterOptimizer(const VectorPtr vecs[],
                                real rou,
                                real epsilon,
                                real learningRate,
                                real momentum,
                                real decayRate) {
  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
      *vecs[PARAMETER_GRADIENT], rou, 1.0f - rou);
  // learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) )
  vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1],
                                        *vecs[PARAMETER_GRADIENT_SQURESUM],
                                        epsilon,
                                        epsilon);
  vecs[PARAMETER_LEARNING_RATE]->sqrt2();
  // E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2
  vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul(
      *vecs[PARAMETER_GRADIENT],
      *vecs[PARAMETER_LEARNING_RATE],
      rou,
      1.0f - rou);
  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
                                   *vecs[PARAMETER_MOMENTUM],
                                   *vecs[PARAMETER_LEARNING_RATE],
                                   learningRate,
                                   momentum,
                                   decayRate);
}
// Reference (non-fused) RMSProp update used to cross-check the BaseMatrix
// implementation.
void RMSPropParameterOptimizer(const VectorPtr vecs[],
                               real accumulatedRou,
                               real rou,
                               real epsilon,
                               real learningRate,
                               real momentum,
                               real decayRate,
                               bool firstTime) {
  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
  // For the first time update, make the sum be the current square
  // so that the initial estimation of E(g_t^2) will not be too small.
  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
  // E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g
  vecs[PARAMETER_GRADIENT_SQURESUM1]->add(
      *vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou);
  // learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon )
  // Basically if the sign of the gradient changes more often,
  // the learning rate will be decreased.
  vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]);
  vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1],
                                           -1.0f);
  vecs[PARAMETER_LEARNING_RATE]->add(epsilon);
  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
                                   *vecs[PARAMETER_MOMENTUM],
                                   *vecs[PARAMETER_LEARNING_RATE],
                                   learningRate,
                                   momentum,
                                   decayRate);
}
// Reference (non-fused) decayed-AdaGrad update used to cross-check the
// BaseMatrix implementation.
void DecayedAdagradParameterOptimizer(const VectorPtr vecs[],
                                      real accumulatedRou,
                                      real rou,
                                      real epsilon,
                                      real learningRate,
                                      real momentum,
                                      real decayRate,
                                      bool firstTime) {
  // E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2
  // For the first time update, make the sum be the current square
  // so that the initial estimation of E(g_t^2) will not be too small.
  vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare(
      *vecs[PARAMETER_GRADIENT], accumulatedRou, firstTime ? 1.0f : 1.0f - rou);
  // learn_rate = 1/sqrt( ( E(g_t^2) + epsilon )
  // Basically if the bigger the magnitude gradient is,
  // the smaller the learning rate will be.
  vecs[PARAMETER_LEARNING_RATE]->assign(epsilon);
  vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]);
  vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
  vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
                                   *vecs[PARAMETER_MOMENTUM],
                                   *vecs[PARAMETER_LEARNING_RATE],
                                   learningRate,
                                   momentum,
                                   decayRate);
}
// Reference (non-fused) Adam update used to cross-check the BaseMatrix
// implementation. NOTE: g (the gradient vector) is used as scratch space
// and is clobbered by square2/sqrt2/dotDiv below.
void AdamParameterOptimizer(const VectorPtr vecs[],
                            real beta1,
                            real beta2,
                            real beta1_power,
                            real beta2_power,
                            real epsilon,
                            real learningRate) {
  Vector* m = vecs[PARAMETER_MOMENTUM].get();
  Vector* g = vecs[PARAMETER_GRADIENT].get();
  Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get();
  Vector* theta = vecs[PARAMETER_VALUE].get();
  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
  m->add(*g, beta1, 1 - beta1);
  // v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
  g->square2();
  v->add(*g, beta2, 1 - beta2);
  // tmp = m_t / ( \sqrt{v_t} + \epsilon )
  // \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp
  g->sqrt2(*v);
  g->dotDiv(*m, *g, 0., epsilon);
  real alpha =
      learningRate * std::sqrt((real)1 - beta2_power) / ((real)1 - beta1_power);
  theta->add(*theta, 1.0, *g, -alpha);
}
// Reference (non-fused) AdaMax update used to cross-check the BaseMatrix
// implementation. NOTE: g (the gradient vector) is clobbered by abs2/dotDiv.
void AdamaxParameterOptimizer(
    const VectorPtr vecs[], real beta1, real beta2, int64_t step, real alpha) {
  Vector* m = vecs[PARAMETER_MOMENTUM].get();
  Vector* g = vecs[PARAMETER_GRADIENT].get();
  Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
  Vector* theta = vecs[PARAMETER_VALUE].get();
  // m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
  m->add(*g, beta1, 1 - beta1);
  // u_t = max(\beta_2*u_{t-1}, abs(g_t))
  u->mulScalar(beta2);
  g->abs2();
  u->max2(*u, *g);
  // \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t
  g->dotDiv(*m, *u);
  real learningRate = alpha / (1 - std::pow(beta1, step));
  theta->add(*theta, 1.0, *g, -learningRate);
}
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
// Performance Check
// EXPRESSION_PERFORMANCE(expr): run expr once for warm-up, then 20 timed
// iterations, and log the accumulated timer stats. With PADDLE_DISABLE_TIMER
// it degenerates to a plain evaluation of expr.
#ifdef PADDLE_DISABLE_TIMER
#define EXPRESSION_PERFORMANCE(expression) expression;
#else
#include "paddle/utils/Stat.h"
using namespace paddle;  // NOLINT
// The stringified expression is truncated to 29 characters (+ NUL) for use
// as the timer name; strncpy pads with NULs when shorter, so expr[29] is a
// reliable "was it truncated" probe, and truncation is marked with "..".
#define EXPRESSION_PERFORMANCE(expression) \
  do { \
    char expr[30]; \
    strncpy(expr, #expression, 30); \
    if (expr[29] != '\0') { \
      expr[27] = '.'; \
      expr[28] = '.'; \
      expr[29] = '\0'; \
    } \
    expression; \
    for (int i = 0; i < 20; i++) { \
      REGISTER_TIMER(expr); \
      expression; \
    } \
    LOG(INFO) << std::setiosflags(std::ios::left) << std::setfill(' ') \
              << *globalStat.getStat(expr); \
    globalStat.reset(); \
  } while (0)
#endif
...@@ -37,13 +37,13 @@ TEST(BaseMatrix, void) { ...@@ -37,13 +37,13 @@ TEST(BaseMatrix, void) {
}; };
compare(&BaseMatrix::neg); compare(&BaseMatrix::neg);
compare(&BaseMatrix::exp); compare(&BaseMatrix::exp2);
compare(&BaseMatrix::log); compare(&BaseMatrix::log2);
compare(&BaseMatrix::sqrt); compare(&BaseMatrix::sqrt2);
compare(&BaseMatrix::square); compare(&BaseMatrix::square2);
compare(&BaseMatrix::reciprocal); compare(&BaseMatrix::reciprocal2);
compare(&BaseMatrix::abs); compare(&BaseMatrix::abs2);
compare(&BaseMatrix::sign); compare(&BaseMatrix::sign2);
compare(&BaseMatrix::zero); compare(&BaseMatrix::zero);
compare(&BaseMatrix::one); compare(&BaseMatrix::one);
} }
...@@ -59,7 +59,7 @@ TEST(BaseMatrix, real) { ...@@ -59,7 +59,7 @@ TEST(BaseMatrix, real) {
test.cmpWithoutArg<0>(f, height, width); test.cmpWithoutArg<0>(f, height, width);
}; };
compare(&BaseMatrix::pow); compare(&BaseMatrix::pow2);
compare(&BaseMatrix::subScalar); compare(&BaseMatrix::subScalar);
compare(&BaseMatrix::mulScalar); compare(&BaseMatrix::mulScalar);
compare(&BaseMatrix::divScalar); compare(&BaseMatrix::divScalar);
...@@ -88,21 +88,21 @@ TEST(BaseMatrix, BaseMatrix) { ...@@ -88,21 +88,21 @@ TEST(BaseMatrix, BaseMatrix) {
compare(&BaseMatrix::softreluDerivative); compare(&BaseMatrix::softreluDerivative);
compare(&BaseMatrix::brelu); compare(&BaseMatrix::brelu);
compare(&BaseMatrix::breluDerivative); compare(&BaseMatrix::breluDerivative);
compare(&BaseMatrix::square); compare(&BaseMatrix::square2);
compare(&BaseMatrix::squareDerivative); compare(&BaseMatrix::squareDerivative);
compare(&BaseMatrix::tanh); compare(&BaseMatrix::tanh);
compare(&BaseMatrix::tanhDerivative); compare(&BaseMatrix::tanhDerivative);
compare(&BaseMatrix::reciprocal); compare(&BaseMatrix::reciprocal2);
compare(&BaseMatrix::reciprocalDerivative); compare(&BaseMatrix::reciprocalDerivative);
compare(&BaseMatrix::abs); compare(&BaseMatrix::abs2);
compare(&BaseMatrix::absDerivative); compare(&BaseMatrix::absDerivative);
compare(&BaseMatrix::sigmoid); compare(&BaseMatrix::sigmoid);
compare(&BaseMatrix::sigmoidDerivative); compare(&BaseMatrix::sigmoidDerivative);
compare(&BaseMatrix::expDerivative); compare(&BaseMatrix::expDerivative);
compare(&BaseMatrix::sign); compare(&BaseMatrix::sign2);
compare(&BaseMatrix::exp); compare(&BaseMatrix::exp2);
compare(&BaseMatrix::log); compare(&BaseMatrix::log2);
compare(&BaseMatrix::sqrt); compare(&BaseMatrix::sqrt2);
compare(&BaseMatrix::dotMul); compare(&BaseMatrix::dotMul);
compare(&BaseMatrix::dotMulSquare); compare(&BaseMatrix::dotMulSquare);
compare(&BaseMatrix::dotSquareMul); compare(&BaseMatrix::dotSquareMul);
...@@ -143,7 +143,7 @@ TEST(BaseMatrix, BaseMatrix_real) { ...@@ -143,7 +143,7 @@ TEST(BaseMatrix, BaseMatrix_real) {
compare(&BaseMatrix::addBias); compare(&BaseMatrix::addBias);
compare(&BaseMatrix::add); compare(&BaseMatrix::add);
compare(&BaseMatrix::sub); compare(&BaseMatrix::sub);
compare(&BaseMatrix::pow); compare(&BaseMatrix::pow2);
compare(&BaseMatrix::addScalar); compare(&BaseMatrix::addScalar);
compare(&BaseMatrix::subScalar); compare(&BaseMatrix::subScalar);
compare(&BaseMatrix::mulScalar); compare(&BaseMatrix::mulScalar);
...@@ -176,7 +176,7 @@ TEST(BaseMatrix, BaseMatrix_BaseMatrix) { ...@@ -176,7 +176,7 @@ TEST(BaseMatrix, BaseMatrix_BaseMatrix) {
compare(&BaseMatrix::logisticRegressionLoss); compare(&BaseMatrix::logisticRegressionLoss);
compare(&BaseMatrix::logisticRegressionLossBp); compare(&BaseMatrix::logisticRegressionLossBp);
compare(&BaseMatrix::biggerThan); compare(&BaseMatrix::biggerThan);
compare(&BaseMatrix::max); compare(&BaseMatrix::max2);
compare(&BaseMatrix::dotMulSquare); compare(&BaseMatrix::dotMulSquare);
compare(&BaseMatrix::dotSquareSquare); compare(&BaseMatrix::dotSquareSquare);
} }
......
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/math/Matrix.h"
#include "TensorCheck.h"
using paddle::Matrix;
using paddle::CpuMatrix;
using paddle::GpuMatrix;
using paddle::CpuVector;
using paddle::GpuVector;
using paddle::CpuIVector;
using paddle::GpuIVector;
using autotest::TensorCheckEqual;
using autotest::TensorCheckErr;
// Declare two identical random tensors: A1 is fed to the BaseMatrix API
// under test, A2 to the equivalent tensor-expression form.
#define INIT_UNARY(A1, A2) \
  Tensor A1(height, width); \
  Tensor A2(height, width); \
  A1.randomizeUniform(); \
  A2.copyFrom(A1)

// As INIT_UNARY, plus one independent random operand B.
#define INIT_BINARY(A1, A2, B) \
  INIT_UNARY(A1, A2); \
  Tensor B(height, width); \
  B.randomizeUniform()

// As INIT_BINARY, plus a second independent operand C.
#define INIT_TERNARY(A1, A2, B, C) \
  INIT_BINARY(A1, A2, B); \
  Tensor C(height, width); \
  C.randomizeUniform()

// As INIT_TERNARY, plus a third independent operand D.
#define INIT_QUATERNARY(A1, A2, B, C, D) \
  INIT_TERNARY(A1, A2, B, C); \
  Tensor D(height, width); \
  D.randomizeUniform()
// Runs a unary-op test functor over a grid of matrix shapes (including
// height/width of 1 to exercise edge cases).
template<typename Tensor>
struct TestUnaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;

  explicit TestUnaryMatrix(UnaryFunc testUnaryFunc) {
    for (auto height : {1, 11, 73, 128, 200, 330}) {
      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
        LOG(INFO) << " height=" << height << " width=" << width;
        INIT_UNARY(A1, A2);
        testUnaryFunc(A1, A2);
      }
    }
  }
};
// Runs a binary-op test functor (one extra random operand B) over the
// same grid of matrix shapes as TestUnaryMatrix.
template<typename Tensor>
struct TestBinaryMatrix {
  typedef std::function<void(Tensor& A1, Tensor& A2, Tensor& B)> BinaryFunc;

  explicit TestBinaryMatrix(BinaryFunc testBinaryFunc) {
    for (auto height : {1, 11, 73, 128, 200, 330}) {
      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
        LOG(INFO) << " height=" << height << " width=" << width;
        INIT_BINARY(A1, A2, B);
        testBinaryFunc(A1, A2, B);
      }
    }
  }
};
// Runs a ternary-op test functor (operands B and C) over the shape grid.
template<typename Tensor>
struct TestTernaryMatrix {
  typedef std::function<void(
      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C)> TernaryFunc;

  explicit TestTernaryMatrix(TernaryFunc testTernaryFunc) {
    for (auto height : {1, 11, 73, 128, 200, 330}) {
      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
        LOG(INFO) << " height=" << height << " width=" << width;
        INIT_TERNARY(A1, A2, B, C);
        testTernaryFunc(A1, A2, B, C);
      }
    }
  }
};
// Runs a quaternary-op test functor (operands B, C, D) over the shape grid.
template<typename Tensor>
struct TestQuaternaryMatrix {
  typedef std::function<void(
      Tensor& A1, Tensor& A2, Tensor& B, Tensor& C, Tensor& D)> QuaternaryFunc;

  explicit TestQuaternaryMatrix(QuaternaryFunc testQuaternaryFunc) {
    for (auto height : {1, 11, 73, 128, 200, 330}) {
      for (auto width : {1, 32, 100, 512, 1000, 3210}) {
        LOG(INFO) << " height=" << height << " width=" << width;
        INIT_QUATERNARY(A1, A2, B, C, D);
        testQuaternaryFunc(A1, A2, B, C, D);
      }
    }
  }
};
// Runs a unary-op test functor over a range of vector lengths. T selects
// the element type: real vectors are filled via rand(), integer vectors via
// rand(1000) (presumably values in [0, 1000) — confirm against Vector API).
template<typename Tensor, class T>
struct TestUnaryVectorT {
  typedef std::function<void(Tensor& A1, Tensor& A2)> UnaryFunc;

  explicit TestUnaryVectorT(UnaryFunc testUnaryFunc) {
    for (auto size : {1, 11, 73, 128, 200, 330, 512, 1000, 4210}) {
      LOG(INFO) << " size=" << size;
      Tensor A1(size);
      Tensor A2(size);
      if (typeid(T) == typeid(real)) {
        A1.rand();
      } else {
        A1.rand(1000);
      }
      A2.copyFrom(A1);
      testUnaryFunc(A1, A2);
    }
  }
};
// Overwrite one randomly chosen element per row of `matrix` with `value`.
// Used by tests to guarantee that specific boundary/threshold values
// actually occur in otherwise-random data.
void SetTensorValue(Matrix& matrix, real value) {
  int height = matrix.getHeight();
  int width = matrix.getWidth();
  int stride = matrix.getStride();
  real* data = matrix.getData();
  for (int i = 0; i < height; i++) {
    int j = rand() % width;  // NOLINT
    if (typeid(matrix) == typeid(CpuMatrix)) {
      data[i * stride + j] = value;
    } else if (typeid(matrix) == typeid(GpuMatrix)) {
      // For GPU matrices `data` is presumably a device pointer, so the
      // value is copied host-to-device — confirm hl_memcpy semantics.
      hl_memcpy(&data[i * stride + j], &value, sizeof(real));
    } else {
      // NOTE(review): other Matrix subclasses are silently ignored.
    }
  }
}
template<typename Tensor>
void testTensorAddScalar(Tensor& A1, Tensor& A2) {
real p1 = 2.5;
real p2 = 3.0;
A1.add(p1); // a += p
A2 += p1;
TensorCheckEqual(A1, A2);
A1.add(p1, p2); // a = a * p1 + p2
A2 = A2 * p1 + p2;
TensorCheckEqual(A1, A2);
}
template<typename Tensor>
void testTensorSubScalar(Tensor& A1, Tensor& A2) {
real p = 2.5;
A1.subScalar(p); // a -= p
A2 -= p;
TensorCheckEqual(A1, A2);
}
template<typename Tensor>
void testTensorMulScalar(Tensor& A1, Tensor& A2) {
real p = 2.5;
A1.mulScalar(p); // a *= p
A2 *= p;
TensorCheckEqual(A1, A2);
real learningRate = 0.7f;
real decayRate = 1.2f;
A1.applyL2(learningRate, decayRate);
A2 = A2 * (1.0f / (1.0f + learningRate * decayRate));
TensorCheckEqual(A1, A2);
}
template<typename Tensor>
void testTensorDivScalar(Tensor& A1, Tensor& A2) {
real p = 2.5;
A1.divScalar(p); // a /= p
A2 /= p;
TensorCheckEqual(A1, A2);
}
// neg(): element-wise negation, cross-checked against unary minus.
template<typename Tensor>
void testTensorNeg(Tensor& A1, Tensor& A2) {
  A1.neg();  // a = -a
  A2 = -A2;
  TensorCheckEqual(A1, A2);
}
// abs2(): a = |a|.
template<typename Tensor>
void testTensorAbs(Tensor& A1, Tensor& A2) {
  A1.abs2();  // a = a > 0 ? a : -a
  A2 = A2.abs();
  TensorCheckEqual(A1, A2);
}

// square2(): a = a * a.
template<typename Tensor>
void testTensorSquare(Tensor& A1, Tensor& A2) {
  A1.square2();  // a = a * a
  A2 = A2.square();
  TensorCheckEqual(A1, A2);
}

// reciprocal2(): a = 1 / a.
template<typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2) {
  A1.reciprocal2();  // a = 1.0f / a
  A2 = A2.reciprocal();
  TensorCheckEqual(A1, A2);
}

// sign2(): a = sign(a), i.e. -1, 0, or 1.
template<typename Tensor>
void testTensorSign(Tensor& A1, Tensor& A2) {
  A1.sign2();  // a = (a > 0) - (a < 0)
  A2 = A2.sign();
  TensorCheckEqual(A1, A2);
}

// assign(p), one(), zero(): constant fills.
template<typename Tensor>
void testTensorAssign(Tensor& A1, Tensor& A2) {
  A1.assign(1.5);  // a = p
  A2 = A2.constant(1.5);
  TensorCheckEqual(A1, A2);

  A1.one();  // a = 1
  A2 = A2.constant(1.0);
  TensorCheckEqual(A1, A2);

  A1.zero();  // a = 0
  A2 = A2.constant(0.0);
  TensorCheckEqual(A1, A2);
}
// Driver: runs every scalar/element-wise base-op test on one tensor pair.
template<typename Tensor>
void testUnaryBaseOp(Tensor& A1, Tensor& A2) {
  testTensorAddScalar(A1, A2);
  testTensorSubScalar(A1, A2);
  testTensorMulScalar(A1, A2);
  testTensorDivScalar(A1, A2);
  testTensorNeg(A1, A2);
  testTensorAbs(A1, A2);
  testTensorSquare(A1, A2);
  testTensorReciprocal(A1, A2);
  testTensorSign(A1, A2);
  testTensorAssign(A1, A2);
}
// Integer-vector variant of the base-op driver: only add, neg, and abs are
// exercised for IVector types.
template<typename Tensor>
void testUnaryBaseOpInt(Tensor& A1, Tensor& A2) {
  A1.add(2);  // a += p
  A2 += 2;
  TensorCheckEqual(A1, A2);

  A1.add(3, 2);  // a = a * p1 + p2
  A2 = A2 * 3 + 2;
  TensorCheckEqual(A1, A2);

  testTensorNeg(A1, A2);
  testTensorAbs(A1, A2);
}
// Base ops on CPU matrices, real vectors, and int vectors; GPU variants are
// compiled only when CUDA support is enabled.
TEST(Unary, BaseOp) {
  TestUnaryMatrix<CpuMatrix> testCpuMatrix(testUnaryBaseOp<CpuMatrix>);
  TestUnaryVectorT<CpuVector, real> testCpuVector(testUnaryBaseOp<CpuVector>);
  TestUnaryVectorT<CpuIVector, int>
      testCpuIVector(testUnaryBaseOpInt<CpuIVector>);

#ifndef PADDLE_ONLY_CPU
  TestUnaryMatrix<GpuMatrix> testGpuMatrix(testUnaryBaseOp<GpuMatrix>);
  TestUnaryVectorT<GpuVector, real> testGpuVector(testUnaryBaseOp<GpuVector>);
  TestUnaryVectorT<GpuIVector, int>
      testGpuIVector(testUnaryBaseOpInt<GpuIVector>);
#endif
}
// exp2(): a = exp(a); checked with tolerance since results are floating-point.
template<typename Tensor>
void testTensorExp(Tensor& A1, Tensor& A2) {
  A1.exp2();  // a = exp(a)
  A2 = A2.exp();
  TensorCheckErr(A1, A2);
}

// log2(): a = log(a).
template<typename Tensor>
void testTensorLog(Tensor& A1, Tensor& A2) {
  A1.log2();  // a = log(a)
  A2 = A2.log();
  TensorCheckErr(A1, A2);
}

// sqrt2(): a = sqrt(a).
template<typename Tensor>
void testTensorSqrt(Tensor& A1, Tensor& A2) {
  A1.sqrt2();  // a = sqrt(a)
  A2 = A2.sqrt();
  TensorCheckErr(A1, A2);
}

// pow2(p): a = pow(a, p).
template<typename Tensor>
void testTensorPow(Tensor& A1, Tensor& A2) {
  A1.pow2(3.2);  // a = pow(a, p)
  A2 = A2.pow(3.2);
  TensorCheckErr(A1, A2);
}

// Driver for the unary math-op tests above.
// (Renamed from the misspelled "testUnayrMathOp"; all uses are local.)
template<typename Tensor>
void testUnaryMathOp(Tensor& A1, Tensor& A2) {
  testTensorExp(A1, A2);
  testTensorLog(A1, A2);
  testTensorSqrt(A1, A2);
  testTensorPow(A1, A2);
}

TEST(Unary, MathOp) {
  TestUnaryMatrix<CpuMatrix> testCpu(testUnaryMathOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU
  TestUnaryMatrix<GpuMatrix> testGpu(testUnaryMathOp<GpuMatrix>);
#endif
}
// clip(p1, p2): clamp every element into [p1, p2].
template<typename Tensor>
void testTensorClip(Tensor& A1, Tensor& A2) {
  real p1 = 0.003f;
  real p2 = 0.877f;
  A1.clip(p1, p2);  // a = a < p1 ? p1 : (a > p2 ? p2 : a)
  A2 = (A2 < p1).condition(p1, (A2 > p2).condition(p2, A2));
  TensorCheckEqual(A1, A2);
}

// biggerThanScalar(p): a = (a > p) ? 1 : 0.
template<typename Tensor>
void testTensorBiggerThanScalar(Tensor& A1, Tensor& A2) {
  real p = 0.5f;
  A1.biggerThanScalar(p);  // a = a > p ? 1.0f : 0.0f
  A2 = (A2 > p).condition((real)1.0, (real)0.0);
  TensorCheckEqual(A1, A2);
}

// applyL1(lr, dr): soft-threshold shrinkage with lambda = lr * dr:
//   a = (a > lambda) ? (a - lambda) : (a < -lambda) ? (a + lambda) : 0
template<typename Tensor>
void testTensorapplyL1(Tensor& A1, Tensor& A2) {
  real learningRate = 0.7f;
  real decayRate = 0.6f;
  A1.applyL1(learningRate, decayRate);
  A2 = (A2 > (learningRate * decayRate)).condition(
      (A2 - (learningRate * decayRate)),
      (A2 < -(learningRate * decayRate)).condition(
          (A2 + (learningRate * decayRate)), (real)0.0));
  TensorCheckEqual(A1, A2);
}

// Driver for the unary compare-op tests. The data is re-randomized and
// recentered around zero before the applyL1 test so that both shrinkage
// branches (positive and negative) are exercised.
// (Renamed from the misspelled "testUnayrCompareOp"; all uses are local.)
template<typename Tensor>
void testUnaryCompareOp(Tensor& A1, Tensor& A2) {
  testTensorClip(A1, A2);
  testTensorBiggerThanScalar(A1, A2);

  A1.randomizeUniform();
  A1.subScalar(0.5f);
  A2.copyFrom(A1);
  testTensorapplyL1(A1, A2);
}

TEST(Unary, CompareOp) {
  TestUnaryMatrix<CpuMatrix> testCpu(testUnaryCompareOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU
  TestUnaryMatrix<GpuMatrix> testGpu(testUnaryCompareOp<GpuMatrix>);
#endif
}
// Binary add family: add, scaled add, addScalar, addSquare, decayAddSquare.
template<typename Tensor>
void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.2;
  A1.add(B);  // a += b
  A2 += B;
  TensorCheckEqual(A1, A2);

  A1.add(B, p1);  // a += b * p
  A2 += B * p1;
  TensorCheckEqual(A1, A2);

  A1.add(B, p1, p2);  // a = p1 * a + p2 * b
  A2 = A2 * p1 + B * p2;
  TensorCheckEqual(A1, A2);

  A1.addScalar(B, p1);  // a = b + p
  A2 = B + p1;
  TensorCheckEqual(A1, A2);

  A1.addSquare(B, p1);  // a += p * b * b
  A2 += B.constant(p1) * B * B;
  TensorCheckEqual(A1, A2);

  A1.decayAddSquare(B, p1, p2);  // a = p1 * a + p2 * b * b
  A2 = A2 * p1 + B.constant(p2) * B * B;
  TensorCheckEqual(A1, A2);
}
template<typename Tensor>
void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B) {
real p = 2.5;
A1.sub(B); // a -= b
A2 -= B;
TensorCheckEqual(A1, A2);
A1.sub(B, p); // a -= b * p
A2 -= B * p;
TensorCheckEqual(A1, A2);
A1.subScalar(B, p); // a = b - p
A2 = B - p;
TensorCheckEqual(A1, A2);
}
template<typename Tensor>
void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B) {
real p = 2.5;
A1.mulScalar(B, p); // a = b * p
A2 = B * p;
TensorCheckEqual(A1, A2);
A1.dotMulSquare(B); // a *= b * b
A2 *= B * B;
TensorCheckEqual(A1, A2);
A1.dotSquareMul(B); // a = a * a * b
A2 = A2 * A2 * B;
TensorCheckEqual(A1, A2);
A1.dotMul(B); // a *= b
A2 *= B;
TensorCheckEqual(A1, A2);
}
template<typename Tensor>
void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B) {
real p = 2.5;
A1.divScalar(B, p); // a = b / p
A2 = B / p;
TensorCheckEqual(A1, A2);
A1.scalarDiv(B, p); // a = p / b
A2 = B.constant(p) / B;
TensorCheckEqual(A1, A2);
}
// assign(b): element-wise copy, cross-checked against operator=.
template<typename Tensor>
void testTensorAssign(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.assign(B);  // a = b
  A2 = B;
  TensorCheckEqual(A1, A2);
}
// square2(out): writes B * B into A1, cross-checked against B.square().
template<typename Tensor>
void testTensorSquare(Tensor& A1, Tensor& A2, Tensor& B) {
  B.square2(A1);  // b = a * a
  A2 = B.square();
  TensorCheckEqual(A1, A2);
}

// squareDerivative(b): a *= 2 * b.
template<typename Tensor>
void testTensorSquareDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.squareDerivative(B);  // a *= 2.0 * b
  A2 = A2 * (real)2.0 * B;
  TensorCheckEqual(A1, A2);
}

// reciprocal2 variants and the L2-decay helper applyL2(b, lr, dr).
template<typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B) {
  B.reciprocal2(A1);  // b = 1.0f / a
  A2 = B.reciprocal();
  TensorCheckEqual(A1, A2);

  real p1 = 0.58;
  real p2 = 0.32;
  A1.reciprocal2(B, p1, p2);  // a = 1 / (p1 * b + p2)
  A2 = (B * p1 + p2).reciprocal();
  TensorCheckEqual(A1, A2);

  real learningRate = 0.7f;
  real decayRate = 1.2f;
  A1.applyL2(B, learningRate, decayRate);  // a *= (1.0f / (1.0f + p * b))
  A2 *= (B.constant(1.0f) +
         B.constant(learningRate * decayRate) * B).reciprocal();
  TensorCheckEqual(A1, A2);
}

// reciprocalDerivative(b): a *= -b * b.
template<typename Tensor>
void testTensorReciprocalDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.reciprocalDerivative(B);  // a *= -b * b
  A2 *= (-B) * B;
  TensorCheckEqual(A1, A2);
}

// sign2(out): writes sign(B) into A1.
template<typename Tensor>
void testTensorSign(Tensor& A1, Tensor& A2, Tensor& B) {
  B.sign2(A1);  // b = a > 0.0f ? 1.0f : -1.0f
  A2 = B.sign();
  TensorCheckEqual(A1, A2);
}

// abs2(out): writes |B| into A1.
template<typename Tensor>
void testTensorAbs(Tensor& A1, Tensor& A2, Tensor& B) {
  B.abs2(A1);  // b = a > 0.0f ? a : -a
  A2 = B.abs();
  TensorCheckEqual(A1, A2);
}
// Driver for the binary base-op tests.
template<typename Tensor>
void testBinaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B) {
  testTensorAdd(A1, A2, B);
  testTensorSub(A1, A2, B);
  testTensorMul(A1, A2, B);
  testTensorDiv(A1, A2, B);
  testTensorSquare(A1, A2, B);
  testTensorSquareDerivative(A1, A2, B);
  testTensorReciprocal(A1, A2, B);
  testTensorReciprocalDerivative(A1, A2, B);
  testTensorAbs(A1, A2, B);
  testTensorSign(A1, A2, B);
  testTensorAssign(A1, A2, B);
}

TEST(Binary, BaseOp) {
  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryBaseOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU
  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryBaseOp<GpuMatrix>);
#endif
}
// exp2(b): A1 = exp(B).
template<typename Tensor>
void testTensorExp(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = exp(b)
  A1.exp2(B);
  A2 = B.exp();
  TensorCheckErr(A1, A2);
}

// expDerivative(b): a *= b (gradient of exp composed with upstream grad).
template<typename Tensor>
void testTensorExpDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.expDerivative(B);  // a *= b
  A2 *= B;
  TensorCheckEqual(A1, A2);
}

// log2(b): A1 = log(B).
template<typename Tensor>
void testTensorLog(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = log(b)
  A1.log2(B);
  A2 = B.log();
  TensorCheckErr(A1, A2);
}

// sqrt2(b): A1 = sqrt(B).
template<typename Tensor>
void testTensorSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = sqrt(b)
  A1.sqrt2(B);
  A2 = B.sqrt();
  TensorCheckErr(A1, A2);
}

// invSqrt(b): A1 = 1 / sqrt(B).
template<typename Tensor>
void testTensorInvSqrt(Tensor& A1, Tensor& A2, Tensor& B) {
  // a = 1.0f / sqrt(b)
  A1.invSqrt(B);
  A2 = B.sqrt().reciprocal();
  TensorCheckErr(A1, A2);
}

// pow2(b, p): A1 = pow(B, p).
template<typename Tensor>
void testTensorPow(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.pow2(B, 2.5f);  // a = pow(b, p)
  A2 = B.pow(2.5f);
  TensorCheckErr(A1, A2);
}
// softrelu: A1 = log(1 + exp(clamp(B, -40, 40))).
template<typename Tensor>
void testTensorSoftrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * const T THRESHOLD = 40.0;
   * b = log(1.0 +
   *         exp((a > THRESHOLD) ? THRESHOLD
   *                             : ((a < -THRESHOLD) ? (-THRESHOLD) : a)))
   */
  B.softrelu(A1);
  real THRESHOLD = 40.0;
  A2 = (B.constant(1.0f) +
      (B > THRESHOLD).condition(
          THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B)).exp()).log();
  TensorCheckErr(A1, A2);
}

// softreluDerivative: a *= 1 - exp(-clamp(b, -40, 40)).
template<typename Tensor>
void testTensorSoftreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * const T THRESHOLD = 40.0;
   * a *= (1.0 - exp(-1.0 * ((b > THRESHOLD)
   *                             ? THRESHOLD
   *                             : ((b < -THRESHOLD) ? (-THRESHOLD) : b)))));
   */
  A1.softreluDerivative(B);
  real THRESHOLD = 40.0;
  A2 = A2 * (B.constant(1.0f) -
             (B.constant(-1.0f) *
              (B > THRESHOLD).condition(
                  THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B))).exp());
  TensorCheckErr(A1, A2);
}

// sigmoid with input clamped to [-40, 13], written into A1.
template<typename Tensor>
void testTensorSigmoid(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * const T THRESHOLD_MIN = -40.0;
   * const T THRESHOLD_MAX = 13.0;
   * T tmp = (a < THRESHOLD_MIN) ? THRESHOLD_MIN
   *             : ((a > THRESHOLD_MAX) ? THRESHOLD_MAX : a);
   * b = 1.0f / (1.0f + exp(-tmp)))
   */
  B.sigmoid(A1);
  const real THRESHOLD_MIN = -40.0;
  const real THRESHOLD_MAX = 13.0;
  auto tmp = (B < THRESHOLD_MIN).condition(
      THRESHOLD_MIN, (B > THRESHOLD_MAX).condition(THRESHOLD_MAX, B));
  A2 = (B.constant(1.0f) + (-tmp).exp()).reciprocal();
  TensorCheckErr(A1, A2);
}

// sigmoidDerivative(b): a *= b * (1 - b).
template<typename Tensor>
void testTensorSigmoidDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.sigmoidDerivative(B);  // a *= b * (1 - b)
  A2 *= B * (B.constant(1.0f) - B);
  TensorCheckEqual(A1, A2);
}

// tanh via the 2/(1+exp(-2a)) - 1 formulation, written into A1.
template<typename Tensor>
void testTensorTanh(Tensor& A1, Tensor& A2, Tensor& B) {
  B.tanh(A1);  // b = 2.0 / (1.0 + exp(-2 * a)) - 1.0
  A2 = B.constant(2.0f) / ((B * ((real)-2.0f)).exp() + (real)1.0f) - (real)1.0f;
  TensorCheckErr(A1, A2);
}

// tanhDerivative(b): a *= 1 - b * b.
template<typename Tensor>
void testTensorTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.tanhDerivative(B);  // a *= 1 - b * b
  A2 *= B.constant(1.0f) - B * B;
  TensorCheckEqual(A1, A2);
}

// scaledTanh(p1, p2): A1 = p1 * tanh(p2 * B).
template<typename Tensor>
void testTensorScaledTanh(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.1;
  // b = p1 * (2.0 / (1.0 + exp(-2 * p2 * a)) - 1.0)
  B.scaledTanh(A1, p1, p2);
  A2 = B.constant(p1) *
       (B.constant(2.0f) / ((B.constant(-2.0f) * p2 * B).exp() + (real)1.0)
        - (real)1.0);
  TensorCheckErr(A1, A2);
}

// scaledTanhDerivative(b, p1, p2): a *= (p2 / p1) * (p1 * p1 - b * b).
template<typename Tensor>
void testTensorScaledTanhDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  real p1 = 2.5;
  real p2 = 3.1;
  // a *= (p2 / p1) * (p1 * p1 - b * b));
  A1.scaledTanhDerivative(B, p1, p2);
  A2 = A2 * (B.constant(p2 / p1) * (B.constant(p1 * p1) - B * B));
  TensorCheckEqual(A1, A2);
}
// Driver for the binary math-op tests.
template<typename Tensor>
void testBinaryMathOp(Tensor& A1, Tensor& A2, Tensor& B) {
  testTensorTanhDerivative(A1, A2, B);
  testTensorScaledTanhDerivative(A1, A2, B);
  testTensorSigmoidDerivative(A1, A2, B);
  testTensorExpDerivative(A1, A2, B);
  testTensorScaledTanh(A1, A2, B);
  testTensorTanh(A1, A2, B);
  testTensorExp(A1, A2, B);
  testTensorLog(A1, A2, B);
  testTensorSqrt(A1, A2, B);
  testTensorInvSqrt(A1, A2, B);
  testTensorPow(A1, A2, B);
  testTensorSoftrelu(A1, A2, B);
  testTensorSoftreluDerivative(A1, A2, B);
  testTensorSigmoid(A1, A2, B);
}

TEST(Binary, MathOp) {
  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryMathOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU
  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryMathOp<GpuMatrix>);
#endif
}
// relu: A1 = max(B, 0).
template<typename Tensor>
void testTensorRelu(Tensor& A1, Tensor& A2, Tensor& B) {
  B.relu(A1);  // b = a > 0.0f ? a : 0.0f
  A2 = (B > (real)0.0f).condition(B, (real)0.0f);
  TensorCheckEqual(A1, A2);
}

// reluDerivative(b): a *= (b > 0) ? 1 : 0.
template<typename Tensor>
void testTensorReluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.reluDerivative(B);  // a *= (b > 0.0f ? 1.0f : 0.0f)
  A2 *= (B > (real)0.0).condition((real)1.0, (real)0.0);
  TensorCheckEqual(A1, A2);
}

// brelu: clamp into [0, 24]; 32.0f is injected so the upper bound is hit.
template<typename Tensor>
void testTensorBrelu(Tensor& A1, Tensor& A2, Tensor& B) {
  /*
   * b = a > p1 ? a : p1
   * b = b < p2 ? b : p2
   * int p1 = 0, p2 = 24;
   */
  SetTensorValue(B, 32.0f);
  B.brelu(A1);
  auto tmp = (B > (real)0.0f).condition(B, (real)0.0f);
  A2 = (tmp < (real)24.0f).condition(tmp, (real)24.0f);
  TensorCheckEqual(A1, A2);
}

// breluDerivative(b): a *= (0 < b < 24) ? 1 : 0.
template<typename Tensor>
void testTensorBreluDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  SetTensorValue(B, 32.0f);
  /*
   * a *= (b > p1 && b < p2) ? 1.0 : 0.0
   * int p1 = 0, p2 = 24;
   */
  A1.breluDerivative(B);
  A2 *= (B > (real)0.0f && B < (real)24.0f).condition((real)1.0f, (real)0.0f);
  TensorCheckEqual(A1, A2);
}

// absDerivative(b): a = sign(b) * a (0 where b == 0).
template<typename Tensor>
void testTensorAbsDerivative(Tensor& A1, Tensor& A2, Tensor& B) {
  A1.absDerivative(B);  // a = (b > 0) ? a : (b < 0) ? -a : 0
  A2 = (B > (real)0.0f).condition(A2,
      (B < (real)0.0f).condition(-A2, (real)0.0f));
  TensorCheckEqual(A1, A2);
}

// isEqualTo(b, p): a = (b == p); p is injected into B so matches exist.
template<typename Tensor>
void testTensorIsEqualTo(Tensor& A1, Tensor& A2, Tensor& B) {
  real p = 0.613;
  SetTensorValue(B, p);
  A1.isEqualTo(B, p);  // a = (b == p)
  A2 = (B == p);
  TensorCheckEqual(A1, A2);
}
// applyL1(b, lr, dr): shrinkage with per-element lambda = (lr * dr) * b.
template<typename Tensor>
void testTensorapplyL1(Tensor& A1, Tensor& A2, Tensor& B) {
  /**
   * T lambda = p * b;
   * a = (a > lambda) ? (a - lambda)
   *                  : (a < -lambda) ? (a + lambda) : 0
   *
   * p = learningRate * decayRate;
   */
  real learningRate = 0.7f;
  real decayRate = 0.6f;
  A1.applyL1(B, learningRate, decayRate);
  auto lambda = B.constant(learningRate * decayRate) * B;
  A2 = (A2 > lambda).condition(
      (A2 - lambda), (A2 < -lambda).condition((A2 + lambda), (real)0.0f));
  TensorCheckEqual(A1, A2);
}

// Driver for binary compare ops. B is recentered around zero and seeded
// with exact 0.0f entries before the relu-derivative test.
template<typename Tensor>
void testBinaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B) {
  B.subScalar(0.5f);
  SetTensorValue(B, 0.0f);
  testTensorReluDerivative(A1, A2, B);

  A1.randomizeUniform();
  A2.copyFrom(A1);
  testTensorBreluDerivative(A1, A2, B);
  testTensorAbsDerivative(A1, A2, B);
  testTensorRelu(A1, A2, B);
  testTensorBrelu(A1, A2, B);
  testTensorIsEqualTo(A1, A2, B);
}

TEST(Binary, CompareOp) {
  TestBinaryMatrix<CpuMatrix> testCpu(testBinaryCompareOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU
  TestBinaryMatrix<GpuMatrix> testGpu(testBinaryCompareOp<GpuMatrix>);
#endif
}
// Ternary add family: add, scaled add, add2, decayAddSquareMul.
template<typename Tensor>
void testTensorAdd(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.add(B, C);  // a = b + c
  A2 = B + C;
  TensorCheckEqual(A1, A2);

  real p1 = 1.5;
  real p2 = 2.5;
  real p3 = 3.8;
  A1.add(B, p1, C, p2);  // a = p1 * b + p2 * c
  A2 = B * p1 + C * p2;
  TensorCheckEqual(A1, A2);

  A1.add2(B, C);  // a = a + b + c
  A2 = A2 + B + C;
  TensorCheckEqual(A1, A2);

  A1.add2(B, C, p1, p2, p3);  // a = p1 * a + p2 * b + p3 * c
  A2 = A2 * p1 + B * p2 + C * p3;
  TensorCheckEqual(A1, A2);

  A1.decayAddSquareMul(B, C, p1, p2);  // a = p1 * a + p2 * b * b * c * c
  A2 = A2 * p1 + B.constant(p2) * B * B * C * C;
  TensorCheckEqual(A1, A2);
}

// Ternary sub: a = b - c and a = p1 * b - p2 * c.
template<typename Tensor>
void testTensorSub(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.sub(B, C);  // a = b - c
  A2 = B - C;
  TensorCheckEqual(A1, A2);

  real p1 = 1.5;
  real p2 = 2.5;
  A1.sub(B, p1, C, p2);  // a = p1 * b - p2 * c
  A2 = B * p1 - C * p2;
  TensorCheckEqual(A1, A2);
}
// Ternary multiply family: dotMul, dotMulSquare, dotSquareSquare,
// dotMulSquareSum, dotSquareSum, dotMulSum, addDotMul.
template<typename Tensor>
void testTensorMul(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.dotMul(B, C);  // a = b * c
  A2 = B * C;
  TensorCheckEqual(A1, A2);

  A1.dotMulSquare(B, C);  // a = b * c * c
  A2 = B * C * C;
  TensorCheckEqual(A1, A2);

  A1.dotSquareSquare(B, C);  // a = b * b * c * c
  A2 = B * B * C * C;
  TensorCheckEqual(A1, A2);

  real p1 = 1.5;
  real p2 = 2.5;

  /*
   * T tmp = p1 * b + p2 * c;
   * a *= tmp * tmp
   */
  A1.dotMulSquareSum(B, C, p1, p2);
  auto tmp = B * p1 + C * p2;
  A2 *= tmp * tmp;
  TensorCheckEqual(A1, A2);

  /*
   * T tmp = p1 * b + p2 * c;
   * a = tmp * tmp
   */
  A1.dotSquareSum(B, C, p1, p2);
  auto tmp2 = B * p1 + C * p2;
  A2 = tmp2 * tmp2;
  TensorCheckEqual(A1, A2);

  // a *= p1 * b + p2 * c
  A1.dotMulSum(B, C, p1, p2);
  A2 *= B * p1 + C * p2;
  TensorCheckEqual(A1, A2);

  // a = p1 * a + p2 * b * c
  A1.addDotMul(B, C, p1, p2);
  A2 = A2 * p1 + B.constant(p2) * B * C;
  TensorCheckEqual(A1, A2);
}
// dotDiv: plain and offset element-wise division; in the two-argument form
// b == 0 maps to 0 instead of dividing.
template<typename Tensor>
void testTensorDiv(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.dotDiv(B, C);  // a = (b == 0.0) ? 0.0 : b / c
  A2 = (B == (real)0.0).condition((real)0.0, B / C);
  TensorCheckEqual(A1, A2);

  real p1 = 1.5;
  real p2 = 2.5;
  A1.dotDiv(B, C, p1, p2);  // a = (b + p1) / (c + p2)
  A2 = (B + p1) / (C + p2);
  TensorCheckEqual(A1, A2);
}

// reciprocalSum: a = 1 / (p1 * b + p2 * c + p3).
template<typename Tensor>
void testTensorReciprocal(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  real p1 = 1.5;
  real p2 = 2.5;
  real p3 = 3.5;
  A1.reciprocalSum(B, C, p1, p2, p3);  // a = 1 / (p1 * b + p2 * c + p3)
  A2 = (B * p1 + C * p2 + p3).reciprocal();
  TensorCheckEqual(A1, A2);
}

// softCrossEntropy: a = -c * log(b) - (1 - c) * log(1 - b).
template<typename Tensor>
void testTensorSoftCrossEntropy(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.softCrossEntropy(B, C);  // a = -c * log(b) - (1 - c) * log(1 - b)
  A2 = -C * B.log() - (C.constant(1.0f) - C) * (B.constant(1.0f) - B).log();
  TensorCheckErr(A1, A2);
}

// Backward pass of softCrossEntropy.
template<typename Tensor>
void testTensorSoftCrossEntropyBp(Tensor& A1,
                                  Tensor& A2,
                                  Tensor& B,
                                  Tensor& C) {
  A1.softCrossEntropyBp(B, C);  // a += (b - c) / (b * (1 - b))
  A2 += (B - C) / (B * (B.constant(1.0f) - B));
  TensorCheckEqual(A1, A2);
}

// Driver for the ternary base-op tests.
template<typename Tensor>
void testTernaryBaseOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  testTensorAdd(A1, A2, B, C);
  testTensorSub(A1, A2, B, C);
  testTensorMul(A1, A2, B, C);
  testTensorDiv(A1, A2, B, C);
  testTensorReciprocal(A1, A2, B, C);
  testTensorSoftCrossEntropyBp(A1, A2, B, C);
  testTensorSoftCrossEntropy(A1, A2, B, C);
}

TEST(Ternary, BaseOp) {
  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryBaseOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU
  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryBaseOp<GpuMatrix>);
#endif
}
// binaryLabelCrossEntropy: a = c > 0.5 ? -log(b) : -log(1 - b).
template<typename Tensor>
void testTensorBinaryLabelCrossEntropy(Tensor& A1,
                                       Tensor& A2,
                                       Tensor& B,
                                       Tensor& C) {
  A1.binaryLabelCrossEntropy(B, C);  // a = c > 0.5 ? -log(b) : -log(1.0 - b)
  A2 = (C > (real)0.5).condition(
      -(B.log()), -((B.constant(1.0f) - B).log()));
  TensorCheckErr(A1, A2);
}

// Backward pass of binaryLabelCrossEntropy.
template<typename Tensor>
void testTensorBinaryLabelCrossEntropyBp(Tensor& A1,
                                         Tensor& A2,
                                         Tensor& B,
                                         Tensor& C) {
  // a += c > 0.5 ? -1.0 / b : 1.0 / (1.0 - b)
  A1.binaryLabelCrossEntropyBp(B, C);
  A2 += (C > (real)0.5).condition(
      (B.constant(-1.0f) / B), (B.constant(1.0f) - B).reciprocal());
  TensorCheckErr(A1, A2);
}

// logisticRegressionLoss with input clamped to [-40, 40]; values beyond
// both thresholds are injected into B so the clamping branches are hit.
template<typename Tensor>
void testTensorLogisticRegressionLoss(Tensor& A1,
                                      Tensor& A2,
                                      Tensor& B,
                                      Tensor& C) {
  SetTensorValue(B, 50.0f);
  SetTensorValue(B, -50.0f);
  /**
   * const T THRESHOLD = 40.0;
   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
   *                                         ? -THRESHOLD
   *                                         : b;
   * a = log(1 + exp(x)) - c * x
   */
  A1.logisticRegressionLoss(B, C);
  real THRESHOLD = 40.0;
  auto tmp = (B > THRESHOLD).condition(
      THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
  A2 = (C.constant(1.0f) + tmp.exp()).log() - C * tmp;
  TensorCheckErr(A1, A2);
}

// Backward pass of logisticRegressionLoss.
template<typename Tensor>
void testTensorLogisticRegressionLossBp(Tensor& A1,
                                        Tensor& A2,
                                        Tensor& B,
                                        Tensor& C) {
  SetTensorValue(B, 50.0f);
  SetTensorValue(B, -50.0f);
  /**
   * const T THRESHOLD = 40.0;
   * T x = (b > THRESHOLD) ? THRESHOLD : (b < -THRESHOLD)
   *                                         ? -THRESHOLD
   *                                         : b;
   * x = exp(x); a = x / (1 + x) - c
   */
  A1.logisticRegressionLossBp(B, C);
  real THRESHOLD = 40.0;
  auto tmp = (B > THRESHOLD).condition(
      THRESHOLD, (B < -THRESHOLD).condition(-THRESHOLD, B));
  auto tmp2 = tmp.exp();
  A2 = tmp2 / (C.constant(1.0) + tmp2) - C;
  TensorCheckErr(A1, A2);
}

// biggerThan: a = (b > c) ? 1 : 0.
template<typename Tensor>
void testTensorBiggerThan(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.biggerThan(B, C);  // a = (b > c) ? 1.0f : 0.0f
  A2 = (B > C).condition((real)1.0f, (real)0.0f);
  TensorCheckEqual(A1, A2);
}

// max2: element-wise maximum of b and c.
template<typename Tensor>
void testTensorMax(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  A1.max2(B, C);  // a = (b > c) ? b : c
  A2 = (B > C).condition(B, C);
  TensorCheckEqual(A1, A2);
}

// Driver for the ternary compare-op tests.
template<typename Tensor>
void testTernaryCompareOp(Tensor& A1, Tensor& A2, Tensor& B, Tensor& C) {
  testTensorBinaryLabelCrossEntropyBp(A1, A2, B, C);
  testTensorBinaryLabelCrossEntropy(A1, A2, B, C);
  testTensorBiggerThan(A1, A2, B, C);
  testTensorMax(A1, A2, B, C);
  testTensorLogisticRegressionLoss(A1, A2, B, C);
  testTensorLogisticRegressionLossBp(A1, A2, B, C);
}

TEST(Ternary, CompareOp) {
  TestTernaryMatrix<CpuMatrix> testCpu(testTernaryCompareOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU
  TestTernaryMatrix<GpuMatrix> testGpu(testTernaryCompareOp<GpuMatrix>);
#endif
}
// addSquareSum: a += (p1 * b + p2 * c + p3 * d)^2.
template<typename Tensor>
void testQuaternaryAdd(Tensor& A1,
                       Tensor& A2,
                       Tensor& B,
                       Tensor& C,
                       Tensor& D) {
  // The add3 check below is disabled (left as-is from the original source):
  // A1.add3(B, C, D, 1.5f, 2.5f, 3.5f);  // a = p1 * b + p2 * c + p3 * d
  // A2 = B * 1.5f + C * 2.5f + D * 3.5f;
  // TensorCheckEqual(A1, A2);

  /*
   * T tmp = p1 * b + p2 * c + p3 * d;
   * a += tmp * tmp
   */
  real p1 = 1.5f;
  real p2 = 2.5f;
  real p3 = 3.5f;
  A1.addSquareSum(B, C, D, p1, p2, p3);
  auto tmp = B * p1 + C * p2 + D * p3;
  A2 += tmp * tmp;
  TensorCheckEqual(A1, A2);
}

TEST(Quaternary, BaseOp) {
  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryAdd<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU
  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryAdd<GpuMatrix>);
#endif
}
// Quaternary biggerThan:
// a = ((b > c && d > 0.5) || (b < c && d < 0.5)) ? 1 : 0.
template<typename Tensor>
void testTensorBiggerThan(Tensor& A1,
                          Tensor& A2,
                          Tensor& B,
                          Tensor& C,
                          Tensor& D) {
  // a = ((b > c && d > 0.5f) || (b < c && d < 0.5f)) ? 1.0f : 0.0f);
  A1.biggerThan(B, C, D);
  A2 = ((B > C && D > (real)0.5)
        || (B < C && D < (real)0.5)).condition((real)1.0, (real)0.0);
  TensorCheckEqual(A1, A2);
}

// rankLoss: pairwise ranking loss on the clamped score difference b - c.
template<typename Tensor>
void testTensorRankLoss(Tensor& A1,
                        Tensor& A2,
                        Tensor& B,
                        Tensor& C,
                        Tensor& D) {
  /**
   * const T THRESHOLD = 40.0; a = b - c;
   * a = (a > THRESHOLD)
   *         ? THRESHOLD
   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
   * a = log(1 + exp(a)) - a * d
   */
  A1.rankLoss(B, C, D);
  real THRESHOLD = 40.0;
  auto tmp = B - C;
  auto tmp2 = (tmp > THRESHOLD).condition(
      THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
  A2 = (D.constant(1.0f) + tmp2.exp()).log() - tmp2 * D;
  TensorCheckErr(A1, A2);
}

// Backward pass of rankLoss.
template<typename Tensor>
void testTensorRankLossBp(Tensor& A1,
                          Tensor& A2,
                          Tensor& B,
                          Tensor& C,
                          Tensor& D) {
  /**
   * const T THRESHOLD = 40.0; a = b - c;
   * a = (a > THRESHOLD)
   *         ? THRESHOLD
   *         : ((a < -THRESHOLD) ? (-THRESHOLD) : a);
   * a = exp(a); a = (a / (1 + a) - d)
   */
  A1.rankLossBp(B, C, D);
  real THRESHOLD = 40.0;
  auto tmp = B - C;
  auto tmp2 = (tmp > THRESHOLD).condition(
      THRESHOLD, (tmp < -THRESHOLD).condition(-THRESHOLD, tmp));
  auto tmp3 = tmp2.exp();
  A2 = tmp3 / (D.constant(1.0f) + tmp3) - D;
  TensorCheckErr(A1, A2);
}

// Driver for the quaternary compare-op tests.
template<typename Tensor>
void testQuaternaryCompareOp(Tensor& A1,
                             Tensor& A2,
                             Tensor& B,
                             Tensor& C,
                             Tensor& D) {
  testTensorBiggerThan(A1, A2, B, C, D);
  testTensorRankLoss(A1, A2, B, C, D);
  testTensorRankLossBp(A1, A2, B, C, D);
}

TEST(Quaternary, CompareOp) {
  TestQuaternaryMatrix<CpuMatrix> testCpu(testQuaternaryCompareOp<CpuMatrix>);
#ifndef PADDLE_ONLY_CPU
  TestQuaternaryMatrix<GpuMatrix> testGpu(testQuaternaryCompareOp<GpuMatrix>);
#endif
}
// Test entry point: initializes gtest and the HL (GPU) runtime before
// running all tests. The 0 passed to hl_init is presumably the device
// id — confirm against the hl_init declaration.
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  hl_start();
  hl_init(0);
  return RUN_ALL_TESTS();
}
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/utils/Util.h"
#include "paddle/math/TrainingAlgorithmOp.h"
#include "OriginalOptimizerApi.h"
#include "TensorCheck.h"
#include "PerfUtils.h"
using namespace paddle;  // NOLINT

// Element-wise tolerance used by VectorCheckErr below. Double builds
// use a much tighter bound since they carry far less rounding error.
#ifndef PADDLE_TYPE_DOUBLE
P_DEFINE_double(max_diff, 1e-5, "max diff allowed");
#else
P_DEFINE_double(max_diff, 1e-13, "max diff allowed");
#endif
// RAII guard that temporarily overrides FLAGS_max_diff within a scope
// and restores the previous value when the guard is destroyed.
class SetMaxDiff {
public:
  // max_diff: the tolerance to install while this guard is alive.
  explicit SetMaxDiff(double max_diff) {
    max_diff_ = FLAGS_max_diff;  // remember the previous tolerance
    FLAGS_max_diff = max_diff;
  }
  ~SetMaxDiff() { FLAGS_max_diff = max_diff_; }

  // Copying a scope guard would restore the flag more than once;
  // forbid copy and assignment.
  SetMaxDiff(const SetMaxDiff&) = delete;
  SetMaxDiff& operator=(const SetMaxDiff&) = delete;

private:
  double max_diff_;  // saved value of FLAGS_max_diff
};
// Makes cpuVec a CPU-resident view of `vector`: GPU vectors are copied
// into a freshly allocated CPU vector, CPU vectors are aliased as-is.
// do { } while (0) makes the macro a single statement, safe in
// unbraced if/else bodies.
#define COPY_VECTOR_TO_CPU(cpuVec, vector)            \
  do {                                                \
    if (vector->useGpu()) {                           \
      cpuVec = Vector::create(vector->getSize(), false); \
      cpuVec->copyFrom(*vector);                      \
    } else {                                          \
      cpuVec = vector;                                \
    }                                                 \
  } while (0)
// Counts elements at which vector1 and vector2 differ by more than
// FLAGS_max_diff absolutely AND by more than FLAGS_max_diff / 10
// relatively (relative to |a|). Returns the number of offending
// elements; 0 means the vectors agree within tolerance.
//
// The two-stage check avoids flagging large magnitudes whose absolute
// error exceeds the bound while their relative error is still tiny.
int VectorCheckErr(const Vector& vector1, const Vector& vector2) {
  CHECK(vector1.getSize() == vector2.getSize());

  const real* data1 = vector1.getData();
  const real* data2 = vector2.getData();
  size_t size = vector1.getSize();
  int count = 0;
  // Hoist the loop-invariant relative tolerance out of the loop.
  const double relativeTol = FLAGS_max_diff / 10.0f;
  for (size_t i = 0; i < size; i++) {
    real a = data1[i];
    real b = data2[i];
    if (fabs(a - b) > FLAGS_max_diff) {
      // Use fabs consistently: the previous fabsf truncated to float
      // even when `real` is double, weakening the relative check.
      if ((fabs(a - b) / fabs(a)) > relativeTol) {
        count++;
      }
    }
  }
  return count;
}
// Overload for VectorPtr: stages GPU vectors through CPU copies (via
// COPY_VECTOR_TO_CPU) so both operands can be compared element-wise on
// the host, then delegates to the Vector& overload above.
int VectorCheckErr(const VectorPtr& vector1, const VectorPtr& vector2) {
  VectorPtr tmp1;
  VectorPtr tmp2;
  COPY_VECTOR_TO_CPU(tmp1, vector1);
  COPY_VECTOR_TO_CPU(tmp2, vector2);
  return VectorCheckErr(*tmp1, *tmp2);
}
// Asserts the two vectors agree within tolerance — but only when the
// performance timer is disabled. When timing is enabled (the #else
// branch) the macro expands to nothing, presumably so the comparison
// cost does not pollute the EXPRESSION_PERFORMANCE measurements.
#ifdef PADDLE_DISABLE_TIMER
#define CHECK_VECTORPTR(vector1, vector2) \
  EXPECT_EQ(VectorCheckErr(vector1, vector2), 0)
#else
#define CHECK_VECTORPTR(vector1, vector2)
#endif
typedef std::function<void(size_t size, bool useGpu)> testMatrixFunc;

// Sweeps matrixFunc over a wide range of vector sizes (1 element up to
// ~2M) on CPU, and also on GPU when built with GPU support. Note the
// #ifdef selects which for-statement header is compiled; the loop body
// below is shared by both branches.
void testCase(testMatrixFunc matrixFunc) {
#ifndef PADDLE_ONLY_CPU
  for (auto useGpu : {false, true}) {
#else
  for (auto useGpu : {false}) {
#endif
    for (auto size : {1,
                      32,
                      64,
                      128,
                      512,
                      1024,
                      4096,
                      32768,
                      65536,
                      131072,
                      262144,
                      524288,
                      1048576,
                      2097152}) {
      LOG(INFO) << " size=" << size << " useGpu=" << useGpu;
      matrixFunc(size, useGpu);
    }
  }
}
// Creates vec1[type] and vec2[type] with the given size/device, fills
// vec1 with random values and copies them into vec2 so both buffers
// start from identical contents.
//
// Wrapped in do { } while (0) — matching COPY_VECTOR_TO_CPU above — so
// the multi-statement macro expands to a single statement and stays
// safe inside unbraced if/else bodies.
#define INIT_VECTOR(vec1, vec2, type, size, useGpu) \
  do {                                              \
    vec1[type] = Vector::create(size, useGpu);      \
    vec2[type] = Vector::create(size, useGpu);      \
    vec1[type]->rand();                             \
    vec2[type]->copyFrom(*vec1[type]);              \
  } while (0)
// Cross-checks adagradApply() (TrainingAlgorithmOp) against the
// reference AdagradParameterOptimizer: bufs1 is updated by the
// reference path, bufs2 by the new path, both starting from identical
// random contents, then the updated buffers are compared.
void testAdagrad(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);

  // Hyper-parameters drawn uniformly from [0, 1].
  real epsilon = (real)rand() / (real)RAND_MAX;        // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;   // NOLINT
  real momentum = (real)rand() / (real)RAND_MAX;       // NOLINT
  real decayRate = (real)rand() / (real)RAND_MAX;      // NOLINT

  // Reference implementation updates bufs1 in place.
  EXPRESSION_PERFORMANCE(AdagradParameterOptimizer(
      bufs1, epsilon, learningRate, momentum, decayRate));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& accum_buffer = *bufs2[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];

  // New implementation updates bufs2 in place.
  EXPRESSION_PERFORMANCE(adagradApply(value,
                                      grad,
                                      mom,
                                      accum_buffer,
                                      accum,
                                      lr,
                                      epsilon,
                                      learningRate,
                                      momentum,
                                      decayRate));

  // Compare every buffer both paths are expected to modify.
  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
                  bufs2[PARAMETER_LEARNING_RATE]);
}

TEST(Training, Adagrad) { testCase(testAdagrad); }
// Cross-checks adadeltaApply() against the reference
// AdaDeltaParameterOptimizer on identically-initialized buffers
// (bufs1 = reference path, bufs2 = new path).
void testAdaDelta(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);

  // Hyper-parameters drawn uniformly from [0, 1].
  real rou = (real)rand() / (real)RAND_MAX;            // NOLINT
  real epsilon = (real)rand() / (real)RAND_MAX;        // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;   // NOLINT
  real momentum = (real)rand() / (real)RAND_MAX;       // NOLINT
  real decayRate = (real)rand() / (real)RAND_MAX;      // NOLINT

  EXPRESSION_PERFORMANCE(AdaDeltaParameterOptimizer(
      bufs1, rou, epsilon, learningRate, momentum, decayRate));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& accum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& accum_update = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];

  EXPRESSION_PERFORMANCE(adadeltaApply(value,
                                       grad,
                                       mom,
                                       accum,
                                       accum_update,
                                       lr,
                                       rou,
                                       epsilon,
                                       learningRate,
                                       momentum,
                                       decayRate));

  // Compare every buffer both paths are expected to modify.
  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
                  bufs2[PARAMETER_LEARNING_RATE]);
}

TEST(Training, AdaDelta) { testCase(testAdaDelta); }
// Cross-checks rmspropApply() against the reference
// RMSPropParameterOptimizer on identically-initialized buffers.
// isFirstTime is a compile-time flag exercising both the first-update
// and steady-state code paths of the optimizer.
template <bool isFirstTime>
void testRMSProp(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM1, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);

  /* make sure 'g - f.square()' greater than 0 */
  // Shift the squared-sum buffer up by 1 (in both copies) so the
  // variance estimate inside RMSProp stays positive.
  bufs1[PARAMETER_GRADIENT_SQURESUM]->add(1.0);
  bufs2[PARAMETER_GRADIENT_SQURESUM]->copyFrom(
      *bufs1[PARAMETER_GRADIENT_SQURESUM]);

  // Hyper-parameters drawn uniformly from [0, 1].
  real rou = (real)rand() / (real)RAND_MAX;            // NOLINT
  real epsilon = (real)rand() / (real)RAND_MAX;        // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;   // NOLINT
  real momentum = (real)rand() / (real)RAND_MAX;       // NOLINT
  real decayRate = (real)rand() / (real)RAND_MAX;      // NOLINT
  real accumulatedRou = rou;

  EXPRESSION_PERFORMANCE(RMSPropParameterOptimizer(bufs1,
                                                   accumulatedRou,
                                                   rou,
                                                   epsilon,
                                                   learningRate,
                                                   momentum,
                                                   decayRate,
                                                   isFirstTime));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& sum1 = *bufs2[PARAMETER_GRADIENT_SQURESUM1];
  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];

  EXPRESSION_PERFORMANCE(rmspropApply(value,
                                      grad,
                                      mom,
                                      sum,
                                      sum1,
                                      lr,
                                      accumulatedRou,
                                      rou,
                                      epsilon,
                                      learningRate,
                                      momentum,
                                      decayRate,
                                      isFirstTime));

  // Compare every buffer both paths are expected to modify.
  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM1],
                  bufs2[PARAMETER_GRADIENT_SQURESUM1]);
  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
                  bufs2[PARAMETER_LEARNING_RATE]);
}

// Exercise both the first-update and steady-state template paths.
TEST(Training, RMSProp) {
  testCase(testRMSProp<true>);
  testCase(testRMSProp<false>);
}
// Cross-checks decayedAdagradApply() against the reference
// DecayedAdagradParameterOptimizer on identically-initialized buffers.
// isFirstTime selects the first-update path, which starts from a
// zeroed squared-sum accumulator.
template <bool isFirstTime>
void testDecayedAdagrad(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT_SQURESUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_LEARNING_RATE, size, useGpu);

  // Hyper-parameters drawn uniformly from [0, 1].
  real rou = (real)rand() / (real)RAND_MAX;            // NOLINT
  real epsilon = (real)rand() / (real)RAND_MAX;        // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;   // NOLINT
  real momentum = (real)rand() / (real)RAND_MAX;       // NOLINT
  real decayRate = (real)rand() / (real)RAND_MAX;      // NOLINT
  real accumulatedRou = rou;

  if (isFirstTime) {
    // First update starts from an empty accumulator in both copies.
    bufs1[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
    bufs2[PARAMETER_GRADIENT_SQURESUM]->zeroMem();
  }

  EXPRESSION_PERFORMANCE(DecayedAdagradParameterOptimizer(bufs1,
                                                          accumulatedRou,
                                                          rou,
                                                          epsilon,
                                                          learningRate,
                                                          momentum,
                                                          decayRate,
                                                          isFirstTime));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& sum = *bufs2[PARAMETER_GRADIENT_SQURESUM];
  BaseMatrix& lr = *bufs2[PARAMETER_LEARNING_RATE];

  EXPRESSION_PERFORMANCE(decayedAdagradApply(value,
                                             grad,
                                             mom,
                                             sum,
                                             lr,
                                             accumulatedRou,
                                             rou,
                                             epsilon,
                                             learningRate,
                                             momentum,
                                             decayRate,
                                             isFirstTime));

  // Compare every buffer both paths are expected to modify.
  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_GRADIENT_SQURESUM],
                  bufs2[PARAMETER_GRADIENT_SQURESUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_LEARNING_RATE],
                  bufs2[PARAMETER_LEARNING_RATE]);
}

// Exercise both the steady-state and first-update template paths.
TEST(Training, DecayedAdagrad) {
  testCase(testDecayedAdagrad<false>);
  testCase(testDecayedAdagrad<true>);
}
// Cross-checks adamApply() (TrainingAlgorithmOp) against the reference
// AdamParameterOptimizer implementation: both run on buffers that start
// with identical random contents, then the results are compared.
void testAdam(size_t size, bool useGpu) {
  VectorPtr baseline[NUM_PARAMETER_TYPES];
  VectorPtr candidate[NUM_PARAMETER_TYPES];
  INIT_VECTOR(baseline, candidate, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(baseline, candidate, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(baseline, candidate, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(baseline, candidate, PARAMETER_SECOND_MOMENTUM, size, useGpu);

  // Draw every hyper-parameter uniformly from [0, 1].
  auto uniform = [] { return (real)rand() / (real)RAND_MAX; };  // NOLINT
  real beta1 = uniform();
  real beta2 = uniform();
  real beta1_power = uniform();
  real beta2_power = uniform();
  real epsilon = uniform();
  real learningRate = uniform();

  // Reference implementation updates `baseline` in place.
  EXPRESSION_PERFORMANCE(AdamParameterOptimizer(
      baseline, beta1, beta2, beta1_power, beta2_power, epsilon,
      learningRate));

  BaseMatrix& paramValue = *candidate[PARAMETER_VALUE];
  BaseMatrix& paramGrad = *candidate[PARAMETER_GRADIENT];
  BaseMatrix& firstMoment = *candidate[PARAMETER_MOMENTUM];
  BaseMatrix& secondMoment = *candidate[PARAMETER_SECOND_MOMENTUM];

  // New implementation updates `candidate` in place.
  EXPRESSION_PERFORMANCE(adamApply(paramValue,
                                   paramGrad,
                                   firstMoment,
                                   secondMoment,
                                   beta1,
                                   beta2,
                                   beta1_power,
                                   beta2_power,
                                   epsilon,
                                   learningRate));

  // Compare every buffer both paths are expected to modify.
  CHECK_VECTORPTR(baseline[PARAMETER_VALUE], candidate[PARAMETER_VALUE]);
  CHECK_VECTORPTR(baseline[PARAMETER_MOMENTUM],
                  candidate[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(baseline[PARAMETER_SECOND_MOMENTUM],
                  candidate[PARAMETER_SECOND_MOMENTUM]);
}

TEST(Training, Adam) { testCase(testAdam); }
// Cross-checks adamaxApply() against the reference
// AdamaxParameterOptimizer on identically-initialized buffers.
void testAdamax(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_WEIGHTED_INFINITY_NORM, size, useGpu);

  // Hyper-parameters drawn uniformly from [0, 1]; step is fixed at 2.
  real beta1 = (real)rand() / (real)RAND_MAX;  // NOLINT
  real beta2 = (real)rand() / (real)RAND_MAX;  // NOLINT
  real alpha = (real)rand() / (real)RAND_MAX;  // NOLINT
  int64_t step = 2;

  EXPRESSION_PERFORMANCE(
      AdamaxParameterOptimizer(bufs1, beta1, beta2, step, alpha));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& mom = *bufs2[PARAMETER_MOMENTUM];
  BaseMatrix& u = *bufs2[PARAMETER_WEIGHTED_INFINITY_NORM];

  EXPRESSION_PERFORMANCE(
      adamaxApply(value, grad, mom, u, beta1, beta2, step, alpha));

  // Compare every buffer both paths are expected to modify.
  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM], bufs2[PARAMETER_MOMENTUM]);
  CHECK_VECTORPTR(bufs1[PARAMETER_WEIGHTED_INFINITY_NORM],
                  bufs2[PARAMETER_WEIGHTED_INFINITY_NORM]);
}

TEST(Training, Adamax) {
// Single-precision builds need a looser tolerance for this optimizer;
// the guard restores FLAGS_max_diff when the test ends.
#ifndef PADDLE_TYPE_DOUBLE
  SetMaxDiff diff(1e-4);
#endif
  testCase(testAdamax);
}
// Cross-checks sparseMomentumApply() against the reference
// SparseMomentumParameterOptimizer on identically-initialized buffers.
void testSparseMomentum(size_t size, bool useGpu) {
  VectorPtr bufs1[NUM_PARAMETER_TYPES];
  VectorPtr bufs2[NUM_PARAMETER_TYPES];
  INIT_VECTOR(bufs1, bufs2, PARAMETER_VALUE, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_GRADIENT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_UT, size, useGpu);
  INIT_VECTOR(bufs1, bufs2, PARAMETER_MOMENTUM_VT, size, useGpu);

  // Hyper-parameters drawn uniformly from [0, 1].
  real alpha = (real)rand() / (real)RAND_MAX;          // NOLINT
  real beta = (real)rand() / (real)RAND_MAX;           // NOLINT
  real gamma = (real)rand() / (real)RAND_MAX;          // NOLINT
  real tau = (real)rand() / (real)RAND_MAX;            // NOLINT
  real learningRate = (real)rand() / (real)RAND_MAX;   // NOLINT

  EXPRESSION_PERFORMANCE(SparseMomentumParameterOptimizer(
      bufs1, alpha, beta, gamma, tau, learningRate));

  BaseMatrix& value = *bufs2[PARAMETER_VALUE];
  BaseMatrix& grad = *bufs2[PARAMETER_GRADIENT];
  BaseMatrix& momU = *bufs2[PARAMETER_MOMENTUM_UT];
  BaseMatrix& momV = *bufs2[PARAMETER_MOMENTUM_VT];

  EXPRESSION_PERFORMANCE(sparseMomentumApply(
      value, grad, momU, momV, alpha, beta, gamma, tau, learningRate));

  // Compare every buffer both paths are expected to modify.
  CHECK_VECTORPTR(bufs1[PARAMETER_VALUE], bufs2[PARAMETER_VALUE]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_UT], bufs2[PARAMETER_MOMENTUM_UT]);
  CHECK_VECTORPTR(bufs1[PARAMETER_MOMENTUM_VT], bufs2[PARAMETER_MOMENTUM_VT]);
}

TEST(Training, SparseMomentum) { testCase(testSparseMomentum); }
// Test entry point: initializes gtest, PaddlePaddle globals (initMain)
// and the HL (GPU) runtime on the device selected by FLAGS_gpu_id,
// then runs all tests.
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  initMain(argc, argv);
  hl_start();
  hl_init(FLAGS_gpu_id);
  return RUN_ALL_TESTS();
}
/* Copyright (c) 2016 Baidu, Inc. All Rights Reserve.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <gtest/gtest.h>
#include "paddle/math/Matrix.h"
#include "paddle/math/TensorAssign.h"
#include "TensorCheck.h"
#include "PerfUtils.h"
using paddle::BaseMatrix;
using paddle::CpuMatrix;
using paddle::GpuMatrix;
using autotest::TensorCheckEqual;
using autotest::TensorCheckErr;
typedef std::function<void(int height, int width)> testMatrixFunc;

// Invokes matrixFunc for height 1 across a sweep of widths ranging
// from a single element up to ~8M elements.
void testMatrixCase(testMatrixFunc matrixFunc) {
  const int heights[] = {1};
  const int widths[] = {1,      32,      64,      128,     512,    1024,
                        4096,   32768,   65536,   131072,  262144, 524288,
                        1048576, 2097152, 4194304, 8388608};
  for (int height : heights) {
    for (int width : widths) {
      matrixFunc(height, width);
    }
  }
}
// Compares eager tensor assignment (A1 = B + C; A1 = A1 * D) against
// the lazyAssign/AssignEvaluate path on identically-initialized
// operands, timing both via EXPRESSION_PERFORMANCE. Note the macro
// arguments are whole statement sequences, including the trailing
// semicolons.
template<typename Tensor>
void testLazyAssign(int height, int width) {
  Tensor A1(height, width);
  Tensor A2(height, width);
  Tensor B(height, width);
  Tensor C(height, width);
  Tensor D(height, width);
  A1.randomizeUniform();
  B.randomizeUniform();
  C.randomizeUniform();
  D.randomizeUniform();
  A2.copyFrom(A1);  // A1 and A2 start identical

  // Eager path: two immediate assignments.
  EXPRESSION_PERFORMANCE(A1 = B + C; A1 = A1 * D;);

  // Lazy path: build both expressions, then evaluate them together.
  EXPRESSION_PERFORMANCE(
      auto expr1 = A2.lazyAssign(B + C);
      auto expr2 = A2.lazyAssign(A2 * D);
      AssignEvaluate(expr1, expr2););

  TensorCheckErr(A1, A2);
}
// Run the lazy-assignment comparison on CPU matrices, and on GPU
// matrices when built with GPU support.
TEST(lazyAssign, CPU) {
  testMatrixCase(testLazyAssign<CpuMatrix>);
}

#ifndef PADDLE_ONLY_CPU
TEST(lazyAssign, GPU) {
  testMatrixCase(testLazyAssign<GpuMatrix>);
}
#endif
// Eager tensor-expression form of the SGD update:
//   c = p2 * c - p1 * (b + p3 * a);  a = a + c;
// Mirrors BaseMatrix::sgdUpdate (see testSgdUpdate below).
template<typename Tensor>
void sgdUpdateTensor(Tensor& A, Tensor& B, Tensor& C, Tensor& D,
                     real p1, real p2, real p3) {
  C = C * p2 - D * (B + A * p3) * p1;
  A += C;
}
// Lazy-evaluation form of the same SGD update: both assignments are
// built as expressions first, then evaluated together by
// AssignEvaluate. expr1 must precede expr2 since A += C reads the
// freshly updated C.
void sgdUpdateLazyAssign(BaseMatrix& A, BaseMatrix& B,
                         BaseMatrix& C, BaseMatrix& D,
                         real p1, real p2, real p3) {
  auto expr1 = C.lazyAssign(C * p2 - D * (B + A * p3) * p1);
  auto expr2 = A.lazyAssign(A + C);
  AssignEvaluate(expr1, expr2);
}
// Runs the SGD update through three equivalent code paths — the
// BaseMatrix::sgdUpdate API (A1/C1), eager tensor expressions (A2/C2),
// and lazy assignment (A3/C3) — on identically-initialized inputs,
// timing each, and verifies that all three agree.
template<typename Tensor>
void testSgdUpdate(int height, int width) {
  Tensor A1(height, width);
  Tensor A2(height, width);
  Tensor A3(height, width);
  A1.randomizeUniform();
  A2.copyFrom(A1);
  A3.copyFrom(A1);

  Tensor B(height, width);
  B.randomizeUniform();

  Tensor C1(height, width);
  Tensor C2(height, width);
  Tensor C3(height, width);
  C1.randomizeUniform();
  C2.copyFrom(C1);
  C3.copyFrom(C1);

  Tensor D(height, width);
  D.randomizeUniform();

  // Fixed hyper-parameters shared by all three paths.
  real p1 = 0.2;
  real p2 = 0.3;
  real p3 = 0.5;

  /**
   * c = p2 * c - p1 * (b + p3 * a);
   * a = a + c;
   */
  // BaseMatrix API
  EXPRESSION_PERFORMANCE(
      A1.sgdUpdate(B, C1, D, p1, p2, p3););

  // Tensor expression
  EXPRESSION_PERFORMANCE(
      sgdUpdateTensor(A2, B, C2, D, p1, p2, p3));

  // lazyAssign
  EXPRESSION_PERFORMANCE(
      sgdUpdateLazyAssign(A3, B, C3, D, p1, p2, p3));

  // All three paths must produce matching parameters and momenta.
  TensorCheckErr(A1, A2);
  TensorCheckErr(A1, A3);
  TensorCheckErr(C1, C2);
  TensorCheckErr(C1, C3);
}
// Run the three-way SGD comparison on CPU matrices, and on GPU
// matrices when built with GPU support.
TEST(sgdUpdate, CPU) {
  testMatrixCase(testSgdUpdate<CpuMatrix>);
}

#ifndef PADDLE_ONLY_CPU
TEST(sgdUpdate, GPU) {
  testMatrixCase(testSgdUpdate<GpuMatrix>);
}
#endif
// Test entry point: initializes gtest and the HL (GPU) runtime before
// running all tests. The 0 passed to hl_init is presumably the device
// id — confirm against the hl_init declaration.
int main(int argc, char** argv) {
  testing::InitGoogleTest(&argc, argv);
  hl_start();
  hl_init(0);
  return RUN_ALL_TESTS();
}
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
#include "paddle/utils/Flags.h" #include "paddle/utils/Flags.h"
#include "paddle/math/TrainingAlgorithmOp.h"
#include "FirstOrderOptimizer.h" #include "FirstOrderOptimizer.h"
#include <cmath> #include <cmath>
...@@ -115,19 +115,28 @@ void SparseMomentumParameterOptimizer::finishBatch() { ...@@ -115,19 +115,28 @@ void SparseMomentumParameterOptimizer::finishBatch() {
void AdagradParameterOptimizer::update(const VectorPtr vecs[], void AdagradParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config, const ParameterConfig& config,
size_t sparseId) const { size_t sparseId) const {
vecs[PARAMETER_GRADIENT_SQURESUM1]->addSquare(*vecs[PARAMETER_GRADIENT], BaseMatrix& value = *vecs[PARAMETER_VALUE];
1.0f); BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM], BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
*vecs[PARAMETER_GRADIENT_SQURESUM1]); BaseMatrix& accum_buffer = *vecs[PARAMETER_GRADIENT_SQURESUM];
vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon()); BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM1];
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], real epsilon = optConfig_.ada_epsilon();
*vecs[PARAMETER_MOMENTUM], real learningRate = learningRate_ * config.learning_rate();
*vecs[PARAMETER_LEARNING_RATE], real momentum = config.momentum();
learningRate_ * config.learning_rate(), real decayRate = applyDecay_ ? config.decay_rate() : 0;
config.momentum(),
applyDecay_ ? config.decay_rate() : 0); adagradApply(value,
grad,
mom,
accum_buffer,
accum,
lr,
epsilon,
learningRate,
momentum,
decayRate);
} }
ParameterOptimizer::TraverseCallback ParameterOptimizer::TraverseCallback
...@@ -152,37 +161,41 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[], ...@@ -152,37 +161,41 @@ void AdaDeltaParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config, const ParameterConfig& config,
size_t sparseId) const { size_t sparseId) const {
CHECK(sparseId == -1LU) << "Sparse update is not supported"; CHECK(sparseId == -1LU) << "Sparse update is not supported";
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 BaseMatrix& value = *vecs[PARAMETER_VALUE];
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
*vecs[PARAMETER_GRADIENT], rou_, 1.0f - rou_); BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
BaseMatrix& accum = *vecs[PARAMETER_GRADIENT_SQURESUM];
// learn_rate = sqrt( ( E(dx_{t-1}^2) + epsilon ) / ( E(g_t^2) + epsilon ) ) BaseMatrix& accum_update = *vecs[PARAMETER_GRADIENT_SQURESUM1];
vecs[PARAMETER_LEARNING_RATE]->dotDiv(*vecs[PARAMETER_GRADIENT_SQURESUM1], BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
*vecs[PARAMETER_GRADIENT_SQURESUM],
epsilon_, real learningRate = learningRate_ * config.learning_rate();
epsilon_); real momentum = config.momentum();
vecs[PARAMETER_LEARNING_RATE]->sqrt(); real decayRate = applyDecay_ ? config.decay_rate() : 0;
// E(dx_t^2) = \rou * E(dx_{t-1}^2) + (1-\rou) * (-g*learn_rate)^2 adadeltaApply(value,
vecs[PARAMETER_GRADIENT_SQURESUM1]->decayAddSquareMul( grad,
*vecs[PARAMETER_GRADIENT], mom,
*vecs[PARAMETER_LEARNING_RATE], accum,
rou_, accum_update,
1.0f - rou_); lr,
rou_,
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], epsilon_,
*vecs[PARAMETER_MOMENTUM], learningRate,
*vecs[PARAMETER_LEARNING_RATE], momentum,
learningRate_ * config.learning_rate(), decayRate);
config.momentum(),
applyDecay_ ? config.decay_rate() : 0);
} }
void RMSPropParameterOptimizer::update(const VectorPtr vecs[], void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config, const ParameterConfig& config,
size_t sparseId) const { size_t sparseId) const {
real accumulatedRou = rou_; BaseMatrix& value = *vecs[PARAMETER_VALUE];
BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
BaseMatrix& sum1 = *vecs[PARAMETER_GRADIENT_SQURESUM1];
BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
real accumulatedRou = rou_;
bool firstTime = timer_ == 0; bool firstTime = timer_ == 0;
if (sparseId != -1LU) { if (sparseId != -1LU) {
CHECK_LT(sparseId, t0Vec_.size()); CHECK_LT(sparseId, t0Vec_.size());
...@@ -191,40 +204,36 @@ void RMSPropParameterOptimizer::update(const VectorPtr vecs[], ...@@ -191,40 +204,36 @@ void RMSPropParameterOptimizer::update(const VectorPtr vecs[],
t0Vec_[sparseId] = timer_ + 1; t0Vec_[sparseId] = timer_ + 1;
} }
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 real epsilon = optConfig_.ada_epsilon();
// For the first time update, make the sum be the current square real learningRate = learningRate_ * config.learning_rate();
// so that the initial estimation of E(g_t^2) will not be too small. real momentum = config.momentum();
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( real decayRate = applyDecay_ ? config.decay_rate() : 0;
*vecs[PARAMETER_GRADIENT],
accumulatedRou, rmspropApply(value,
firstTime ? 1.0f : 1.0f - rou_); grad,
mom,
// E(g_t) = \rou * E(g_{t-1}) + (1-\rou) * g sum,
vecs[PARAMETER_GRADIENT_SQURESUM1]->add( sum1,
*vecs[PARAMETER_GRADIENT], accumulatedRou, 1.0f - rou_); lr,
accumulatedRou,
// learn_rate = 1/sqrt( ( E(g_t^2) - (E(g_t))^2 + epsilon ) rou_,
// Basiclly if the sign of the gradient changes more often, epsilon,
// the learning rate will be decreased. learningRate,
vecs[PARAMETER_LEARNING_RATE]->assign(*vecs[PARAMETER_GRADIENT_SQURESUM]); momentum,
vecs[PARAMETER_LEARNING_RATE]->addSquare(*vecs[PARAMETER_GRADIENT_SQURESUM1], decayRate,
-1.0f); firstTime);
vecs[PARAMETER_LEARNING_RATE]->add(optConfig_.ada_epsilon());
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]);
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT],
*vecs[PARAMETER_MOMENTUM],
*vecs[PARAMETER_LEARNING_RATE],
learningRate_ * config.learning_rate(),
config.momentum(),
applyDecay_ ? config.decay_rate() : 0);
} }
void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config, const ParameterConfig& config,
size_t sparseId) const { size_t sparseId) const {
real accumulatedRou = rou_; BaseMatrix& value = *vecs[PARAMETER_VALUE];
BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
BaseMatrix& sum = *vecs[PARAMETER_GRADIENT_SQURESUM];
BaseMatrix& lr = *vecs[PARAMETER_LEARNING_RATE];
real accumulatedRou = rou_;
bool firstTime = timer_ == 0; bool firstTime = timer_ == 0;
if (sparseId != -1LU) { if (sparseId != -1LU) {
CHECK_LT(sparseId, t0Vec_.size()); CHECK_LT(sparseId, t0Vec_.size());
...@@ -233,77 +242,62 @@ void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[], ...@@ -233,77 +242,62 @@ void DecayedAdagradParameterOptimizer::update(const VectorPtr vecs[],
t0Vec_[sparseId] = timer_ + 1; t0Vec_[sparseId] = timer_ + 1;
} }
// E(g_t^2) = \rou * E(g_{t-1}^2) + (1-\rou) * g^2 real epsilon = optConfig_.ada_epsilon();
// For the first time update, make the sum be the current square real learningRate = learningRate_ * config.learning_rate();
// so that the initial estimation of E(g_t^2) will not be too small. real momentum = config.momentum();
vecs[PARAMETER_GRADIENT_SQURESUM]->decayAddSquare( real decayRate = applyDecay_ ? config.decay_rate() : 0;
*vecs[PARAMETER_GRADIENT],
accumulatedRou, decayedAdagradApply(value,
firstTime ? 1.0f : 1.0f - rou_); grad,
mom,
// learn_rate = 1/sqrt( ( E(g_t^2) + epsilon ) sum,
// Basiclly if the bigger the magnitude gradient is, lr,
// the smaller the learning rate will be. accumulatedRou,
vecs[PARAMETER_LEARNING_RATE]->assign(optConfig_.ada_epsilon()); rou_,
vecs[PARAMETER_LEARNING_RATE]->add(*vecs[PARAMETER_GRADIENT_SQURESUM]); epsilon,
vecs[PARAMETER_LEARNING_RATE]->invSqrt(*vecs[PARAMETER_LEARNING_RATE]); learningRate,
momentum,
vecs[PARAMETER_VALUE]->sgdUpdate(*vecs[PARAMETER_GRADIENT], decayRate,
*vecs[PARAMETER_MOMENTUM], firstTime);
*vecs[PARAMETER_LEARNING_RATE],
learningRate_ * config.learning_rate(),
config.momentum(),
applyDecay_ ? config.decay_rate() : 0);
} }
void AdamParameterOptimizer::update(const VectorPtr vecs[], void AdamParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config, const ParameterConfig& config,
size_t sparseId) const { size_t sparseId) const {
CHECK(sparseId == -1UL) << "Sparse update is not supported"; CHECK(sparseId == -1UL) << "Sparse update is not supported";
Vector* m = vecs[PARAMETER_MOMENTUM].get(); real beta1_power = std::pow(beta1_, step_);
Vector* g = vecs[PARAMETER_GRADIENT].get(); real beta2_power = std::pow(beta2_, step_);
Vector* v = vecs[PARAMETER_SECOND_MOMENTUM].get(); real learningRate = config.learning_rate() * learningRate_;
Vector* theta = vecs[PARAMETER_VALUE].get();
BaseMatrix& value = *vecs[PARAMETER_VALUE];
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t; BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
m->add(*g, beta1_, 1 - beta1_); BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
BaseMatrix& v = *vecs[PARAMETER_SECOND_MOMENTUM];
// v_t = \beta_2 * v_{t-1} + (1-\beta_2)* g_{t-1}^2
g->square(); adamApply(value,
v->add(*g, beta2_, 1 - beta2_); grad,
mom,
// tmp = m_t / ( \sqrt{v_t} + \epsilon ) v,
// \theta_t = \theta_{t-1} - \alpha * \sqrt(1-\beta_2^t) / (1-\beta_1^t) * tmp beta1_,
g->sqrt(*v); beta2_,
g->dotDiv(*m, *g, 0., epsilon_); beta1_power,
real alpha = config.learning_rate() * learningRate_; beta2_power,
alpha = alpha * std::sqrt(1 - std::pow(beta2_, step_)) / epsilon_,
(1 - std::pow(beta1_, step_)); learningRate);
theta->add(*theta, 1.0, *g, -alpha);
} }
void AdamaxParameterOptimizer::update(const VectorPtr vecs[], void AdamaxParameterOptimizer::update(const VectorPtr vecs[],
const ParameterConfig& config, const ParameterConfig& config,
size_t sparseId) const { size_t sparseId) const {
CHECK(sparseId == -1UL) << "Sparse update is not supported"; CHECK(sparseId == -1UL) << "Sparse update is not supported";
Vector* m = vecs[PARAMETER_MOMENTUM].get(); real learningRate = config.learning_rate() * learningRate_;
Vector* g = vecs[PARAMETER_GRADIENT].get();
Vector* u = vecs[PARAMETER_WEIGHTED_INFINITY_NORM].get();
Vector* theta = vecs[PARAMETER_VALUE].get();
// m_t = \beta_1 * m_{t-1} + (1-\beta_1)* g_t;
m->add(*g, beta1_, 1 - beta1_);
// u_t = max(\beta_2*u_{t-1}, abs(g_t)) BaseMatrix& value = *vecs[PARAMETER_VALUE];
u->mulScalar(beta2_); BaseMatrix& grad = *vecs[PARAMETER_GRADIENT];
g->abs(); BaseMatrix& mom = *vecs[PARAMETER_MOMENTUM];
u->max(*u, *g); BaseMatrix& u = *vecs[PARAMETER_WEIGHTED_INFINITY_NORM];
// \theta_t = \theta_{t-1} - (\alpha/(1-\beta_1^t))*m_t/u_t adamaxApply(value, grad, mom, u, beta1_, beta2_, step_, learningRate);
g->dotDiv(*m, *u);
real learningRate = config.learning_rate() * learningRate_;
learningRate /= (1 - std::pow(beta1_, step_));
theta->add(*theta, 1.0, *g, -learningRate);
} }
void OptimizerWithGradientClipping::update(const VectorPtr vecs[], void OptimizerWithGradientClipping::update(const VectorPtr vecs[],
......
FROM ubuntu:14.04 FROM ubuntu:14.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com> MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
RUN apt-get update && \ RUN apt-get update \
apt-get install -y cmake libprotobuf-dev protobuf-compiler git \ && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \ libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \
python-protobuf python-numpy python-dev swig openssh-server \ python-protobuf python-numpy python-dev swig openssh-server \
wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \ wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
sed grep graphviz libjpeg-dev zlib1g-dev doxygen && \ sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
apt-get clean -y clang-3.8 llvm-3.8 libclang-3.8-dev \
RUN pip install BeautifulSoup docopt PyYAML pillow \ && apt-get clean -y
'sphinx>=1.4.0' sphinx_rtd_theme breathe recommonmark RUN pip install -U BeautifulSoup docopt PyYAML pillow \
sphinx sphinx_rtd_theme breathe recommonmark
ARG WITH_AVX ARG WITH_AVX
ENV WITH_AVX=${WITH_AVX:-ON} ARG WITH_DOC
ARG WITH_SWIG_PY
ARG WITH_STYLE_CHECK
ENV WITH_GPU=OFF ENV WITH_GPU=OFF
ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_DOC=${WITH_DOC:-ON}
ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
RUN mkdir /paddle RUN mkdir /paddle
COPY . /paddle/ COPY . /paddle/
RUN /paddle/paddle/scripts/docker/build.sh RUN /paddle/paddle/scripts/docker/build.sh
VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
RUN pip install /usr/local/opt/paddle/share/wheels/*.whl RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
......
FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04 FROM nvidia/cuda:7.5-cudnn5-devel-ubuntu14.04
MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com> MAINTAINER PaddlePaddle Authors <paddle-dev@baidu.com>
RUN apt-get update && \ RUN apt-get update \
apt-get install -y cmake libprotobuf-dev protobuf-compiler git \ && apt-get install -y cmake libprotobuf-dev protobuf-compiler git \
libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \ libgoogle-glog-dev libgflags-dev libatlas-dev libatlas3-base g++ m4 python-pip \
python-protobuf python-numpy python-dev swig openssh-server \ python-protobuf python-numpy python-dev swig openssh-server \
wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \ wget unzip python-matplotlib tar xz-utils bzip2 gzip coreutils \
sed grep graphviz libjpeg-dev zlib1g-dev doxygen && \ sed grep graphviz libjpeg-dev zlib1g-dev doxygen \
apt-get clean -y clang-3.8 llvm-3.8 libclang-3.8-dev \
RUN pip install BeautifulSoup docopt PyYAML pillow \ && apt-get clean -y
'sphinx>=1.4.0' sphinx_rtd_theme breathe recommonmark RUN pip install -U BeautifulSoup docopt PyYAML pillow \
sphinx sphinx_rtd_theme breathe recommonmark
ARG WITH_AVX ARG WITH_AVX
ENV WITH_AVX=${WITH_AVX:-ON} ARG WITH_DOC
ARG WITH_SWIG_PY
ARG WITH_STYLE_CHECK
ENV WITH_GPU=ON ENV WITH_GPU=ON
ENV WITH_AVX=${WITH_AVX:-ON}
ENV WITH_DOC=${WITH_DOC:-ON}
ENV WITH_SWIG_PY=${WITH_SWIG_PY:-ON}
ENV WITH_STYLE_CHECK=${WITH_STYLE_CHECK:-OFF}
RUN mkdir /paddle RUN mkdir /paddle
COPY . /paddle/ COPY . /paddle/
RUN /paddle/paddle/scripts/docker/build.sh RUN /paddle/paddle/scripts/docker/build.sh
VOLUME ["/usr/share/nginx/html/data", "/usr/share/nginx/html/paddle"]
RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile RUN echo 'export LD_LIBRARY_PATH=/usr/lib64:${LD_LIBRARY_PATH}' >> /etc/profile
RUN pip install /usr/local/opt/paddle/share/wheels/*.whl RUN pip install /usr/local/opt/paddle/share/wheels/*.whl
......
...@@ -20,8 +20,28 @@ cmake .. \ ...@@ -20,8 +20,28 @@ cmake .. \
-DWITH_AVX=${WITH_AVX} \ -DWITH_AVX=${WITH_AVX} \
-DWITH_SWIG_PY=ON \ -DWITH_SWIG_PY=ON \
-DCUDNN_ROOT=/usr/ \ -DCUDNN_ROOT=/usr/ \
-DWITH_STYLE_CHECK=OFF -DWITH_STYLE_CHECK=OFF \
-DCMAKE_EXPORT_COMPILE_COMMANDS=ON
make -j `nproc` make -j `nproc`
make install make install
# Install woboq_codebrowser.
git clone https://github.com/woboq/woboq_codebrowser /woboq
cd /woboq
cmake -DLLVM_CONFIG_EXECUTABLE=/usr/bin/llvm-config-3.8 \
-DCMAKE_BUILD_TYPE=Release \
.
make
export WOBOQ_OUT=/usr/share/nginx/html/paddle
export BUILD_DIR=/paddle/build
mkdir -p $WOBOQ_OUT
cp -rv /woboq/data $WOBOQ_OUT/../data
/woboq/generator/codebrowser_generator \
-b /paddle/build \
-a \
-o $WOBOQ_OUT \
-p paddle:/paddle
/woboq/indexgenerator/codebrowser_indexgenerator $WOBOQ_OUT
trap : 0 trap : 0
FROM paddledev/paddle:cpu-devel-latest FROM paddledev/paddle:cpu-devel-latest
COPY build.sh / COPY build.sh /
RUN pip install sphinx &&\ RUN pip install sphinx &&\
pip install sphinx_rtd_theme &&\
apt install -y doxygen graphviz &&\ apt install -y doxygen graphviz &&\
pip install breathe recommonmark numpy protobuf==2.6.1 pip install breathe recommonmark numpy protobuf==2.6.1
CMD /build.sh CMD /build.sh
#!/bin/bash #!/bin/bash
./build_submodules.sh
source ./common.sh source ./common.sh
CMAKE_EXTRA="" CMAKE_EXTRA=""
if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then if [[ "$TRAVIS_OS_NAME" == "osx" ]]; then
......
#!/bin/bash
set -e
WORK_DIR=$PWD
PROJ_ROOT=$(git rev-parse --show-cdup)
SUBMODULES=$(grep path ${PROJ_ROOT}.gitmodules | sed 's/^.*path = //')
for module in $SUBMODULES
do
case $module in
"warp-ctc")
if [ -d ${PROJ_ROOT}warp-ctc/build ]; then
rm -rf ${PROJ_ROOT}warp-ctc/build
fi
mkdir ${PROJ_ROOT}warp-ctc/build
cd ${PROJ_ROOT}warp-ctc/build
cmake ..; make
;;
esac
done
cd $WORK_DIR
...@@ -17,22 +17,22 @@ limitations under the License. */ ...@@ -17,22 +17,22 @@ limitations under the License. */
#include <fenv.h> #include <fenv.h>
#include <stdio.h> #include <stdio.h>
#include <iostream>
#include <iomanip> #include <iomanip>
#include <sstream> #include <iostream>
#include <limits> #include <limits>
#include <sstream>
#include <google/protobuf/text_format.h> #include <google/protobuf/text_format.h>
#include "paddle/utils/GlobalConstants.h"
#include "paddle/utils/PythonUtil.h" #include "paddle/utils/PythonUtil.h"
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
#include "paddle/utils/GlobalConstants.h"
#include "TesterConfig.h"
#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h" #include "paddle/gserver/gradientmachines/NeuralNetwork.h"
#include "paddle/gserver/layers/ValidationLayer.h" #include "paddle/gserver/layers/ValidationLayer.h"
#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
#include "TesterConfig.h"
namespace paddle { namespace paddle {
...@@ -66,6 +66,9 @@ Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config, ...@@ -66,6 +66,9 @@ Tester::Tester(const std::shared_ptr<TrainerConfigHelper>& config,
} }
void Tester::startTestPeriod() { void Tester::startTestPeriod() {
if (testDataProvider_) {
testDataProvider_->reset();
}
testEvaluator_->start(); testEvaluator_->start();
testContext_.cost = 0; testContext_.cost = 0;
testContext_.numSamples = 0; testContext_.numSamples = 0;
...@@ -87,33 +90,18 @@ void Tester::testOneDataBatch(const DataBatch& dataBatch, ...@@ -87,33 +90,18 @@ void Tester::testOneDataBatch(const DataBatch& dataBatch,
void Tester::testOnePeriod() { void Tester::testOnePeriod() {
DataBatch dataBatch; DataBatch dataBatch;
int64_t batchSize = config_->getOptConfig().batch_size(); int64_t batchSize = config_->getOptConfig().batch_size();
bool testAllData =
intconfig_->testPeriod == 0 || intconfig_->testAllDataInOnePeriod;
int batches =
testAllData ? std::numeric_limits<int>::max() : intconfig_->testPeriod;
std::vector<Argument> outArgs; std::vector<Argument> outArgs;
startTestPeriod(); startTestPeriod();
for (int i = 0; i < batches; ++i) { while (testDataProvider_->getNextBatch(batchSize, &dataBatch) != 0) {
int num = testDataProvider_->getNextBatch(batchSize, &dataBatch);
if (num == 0) {
testDataProvider_->reset();
if (intconfig_->prevBatchState) {
gradientMachine_->resetState();
}
if (testAllData) {
break;
} else {
num = testDataProvider_->getNextBatch(batchSize, &dataBatch);
}
}
testOneDataBatch(dataBatch, &outArgs); testOneDataBatch(dataBatch, &outArgs);
} }
finishTestPeriod(); finishTestPeriod();
} }
void Tester::finishTestPeriod() { void Tester::finishTestPeriod() {
if (intconfig_->prevBatchState) {
gradientMachine_->resetState();
}
testEvaluator_->finish(); testEvaluator_->finish();
CHECK_GT(testContext_.numSamples, 0) CHECK_GT(testContext_.numSamples, 0)
<< "There is no samples in your test batch. Possibly " << "There is no samples in your test batch. Possibly "
......
...@@ -39,11 +39,6 @@ struct TesterConfig { ...@@ -39,11 +39,6 @@ struct TesterConfig {
*/ */
int testPeriod; int testPeriod;
/**
* indicate whether testing data in one period
*/
bool testAllDataInOnePeriod;
/** /**
* indicate whether to save previous batch state * indicate whether to save previous batch state
*/ */
......
...@@ -17,43 +17,41 @@ limitations under the License. */ ...@@ -17,43 +17,41 @@ limitations under the License. */
#include <fenv.h> #include <fenv.h>
#include <stdio.h> #include <stdio.h>
#include <iostream>
#include <iomanip> #include <iomanip>
#include <sstream> #include <iostream>
#include <limits> #include <limits>
#include <sstream>
#include <google/protobuf/text_format.h> #include <google/protobuf/text_format.h>
#include "paddle/utils/Excepts.h"
#include "paddle/utils/GlobalConstants.h"
#include "paddle/utils/PythonUtil.h" #include "paddle/utils/PythonUtil.h"
#include "paddle/utils/Stat.h" #include "paddle/utils/Stat.h"
#include "paddle/utils/Util.h" #include "paddle/utils/Util.h"
#include "paddle/utils/Excepts.h"
#include "paddle/utils/GlobalConstants.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h" #include "RemoteParameterUpdater.h"
#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
#include "paddle/gserver/layers/ValidationLayer.h"
#include "TesterConfig.h" #include "TesterConfig.h"
#include "ThreadParameterUpdater.h" #include "ThreadParameterUpdater.h"
#include "RemoteParameterUpdater.h"
#include "TrainerConfigHelper.h" #include "TrainerConfigHelper.h"
#include "paddle/gserver/gradientmachines/GradientMachineMode.h"
#include "paddle/gserver/gradientmachines/NeuralNetwork.h"
#include "paddle/gserver/layers/ValidationLayer.h"
P_DEFINE_string(config, "", "Trainer config file"); P_DEFINE_string(config, "", "Trainer config file");
P_DEFINE_int32(test_period, P_DEFINE_int32(test_period,
0, 0,
"Run test every so many train batches." "if equal 0, do test on all test data at the end of "
" 0 for testing after each pass." "each pass. While if equal non-zero, do test on all test "
" If not 0, test log_period batches." "data every test_period batches");
" If 0, test on all test data"); P_DEFINE_bool(test_all_data_in_one_period,
false,
"This option was deprecated, since we will always do "
"test on all test set ");
P_DEFINE_bool(local, true, "Train in local mode or not"); P_DEFINE_bool(local, true, "Train in local mode or not");
P_DEFINE_bool(
test_all_data_in_one_period,
false,
"true will test all data in one test peroid."
"Otherwise test (batch_size * log_peroid) data in one test period.");
P_DEFINE_int32(average_test_period, P_DEFINE_int32(average_test_period,
0, 0,
"Do test on average parameter every so" "Do test on average parameter every so"
...@@ -396,10 +394,6 @@ void Trainer::startTrain() { ...@@ -396,10 +394,6 @@ void Trainer::startTrain() {
dataProvider_->reset(); dataProvider_->reset();
} }
if (this->testDataProvider_) {
this->testDataProvider_->reset();
}
trainerInternal_.getGradientMachine()->start(*config_, dataProvider_); trainerInternal_.getGradientMachine()->start(*config_, dataProvider_);
} }
...@@ -633,8 +627,17 @@ void Trainer::test() { tester_->test(); } ...@@ -633,8 +627,17 @@ void Trainer::test() { tester_->test(); }
std::unique_ptr<TesterConfig> Trainer::createTesterConfig() { std::unique_ptr<TesterConfig> Trainer::createTesterConfig() {
TesterConfig* conf = new TesterConfig; TesterConfig* conf = new TesterConfig;
if (FLAGS_test_period) {
LOG(WARNING) << "The meaning of --test_period is changed: "
<< "if equal 0, do test on all test data at the end of "
<< "each pass. While if equal non-zero, do test on all test "
<< "data every test_period batches ";
}
if (FLAGS_test_all_data_in_one_period) {
LOG(WARNING) << "--test_all_data_in_one_period was deprecated, since "
<< "we will always do test on all test set ";
}
conf->testPeriod = FLAGS_test_period; conf->testPeriod = FLAGS_test_period;
conf->testAllDataInOnePeriod = FLAGS_test_all_data_in_one_period;
conf->prevBatchState = FLAGS_prev_batch_state; conf->prevBatchState = FLAGS_prev_batch_state;
conf->logPeriod = FLAGS_log_period; conf->logPeriod = FLAGS_log_period;
conf->loadsaveParametersInPserver = FLAGS_loadsave_parameters_in_pserver; conf->loadsaveParametersInPserver = FLAGS_loadsave_parameters_in_pserver;
......
...@@ -422,6 +422,9 @@ sinclude(`ModelConfigLayer.proto.m4') ...@@ -422,6 +422,9 @@ sinclude(`ModelConfigLayer.proto.m4')
// to indicate rectangle image data // to indicate rectangle image data
optional uint64 height = 50; optional uint64 height = 50;
optional uint64 width = 51; optional uint64 width = 51;
// blank label used in ctc loss
optional uint32 blank = 52 [default = 0];
} }
message EvaluatorConfig { message EvaluatorConfig {
......
...@@ -1872,7 +1872,7 @@ class BatchNormLayer(LayerBase): ...@@ -1872,7 +1872,7 @@ class BatchNormLayer(LayerBase):
image_conf = self.config.inputs[0].image_conf image_conf = self.config.inputs[0].image_conf
parse_image(self.inputs[0].image, input_layer.name, image_conf) parse_image(self.inputs[0].image, input_layer.name, image_conf)
self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size, self.set_cnn_layer(name, image_conf.img_size_y, image_conf.img_size,
image_conf.channels) image_conf.channels, False)
psize = self.calc_parameter_size(image_conf) psize = self.calc_parameter_size(image_conf)
dims = [1, psize] dims = [1, psize]
...@@ -2987,6 +2987,27 @@ class CTCLayer(LayerBase): ...@@ -2987,6 +2987,27 @@ class CTCLayer(LayerBase):
config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs') config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs')
@config_layer('warp_ctc')
class WarpCTCLayer(LayerBase):
def __init__(self,
name,
size,
inputs,
blank=0,
norm_by_times=False,
device=None):
super(WarpCTCLayer, self).__init__(
name, 'warp_ctc', size=size, inputs=inputs, device=device)
self.config.blank = blank
self.config.norm_by_times = norm_by_times
config_assert(len(self.inputs) == 2, 'WarpCTCLayer must have 2 inputs')
input_layer = self.get_input_layer(0)
config_assert(
(input_layer.active_type == '' or
input_layer.active_type == 'linear'),
"Expecting the active_type of input layer to be linear or null")
@config_layer('recurrent_layer_group') @config_layer('recurrent_layer_group')
class RecurrentLayerGroup(LayerBase): class RecurrentLayerGroup(LayerBase):
def __init__(self, name, device=None): def __init__(self, name, device=None):
...@@ -3377,7 +3398,21 @@ def parse_config(config_file, config_arg_str): ...@@ -3377,7 +3398,21 @@ def parse_config(config_file, config_arg_str):
g_root_submodel.is_recurrent_layer_group = False g_root_submodel.is_recurrent_layer_group = False
g_current_submodel = g_root_submodel g_current_submodel = g_root_submodel
execfile(config_file, make_config_environment(config_file, config_args)) # for paddle on spark, need support non-file config.
# you can use parse_config like below:
#
# from paddle.trainer.config_parser import parse_config
# def configs():
# #your paddle config code, which is same as config file.
#
# config = parse_config(configs, "is_predict=1")
# # then you get config proto object.
if hasattr(config_file, '__call__'):
config_file.func_globals.update(
make_config_environment("", config_args))
config_file()
else:
execfile(config_file, make_config_environment(config_file, config_args))
for k, v in settings.iteritems(): for k, v in settings.iteritems():
if v is None: if v is None:
continue continue
......
...@@ -91,6 +91,7 @@ __all__ = [ ...@@ -91,6 +91,7 @@ __all__ = [
'linear_comb_layer', 'linear_comb_layer',
'convex_comb_layer', 'convex_comb_layer',
'ctc_layer', 'ctc_layer',
'warp_ctc_layer',
'crf_layer', 'crf_layer',
'crf_decoding_layer', 'crf_decoding_layer',
'nce_layer', 'nce_layer',
...@@ -172,6 +173,7 @@ class LayerType(object): ...@@ -172,6 +173,7 @@ class LayerType(object):
PRINT_LAYER = "print" PRINT_LAYER = "print"
CTC_LAYER = "ctc" CTC_LAYER = "ctc"
WARP_CTC_LAYER = "warp_ctc"
CRF_LAYER = "crf" CRF_LAYER = "crf"
CRF_DECODING_LAYER = "crf_decoding" CRF_DECODING_LAYER = "crf_decoding"
NCE_LAYER = 'nce' NCE_LAYER = 'nce'
...@@ -4096,6 +4098,83 @@ def ctc_layer(input, ...@@ -4096,6 +4098,83 @@ def ctc_layer(input,
return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size) return LayerOutput(name, LayerType.CTC_LAYER, [input, label], size=size)
@wrap_name_default()
@layer_support()
def warp_ctc_layer(input,
label,
size=None,
name=None,
blank=0,
norm_by_times=False,
layer_attr=None):
"""
A layer intergrating the open-source `warp-ctc
<https://github.com/baidu-research/warp-ctc>` library, which is used in
`Deep Speech 2: End-toEnd Speech Recognition in English and Mandarin
<https://arxiv.org/pdf/1512.02595v1.pdf>`, to compute Connectionist Temporal
Classification (CTC) loss.
More details of CTC can be found by referring to `Connectionist Temporal
Classification: Labelling Unsegmented Sequence Data with Recurrent
Neural Networks <http://machinelearning.wustl.edu/mlpapers/paper_files/
icml2006_GravesFGS06.pdf>`_
Note:
- Let num_classes represent the category number. Considering the 'blank'
label needed by CTC, you need to use (num_classes + 1) as the input
size. Thus, the size of both warp_ctc_layer and 'input' layer should
be set to num_classes + 1.
- You can set 'blank' to any value ranged in [0, num_classes], which
should be consistent as that used in your labels.
- As a native 'softmax' activation is interated to the warp-ctc library,
'linear' activation is expected instead in the 'input' layer.
The simple usage:
.. code-block:: python
ctc = warp_ctc_layer(input=input,
label=label,
size=1001,
blank=1000,
norm_by_times=False)
:param input: The input layer.
:type input: LayerOutput
:param label: The data layer of label with variable length.
:type label: LayerOutput
:param size: category numbers + 1.
:type size: int
:param name: The name of this layer, which can not specify.
:type name: basestring|None
:param blank: the 'blank' label used in ctc
:type blank: int
:param norm_by_times: Whether to normalization by times. False by default.
:type norm_by_times: bool
:param layer_attr: Extra Layer config.
:type layer_attr: ExtraLayerAttribute|None
:return: LayerOutput object.
:rtype: LayerOutput
"""
assert isinstance(input, LayerOutput)
assert isinstance(label, LayerOutput)
if label.size is not None:
if size is not None:
assert size == label.size + 1
else:
size = label.size + 1
Layer(
name=name,
type=LayerType.WARP_CTC_LAYER,
size=size,
blank=blank,
norm_by_times=norm_by_times,
inputs=[input.name, label.name],
**ExtraLayerAttribute.to_kwargs(layer_attr))
return LayerOutput(
name, LayerType.WARP_CTC_LAYER, parents=[input, label], size=size)
@wrap_name_default() @wrap_name_default()
@wrap_param_attr_default() @wrap_param_attr_default()
@layer_support() @layer_support()
......
...@@ -47,6 +47,20 @@ layers { ...@@ -47,6 +47,20 @@ layers {
} }
norm_by_times: false norm_by_times: false
} }
layers {
name: "__warp_ctc_layer_0__"
type: "warp_ctc"
size: 5001
active_type: ""
inputs {
input_layer_name: "input"
}
inputs {
input_layer_name: "labels"
}
norm_by_times: false
blank: 0
}
layers { layers {
name: "crf_label" name: "crf_label"
type: "data" type: "data"
...@@ -244,6 +258,7 @@ input_layer_names: "xe-label" ...@@ -244,6 +258,7 @@ input_layer_names: "xe-label"
input_layer_names: "huber_probs" input_layer_names: "huber_probs"
input_layer_names: "huber_label" input_layer_names: "huber_label"
output_layer_names: "__ctc_layer_0__" output_layer_names: "__ctc_layer_0__"
output_layer_names: "__warp_ctc_layer_0__"
output_layer_names: "__crf_layer_0__" output_layer_names: "__crf_layer_0__"
output_layer_names: "__rank_cost_0__" output_layer_names: "__rank_cost_0__"
output_layer_names: "__lambda_cost_0__" output_layer_names: "__lambda_cost_0__"
...@@ -260,6 +275,7 @@ sub_models { ...@@ -260,6 +275,7 @@ sub_models {
layer_names: "xe-label" layer_names: "xe-label"
layer_names: "__fc_layer_0__" layer_names: "__fc_layer_0__"
layer_names: "__ctc_layer_0__" layer_names: "__ctc_layer_0__"
layer_names: "__warp_ctc_layer_0__"
layer_names: "crf_label" layer_names: "crf_label"
layer_names: "__crf_layer_0__" layer_names: "__crf_layer_0__"
layer_names: "left" layer_names: "left"
...@@ -289,6 +305,7 @@ sub_models { ...@@ -289,6 +305,7 @@ sub_models {
input_layer_names: "huber_probs" input_layer_names: "huber_probs"
input_layer_names: "huber_label" input_layer_names: "huber_label"
output_layer_names: "__ctc_layer_0__" output_layer_names: "__ctc_layer_0__"
output_layer_names: "__warp_ctc_layer_0__"
output_layer_names: "__crf_layer_0__" output_layer_names: "__crf_layer_0__"
output_layer_names: "__rank_cost_0__" output_layer_names: "__rank_cost_0__"
output_layer_names: "__lambda_cost_0__" output_layer_names: "__lambda_cost_0__"
......
...@@ -12,6 +12,8 @@ hidden = fc_layer(input=seq_in, size=4) ...@@ -12,6 +12,8 @@ hidden = fc_layer(input=seq_in, size=4)
outputs( outputs(
ctc_layer( ctc_layer(
input=seq_in, label=labels), input=seq_in, label=labels),
warp_ctc_layer(
input=seq_in, label=labels, blank=0),
crf_layer( crf_layer(
input=hidden, label=data_layer( input=hidden, label=data_layer(
name='crf_label', size=4)), name='crf_label', size=4)),
......
Subproject commit bd535c8d44e03c8ebd2d768e06c8c05fdccd11d2
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册