Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into rnn_forward_result_test

b1b71eab · superjom · e9a92e3e · 452f3cc0 · b1b71eab · b1b71eab
163 changed file
--- a/.gitignore
+++ b/.gitignore
@@ -24,4 +24,5 @@ cmake-build-*
 python/paddle/v2/framework/core.so
 CMakeFiles
 cmake_install.cmake
-
+paddle/.timestamp
+python/paddlepaddle.egg-info/
--- a/.travis.yml
+++ b/.travis.yml
@@ -37,8 +37,8 @@ before_install:
  - if [[ "$JOB" == "check_style" ]]; then sudo ln -s /usr/bin/clang-format-3.8 /usr/bin/clang-format; fi
  # Paddle is using protobuf 3.1 currently. Protobuf 3.2 breaks the compatibility. So we specify the python
  # protobuf version.
-  - pip install numpy wheel 'protobuf==3.1' sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit requests==2.9.2 LinkChecker
-  - pip install rarfile
+  - pip install -r $TRAVIS_BUILD_DIR/python/requirements.txt
+  - pip install wheel sphinx==1.5.6 recommonmark sphinx-rtd-theme==0.1.9 virtualenv pre-commit LinkChecker
  - curl https://glide.sh/get | bash
  - eval "$(GIMME_GO_VERSION=1.8.3 gimme)"
  - go get -u github.com/alecthomas/gometalinter

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,8 +14,8 @@

 cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
-set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR})
+set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})

 include(system)

@@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -121,8 +121,8 @@ include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage


-include_directories("${PROJ_ROOT}")
-include_directories("${PROJ_ROOT}/paddle/cuda/include")
+include_directories("${PADDLE_SOURCE_DIR}")
+include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
 include_directories(${Boost_INCLUDE_DIRS})
@@ -144,7 +144,7 @@ if(WITH_GPU)
 endif(WITH_GPU)

 if(WITH_MKLDNN)
-    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLDNN_IOMP_LIB})
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
 endif()

 if(USE_NNPACK)
@@ -164,10 +164,12 @@ if(WITH_GOLANG)
    add_subdirectory(go)
 endif(WITH_GOLANG)

+set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 add_subdirectory(paddle)
 if(WITH_PYTHON)
  add_subdirectory(python)
 endif()
+
 if(WITH_DOC)
    add_subdirectory(doc)
 endif()
--- a/Dockerfile
+++ b/Dockerfile
@@ -28,7 +28,7 @@ RUN apt-get update && \
    wget unzip unrar tar xz-utils bzip2 gzip coreutils ntp \
    curl sed grep graphviz libjpeg-dev zlib1g-dev  \
    python-matplotlib gcc-4.8 g++-4.8 \
-    automake locales clang-format-3.8 swig doxygen cmake  \
+    automake locales clang-format swig doxygen cmake  \
    liblapack-dev liblapacke-dev libboost-dev \
    clang-3.8 llvm-3.8 libclang-3.8-dev \
    net-tools && \
@@ -64,13 +64,28 @@ RUN pip install --upgrade pip && \
    pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
    pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
    pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip install rarfile
+    pip install opencv-python rarfile 'scipy>=0.19.0' 'nltk>=3.2.2'

 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
 RUN pip install certifi urllib3[secure]

+# TODO(qijun) The template library Eigen doesn't work well with GCC 5 
+# coming with the default Docker image, so we switch to use GCC 4.8 
+# by default. And I will check Eigen library later.
+
+RUN ln -sf gcc-4.8 /usr/bin/gcc && \
+    ln -sf gcc-ar-4.8 /usr/bin/gcc-ar && \
+    ln -sf gcc-nm-4.8 /usr/bin/gcc-nm && \
+    ln -sf gcc-ranlib-4.8 /usr/bin/gcc-ranlib && \
+    ln -sf gcc-4.8 /usr/bin/x86_64-linux-gnu-gcc && \
+    ln -sf gcc-ar-4.8 /usr/bin/x86_64-linux-gnu-gcc-ar && \
+    ln -sf gcc-nm-4.8 /usr/bin/x86_64-linux-gnu-gcc-nm && \
+    ln -sf gcc-ranlib-4.8 /usr/bin/x86_64-linux-gnu-gcc-ranlib && \
+    ln -sf g++-4.8 /usr/bin/g++ && \
+    ln -sf g++-4.8 /usr/bin/x86_64-linux-gnu-g++ 
+
 # Install woboq_codebrowser to /woboq
 RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
    (cd /woboq \

--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -129,7 +129,7 @@ if(WITH_GOLANG)
    add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
      COMMAND env GOPATH=${GOPATH} ${GLIDE} install
      COMMAND touch ${CMAKE_BINARY_DIR}/glide
-      DEPENDS ${PROJ_ROOT}/go/glide.lock
+      DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
      WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
      )


--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -52,7 +52,7 @@ macro(add_style_check_target TARGET_NAME)

        if(SOURCES_LIST)
            add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-                COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
                        "--filter=${STYLE_FILTER}"
                        ${SOURCES_LIST}
                COMMENT "cpplint: Checking source code style"

--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -9,10 +9,12 @@ function(CheckCompilerCXX11Flag)
        if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
            message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
        endif()
-        # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
-        # Use Debug mode instead for now.
-        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) 
-            set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
+        if(NOT ANDROID)
+            # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
+            # Use Debug mode instead for now.
+            if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
+                set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
+            endif()
        endif()
    elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
        # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -411,7 +411,7 @@ function(py_test TARGET_NAME)
    set(multiValueArgs SRCS DEPS)
    cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})  
    add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
             python2 ${py_test_SRCS}
             WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
  endif()

--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -12,7 +12,7 @@ set(CPACK_PACKAGE_DESCRIPTION "")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl")
 set(CPACK_DEBIAN_PACKAGE_SECTION Devel)
 set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION})
-set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst")
+set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst")
 #set(CPACK_GENERATOR "DEB")
 # Start cpack
 include (CMakePackageConfigHelpers)

--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -141,8 +141,8 @@ endmacro()
 function(create_resources res_file output_file)
  add_custom_command(
    OUTPUT ${output_file}
-    COMMAND python ARGS ${PROJ_ROOT}/cmake/make_resource.py ${res_file} ${output_file}
-    DEPENDS ${res_file} ${PROJ_ROOT}/cmake/make_resource.py)
+    COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
+    DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
 endfunction()



--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -4,7 +4,7 @@ set(tmp_version "HEAD")
 while ("${PADDLE_VERSION}" STREQUAL "")
  execute_process(
    COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
-    WORKING_DIRECTORY ${PROJ_ROOT}
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
    OUTPUT_VARIABLE GIT_TAG_NAME
    RESULT_VARIABLE GIT_RESULT
    ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)

--- a/doc/api/v2/config/layer.rst
+++ b/doc/api/v2/config/layer.rst
@@ -257,6 +257,11 @@ seq_concat
 ..  autoclass:: paddle.v2.layer.seq_concat
    :noindex:

+kmax_sequence_score
+-------------------
+..  autoclass:: paddle.v2.layer.kmax_sequence_score
+    :noindex:
+
 sub_nested_seq
 --------------
 ..  autoclass:: paddle.v2.layer.sub_nested_seq

--- a/doc/design/auto_gradient_check.md
+++ b/doc/design/auto_gradient_check.md
+## Auto Gradient Checker Design
+
+## Backgraound：
+- Operator forward computing is easy to check if the result is right because it has a clear definition. **But** backpropagation is a notoriously difficult algorithm to debug and get right:
+  - 1. you should get the right backpropagation formula according to the forward computation.
+  - 2. you should implement it right in CPP.
+  - 3. it's difficult to prepare test data.
+
+- Auto gradient check gets a numeric gradient by forward Operator and use it as a reference of the backward Operator's result. It has several advantages:
+  - 1. numeric gradient checker only need forward operator.
+  - 2. user only need to prepare the input data for forward Operator.
+
+## Mathematical Theory
+The following two document from stanford has a detailed explanation of how to get numeric gradient and why it's useful.
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
+
+
+## Numeric Gradient Implementation
+### Python Interface
+```python
+def get_numeric_gradient(op,
+                         input_values,
+                         output_name,
+                         input_to_check,
+                         delta=0.005,
+                         local_scope=None):
+    """
+    Get Numeric Gradient for an operator's input.
+
+    :param op: C++ operator instance, could be an network
+    :param input_values: The input variables. Should be an dictionary, key is
+    variable name. Value is numpy array.
+    :param output_name: The final output variable name.
+    :param input_to_check: The input variable need to get gradient.
+    :param delta: The perturbation value for numeric gradient method. The
+    smaller delta is, the more accurate result will get. But if that delta is
+     too small, it could occur numerical stability problem.
+    :param local_scope: The local scope used for get_numeric_gradient.
+    :return: The gradient array in numpy format.
+    """
+```
+
+### Explaination:
+
+- Why need `output_name`
+  - One Operator may have multiple Output, you can get independent gradient from each Output. So user should set one output to calculate.
+
+- Why need `input_to_check`
+  - One operator may have multiple inputs. Gradient Op can calculate the gradient of these Inputs at the same time. But Numeric Gradient needs to calculate them one by one. So `get_numeric_gradient` is designed to calculate the gradient for one input. If you need to compute multiple inputs, you can call `get_numeric_gradient` multiple times.
+
+
+### Core Algorithm Implementation
+
+
+```python
+    # we only compute gradient of one element each time.
+    # we use a for loop to compute the gradient of every element.
+    for i in xrange(tensor_size):
+        # get one input element throw it's index i.
+        origin = tensor_to_check.get_float_element(i)
+
+        # add delta to it, run op and then get the sum of the result tensor.
+        x_pos = origin + delta
+        tensor_to_check.set_float_element(i, x_pos)
+        y_pos = get_output()
+
+        # plus delta to this element, run op and get the sum of the result tensor.
+        x_neg = origin - delta
+        tensor_to_check.set_float_element(i, x_neg)
+        y_neg = get_output()
+
+        # restore old value
+        tensor_to_check.set_float_element(i, origin)
+
+        # compute the gradient of this element and store it into a numpy array.
+        gradient_flat[i] = (y_pos - y_neg) / delta / 2
+
+    # reshape the gradient result to the shape of the source tensor.
+    return gradient_flat.reshape(tensor_to_check.get_dims())
+```
+
+## Auto Graident Checker Framework
+
+Each Operator Kernel has three kinds of Gradient:
+
+- 1. Numeric Gradient
+- 2. CPU Operator Gradient
+- 3. GPU Operator Gradient(if supported)
+
+Numeric Gradient Only relies on forward Operator. So we use Numeric Gradient as the reference value.
+
+- 1. calculate the numeric gradient.
+- 2. calculate CPU kernel Gradient with the backward Operator and compare it with the numeric gradient.
+- 3. calculate GPU kernel Gradient with the backward Operator and compare it with the numeric gradient.(if support GPU)
+
+#### Python Interface
+
+```python
+    def check_grad(self,
+                   forward_op,
+                   input_vars,
+                   inputs_to_check,
+                   output_name,
+                   no_grad_set=None,
+                   only_cpu=False,
+                   max_relative_error=0.005):
+        """
+        :param forward_op: used to create backward_op
+        :param input_vars: numpy value of input variable. The following
+            computation will use these variables.
+        :param inputs_to_check: inputs var names that should check gradient.
+        :param output_name: output name that used to
+        :param max_relative_error: The relative tolerance parameter.
+        :param no_grad_set: used when create backward ops
+        :param only_cpu: only compute and check gradient on cpu kernel.
+        :return:
+        """
+```
+
+### How to check if two numpy array is close enough?
+if `abs_numeric_grad` is nearly zero, then use abs error for numeric_grad, not relative
+
+```python
+numeric_grad = ...
+operator_grad = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
+
+abs_numeric_grad = numpy.abs(numeric_grad)
+# if abs_numeric_grad is nearly zero, then use abs error for numeric_grad, not relative
+# error.
+abs_numeric_grad[abs_numeric_grad < 1e-3] = 1
+
+diff_mat = numpy.abs(abs_numeric_grad - operator_grad) / abs_numeric_grad
+max_diff = numpy.max(diff_mat)
+```
+
+
+#### Notes：
+1，The Input data for auto gradient checker should be reasonable to avoid numeric problem.
+
+
+#### Refs:
+
+- [Gradient checking and advanced optimization(en)](http://deeplearning.stanford.edu/wiki/index.php/Gradient_checking_and_advanced_optimization)
+- [Gradient checking and advanced optimization(cn)](http://ufldl.stanford.edu/wiki/index.php/%E6%A2%AF%E5%BA%A6%E6%A3%80%E9%AA%8C%E4%B8%8E%E9%AB%98%E7%BA%A7%E4%BC%98%E5%8C%96)
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,22 +13,18 @@
 # serve to show the default.
 import sys
 import os, subprocess
+sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-try:
-   import py_paddle
-   import paddle
-   import paddle.v2
-except ImportError:
-   print("Must install paddle python package before generating documentation")
-   sys.exit(1)
+import paddle
+import paddle.v2

 MarkdownParser = parser.CommonMarkParser
 AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
+templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]

 # -- General configuration ------------------------------------------------

@@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['@PROJ_ROOT@/doc_theme/static']
+html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']

 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'

--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,15 +13,11 @@
 # serve to show the default.
 import sys
 import os, subprocess
+sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
-try:
-   import py_paddle
-   import paddle
-   import paddle.v2
-except ImportError:
-   print("Must install paddle python package before generating documentation")
-   sys.exit(1)
+import paddle
+import paddle.v2


 MarkdownParser = parser.CommonMarkParser
@@ -29,7 +25,7 @@ AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
+templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]

 # -- General configuration ------------------------------------------------

@@ -124,7 +120,7 @@ html_theme = 'sphinx_rtd_theme'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['@PROJ_ROOT@/doc_theme/static']
+html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']

 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'

--- a/go/glide.lock
+++ b/go/glide.lock
 hash: 1b9b07408ca7fac27a374dc2ccd2433e4bff090484008a037df967284949a582
-updated: 2017-08-03T21:46:51.744995189Z
+updated: 2017-08-07T23:37:48.867469328Z
 imports:
 - name: github.com/beorn7/perks
  version: 4c0e84591b9aa9e6dcfdf3e020114cd81f89d5f9
@@ -10,7 +10,7 @@ imports:
 - name: github.com/cockroachdb/cmux
  version: 112f0506e7743d64a6eb8fedbcff13d9979bbf92
 - name: github.com/coreos/etcd
-  version: c31bec0f29facff13f7c3e3d948e55dd6689ed42
+  version: d0d1a87aa96ae14914751d42264262cb69eda170
  subpackages:
  - alarm
  - auth
@@ -24,6 +24,7 @@ imports:
  - error
  - etcdserver
  - etcdserver/api
+  - etcdserver/api/etcdhttp
  - etcdserver/api/v2http
  - etcdserver/api/v2http/httptypes
  - etcdserver/api/v3client
@@ -210,11 +211,6 @@ testImports:
  version: 04cdfd42973bb9c8589fd6a731800cf222fde1a9
  subpackages:
  - spew
- name: github.com/docker/docker
-  version: b6d164e6c46d8115b146e4c3ac93784e9ef8b49e
-  subpackages:
-  - pkg/ioutils
-  - pkg/longpath
 - name: github.com/pmezard/go-difflib
  version: d8ed2627bdf02c080bf22230dbb337003b7aba2d
  subpackages:

--- a/go/master/service_test.go
+++ b/go/master/service_test.go
 package master_test

 import (
+	"io/ioutil"
+	"net/url"
 	"os"
+	"strings"
 	"testing"
 	"time"

 	"github.com/PaddlePaddle/Paddle/go/master"
 	"github.com/coreos/etcd/clientv3"
 	"github.com/coreos/etcd/embed"
-	"github.com/docker/docker/pkg/ioutils"
 	"github.com/stretchr/testify/assert"
 )

 func TestNewServiceWithEtcd(t *testing.T) {
 	// setup an embed etcd server
-	etcdDir, err := ioutils.TempDir("", "")
+	etcdDir, err := ioutil.TempDir("", "")
 	if err != nil {
 		t.Fatal(err)
 	}
 	cfg := embed.NewConfig()
+	lpurl, _ := url.Parse("http://localhost:0")
+	lcurl, _ := url.Parse("http://localhost:0")
+	cfg.LPUrls = []url.URL{*lpurl}
+	cfg.LCUrls = []url.URL{*lcurl}
 	cfg.Dir = etcdDir
 	e, err := embed.StartEtcd(cfg)
 	if err != nil {
@@ -30,15 +36,13 @@ func TestNewServiceWithEtcd(t *testing.T) {
 			t.Fatal(err)
 		}
 	}()
-	select {
-	case <-e.Server.ReadyNotify():
-		t.Log("Server is ready!")
-	case <-time.After(60 * time.Second):
-		e.Server.Stop() // trigger a shutdown
-		t.Fatal("Server took too long to start!")
-	}

-	ep := []string{"127.0.0.1:2379"}
+	<-e.Server.ReadyNotify()
+
+	port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
+	endpoint := "127.0.0.1:" + port
+
+	ep := []string{endpoint}
 	masterAddr := "127.0.0.1:3306"
 	store, err := master.NewEtcdClient(ep, masterAddr, master.DefaultLockPath, master.DefaultAddrPath, master.DefaultStatePath, 30)
 	if err != nil {

--- a/go/pserver/client/c/cclient.go
+++ b/go/pserver/client/c/cclient.go
@@ -90,8 +90,12 @@ func cArrayToSlice(p unsafe.Pointer, len int) []byte {

 type selector bool

-func (s selector) Select() bool {
-	return bool(s)
+func (s selector) Select() (bool, error) {
+	return bool(s), nil
+}
+
+func (s selector) Done() error {
+	return nil
 }

 type lister []client.Server
@@ -114,11 +118,10 @@ func paddle_new_pserver_client(addrs *C.char, selected int) C.paddle_pserver_cli
 }

 //export paddle_new_etcd_pserver_client
-func paddle_new_etcd_pserver_client(etcdEndpoints *C.char, selected int) C.paddle_pserver_client {
-	// TODO(Longfei: use etcd lock to decide which trainer to initialize the parameters)
+func paddle_new_etcd_pserver_client(etcdEndpoints *C.char) C.paddle_pserver_client {
 	addr := C.GoString(etcdEndpoints)
 	etcdClient := client.NewEtcd(addr)
-	c := client.NewClient(etcdClient, etcdClient.Desired(), selector(selected != 0))
+	c := client.NewClient(etcdClient, etcdClient.Desired(), etcdClient)
 	return add(c)
 }

@@ -136,7 +139,12 @@ func paddle_pserver_client_release(client C.paddle_pserver_client) {
 //export paddle_begin_init_params
 func paddle_begin_init_params(client C.paddle_pserver_client) C.int {
 	c := get(client)
-	if selected := c.BeginInitParams(); selected {
+	selected, err := c.BeginInitParams()
+	if err != nil {
+		panic(err)
+	}
+
+	if selected {
 		return 1
 	}
 	return 0

--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
@@ -17,12 +17,10 @@ def main():
    # network config
    x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
    y_predict = paddle.layer.fc(input=x,
-                                param_attr=paddle.attr.Param(
-                                    name='w', learning_rate=1e-3),
+                                param_attr=paddle.attr.Param(name='w'),
                                size=1,
                                act=paddle.activation.Linear(),
-                                bias_attr=paddle.attr.Param(
-                                    name='b', learning_rate=1e-3))
+                                bias_attr=paddle.attr.Param(name='b'))
    y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
    cost = paddle.layer.mse_cost(input=y_predict, label=y)


--- a/go/pserver/client/client.go
+++ b/go/pserver/client/client.go
@@ -27,9 +27,13 @@ import (

 // TODO(helin): add RPC call retry logic

-// Selector selects if the client should initialize parameter servers.
+// Selector selects if the client should initialize parameters and
+// reports the initialization process done.
 type Selector interface {
-	Select() bool
+	// Select selects if the client should initialize parameter servers.
+	Select() (bool, error)
+	// Done indicates the initialization process is done.
+	Done() error
 }

 // Server is the identification of a parameter Server.
@@ -115,7 +119,7 @@ func (c *Client) monitorPservers(l Lister, pserverNum int) {
 // servers. Other trainers will be blocked until the initialization is
 // done, and they need to get the initialized parameters from
 // parameter servers using GetParams.
-func (c *Client) BeginInitParams() bool {
+func (c *Client) BeginInitParams() (bool, error) {
 	return c.sel.Select()
 }


--- a/go/pserver/client/client_test.go
+++ b/go/pserver/client/client_test.go
@@ -124,8 +124,12 @@ func initEtcdClient() {

 type selector bool

-func (s selector) Select() bool {
-	return bool(s)
+func (s selector) Select() (bool, error) {
+	return bool(s), nil
+}
+
+func (s selector) Done() error {
+	return nil
 }

 type lister []client.Server
@@ -135,7 +139,11 @@ func (l lister) List() []client.Server {
 }

 func testClient(t *testing.T, c *client.Client) {
-	selected := c.BeginInitParams()
+	selected, err := c.BeginInitParams()
+	if err != nil {
+		t.Fatal(err)
+	}
+
 	if !selected {
 		t.Fatal("should be selected.")
 	}

--- a/go/pserver/client/etcd_client.go
+++ b/go/pserver/client/etcd_client.go
@@ -16,53 +16,60 @@ package client

 import (
 	"context"
+	"errors"
+	"fmt"
 	"strconv"
 	"strings"
 	"time"

 	"github.com/PaddlePaddle/Paddle/go/pserver"
 	"github.com/coreos/etcd/clientv3"
+	"github.com/coreos/etcd/clientv3/concurrency"
 	log "github.com/sirupsen/logrus"
 )

 const (
 	defaultEtcdTimeout time.Duration = 5 * time.Second
+
+	initLockPath = "/init_ps/lock"
+	initDonePath = "/init_ps/done"
+	initDoneVal  = "1"
 )

-// EtcdClient is used by pserver client that is a part of trainer process.
+// Etcd is used by pserver client that is a part of trainer process.
 // TODO:
-// 1. add watcher to watch the change state of pservers)
-// 1. add etcd lock)
-type EtcdClient struct {
+// 1. add watcher to watch the change state of pservers.
+type Etcd struct {
 	client    *clientv3.Client
 	timeout   time.Duration
 	endpoints []string
+	lock      *concurrency.Mutex
 }

 // Desired read ps desired number from etcd.
-func (p *EtcdClient) Desired() int {
+func (e *Etcd) Desired() int {
 	var psDesired int
 	for {
-		ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
-		resp, err := p.client.Get(ctx, pserver.PsDesired)
+		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+		resp, err := e.client.Get(ctx, pserver.PsDesired)
 		cancel()
 		if err != nil {
 			log.Errorf("Get ps dresire number failed! recnnectiong..., %v", err)
-			time.Sleep(p.timeout)
+			time.Sleep(e.timeout)
 			continue
 		}

 		kvs := resp.Kvs
 		if len(kvs) == 0 {
 			log.Infoln("Waiting for ps desired registered ...")
-			time.Sleep(p.timeout)
+			time.Sleep(e.timeout)
 			continue
 		}

 		psDesired, err = strconv.Atoi(string(resp.Kvs[0].Value))
 		if err != nil {
 			log.Errorf("psDesired %d invalid %v", psDesired, err)
-			time.Sleep(p.timeout)
+			time.Sleep(e.timeout)
 			continue
 		}

@@ -73,26 +80,26 @@ func (p *EtcdClient) Desired() int {
 }

 // List return the pserver list read from etcd.
-func (p *EtcdClient) List() []Server {
-	psDesired := p.Desired()
+func (e *Etcd) List() []Server {
+	psDesired := e.Desired()

 	servers := make([]Server, psDesired)
 	for {
 		for i := 0; i < psDesired; i++ {
-			ctx, cancel := context.WithTimeout(context.Background(), p.timeout)
+			ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
 			psKey := pserver.PsPath + strconv.Itoa(i)
 			log.Debugf("checking %s", psKey)
-			resp, err := p.client.Get(ctx, psKey)
+			resp, err := e.client.Get(ctx, psKey)
 			cancel()
 			if err != nil {
 				log.Infof("Get psKey= %s error, %v", psKey, err)
-				time.Sleep(p.timeout)
+				time.Sleep(e.timeout)
 				continue
 			}
 			kvs := resp.Kvs
 			if len(kvs) == 0 {
 				log.Infof("Waiting for ps addr registered ...")
-				time.Sleep(p.timeout)
+				time.Sleep(e.timeout)
 				continue
 			}

@@ -100,7 +107,7 @@ func (p *EtcdClient) List() []Server {
 			// TODO(Longfei) check the ps address
 			if psAddr == "" {
 				log.Infof("Get psKey = %s, psAddr is empty", psKey)
-				time.Sleep(p.timeout)
+				time.Sleep(e.timeout)
 				continue
 			}
 			log.Debugf("got value (%s) for key: %s", psAddr, psKey)
@@ -113,7 +120,7 @@ func (p *EtcdClient) List() []Server {
 }

 // NewEtcd create a etcd client to return the state of pserver on etcd.
-func NewEtcd(endpoints string) *EtcdClient {
+func NewEtcd(endpoints string) *Etcd {
 	ep := strings.Split(endpoints, ",")
 	var cli *clientv3.Client
 	var err error
@@ -130,10 +137,118 @@ func NewEtcd(endpoints string) *EtcdClient {
 		break
 	}
 	log.Infof("Connected to etcd: %s\n", endpoints)
-	client := &EtcdClient{
+	client := &Etcd{
 		client:    cli,
 		timeout:   defaultEtcdTimeout,
 		endpoints: ep,
 	}
 	return client
 }
+
+// Select indicates if the current trainer is selected to initialize
+// the pserver parameters.
+func (e *Etcd) Select() (bool, error) {
+	sess, err := concurrency.NewSession(e.client, concurrency.WithTTL(5))
+	if err != nil {
+		return false, err
+	}
+
+	lock := concurrency.NewMutex(sess, initLockPath)
+	log.Infof("Trying to acquire lock at %s.", initLockPath)
+	// Do not use timeout context here, since we don't know how
+	// long does it take for other trainers to initialize the
+	// parameters.
+	err = lock.Lock(context.Background())
+	if err != nil {
+		return false, err
+	}
+	log.Infof("Successfully acquired lock at %s.", initLockPath)
+
+	get := clientv3.OpGet(initDonePath)
+	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+	tresp, err := e.client.Txn(ctx).If(lock.IsOwner()).Then(get).Commit()
+	cancel()
+	if err != nil {
+		return false, err
+	}
+
+	if !tresp.Succeeded {
+		return false, errors.New("no longer the owner of the lock")
+	}
+
+	resp := tresp.Responses[0].GetResponseRange()
+
+	if len(resp.Kvs) == 0 {
+		// Key value not set, select current trainer.
+		e.lock = lock
+		log.Infoln("Trainer selected.")
+		return true, nil
+	}
+
+	if string(resp.Kvs[0].Value) == initDoneVal {
+		log.Infoln("Initialization is already done.")
+		ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
+		err = lock.Unlock(ctx)
+		cancel()
+		if err != nil {
+			log.Errorln(err)
+		}
+		return false, nil
+	}
+
+	return false, fmt.Errorf("key %s have unexpected value: %v", initDonePath, resp.Kvs[0].Value)
+}
+
+// Done indicates the parameter initialization process is done.
+func (e *Etcd) Done() error {
+	if e.lock == nil {
+		return errors.New("lock is nil, Done called unexpectedly")
+	}
+
+	put := clientv3.OpPut(initDonePath, initDoneVal)
+	ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+	tresp, err := e.client.Txn(ctx).If(e.lock.IsOwner()).Then(put).Commit()
+	cancel()
+	if err != nil {
+		return err
+	}
+
+	if !tresp.Succeeded {
+		return errors.New("no longer the owner of the lock")
+	}
+
+	ctx, cancel = context.WithTimeout(context.Background(), e.timeout)
+	err = e.lock.Unlock(ctx)
+	cancel()
+	if err != nil {
+		log.Errorln(err)
+	} else {
+		e.lock = nil
+	}
+
+	return nil
+}
+
+// Close closes the etcd client.
+func (e *Etcd) Close() error {
+	var err error
+	if e.lock != nil {
+		ctx, cancel := context.WithTimeout(context.Background(), e.timeout)
+		err = e.lock.Unlock(ctx)
+		cancel()
+		if err == nil {
+			e.lock = nil
+		}
+	}
+
+	cErr := e.client.Close()
+	if cErr != nil {
+		if err != nil {
+			log.Errorln(cErr)
+			return err
+		}
+		return cErr
+	}
+
+	return err
+}
--- a/go/pserver/client/etcd_client_test.go
+++ b/go/pserver/client/etcd_client_test.go
+package client_test
+
+import (
+	"io/ioutil"
+	"net/url"
+	"os"
+	"strings"
+	"sync"
+	"testing"
+
+	"github.com/PaddlePaddle/Paddle/go/pserver/client"
+	"github.com/coreos/etcd/embed"
+)
+
+func TestSelector(t *testing.T) {
+	etcdDir, err := ioutil.TempDir("", "")
+	if err != nil {
+		t.Fatal(err)
+	}
+	cfg := embed.NewConfig()
+	lpurl, _ := url.Parse("http://localhost:0")
+	lcurl, _ := url.Parse("http://localhost:0")
+	cfg.LPUrls = []url.URL{*lpurl}
+	cfg.LCUrls = []url.URL{*lcurl}
+	cfg.Dir = etcdDir
+	e, err := embed.StartEtcd(cfg)
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	defer func() {
+		e.Close()
+		if err := os.RemoveAll(etcdDir); err != nil {
+			t.Fatal(err)
+		}
+	}()
+
+	<-e.Server.ReadyNotify()
+
+	port := strings.Split(e.Clients[0].Addr().String(), ":")[1]
+	endpoint := "127.0.0.1:" + port
+
+	var mu sync.Mutex
+	selectedCount := 0
+	var wg sync.WaitGroup
+	selectAndDone := func(c *client.Etcd) {
+		defer wg.Done()
+
+		selected, err := c.Select()
+		if err != nil {
+			panic(err)
+		}
+
+		if selected {
+			mu.Lock()
+			selectedCount++
+			mu.Unlock()
+			err = c.Done()
+			if err != nil {
+				t.Fatal(err)
+			}
+		}
+	}
+
+	c0 := client.NewEtcd(endpoint)
+	c1 := client.NewEtcd(endpoint)
+	c2 := client.NewEtcd(endpoint)
+	c3 := client.NewEtcd(endpoint)
+	wg.Add(3)
+	go selectAndDone(c0)
+	go selectAndDone(c1)
+	go selectAndDone(c2)
+	wg.Wait()
+
+	// simulate trainer crashed and restarted after the
+	// initialization process.
+	wg.Add(1)
+	go selectAndDone(c3)
+	wg.Wait()
+
+	mu.Lock()
+	if selectedCount != 1 {
+		t.Fatal("selected count wrong:", selectedCount)
+	}
+	mu.Unlock()
+
+	err = c0.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = c1.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = c2.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+
+	err = c3.Close()
+	if err != nil {
+		t.Fatal(err)
+	}
+}
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -19,9 +19,9 @@ add_library(paddle_api STATIC ${API_SOURCES})
 add_dependencies(paddle_api paddle_proto paddle_trainer_lib)

 INCLUDE(${SWIG_USE_FILE})
-INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
+INCLUDE_DIRECTORIES(${PADDLE_SOURCE_DIR}/paddle)

-FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
+FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)

 SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)

@@ -79,16 +79,16 @@ SWIG_LINK_LIBRARIES(swig_paddle
    ${START_END}
 )

-add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
    COMMAND ${CMAKE_COMMAND} -E touch .timestamp
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
    DEPENDS _swig_paddle
 )

 # TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)

 if(WITH_TESTING)
    IF(NOT PY_PIP_FOUND)

--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -41,7 +41,7 @@ ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
      config->m->getConfig(), pserverSpec, useEtcd));
  return updater;
 #else
-  throw UnsupportError();
+  throw UnsupportError("not compiled with WITH_GOLANG");
 #endif
 }


--- a/paddle/capi/Arguments.cpp
+++ b/paddle/capi/Arguments.cpp
@@ -90,6 +90,18 @@ paddle_error paddle_arguments_set_ids(paddle_arguments args,
  return kPD_NO_ERROR;
 }

+paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
+                                              uint64_t ID,
+                                              uint64_t frameHeight,
+                                              uint64_t frameWidth) {
+  if (args == nullptr) return kPD_NULLPTR;
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  a->args[ID].setFrameHeight(frameHeight);
+  a->args[ID].setFrameWidth(frameWidth);
+  return kPD_NO_ERROR;
+}
+
 paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args,
                                                     uint64_t ID,
                                                     uint32_t nestedLevel,

--- a/paddle/capi/arguments.h
+++ b/paddle/capi/arguments.h
@@ -111,6 +111,20 @@ PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args,
                                             uint64_t ID,
                                             paddle_ivector ids);

+/**
+ * @brief paddle_arguments_set_frame_shape Set the fram size of one argument
+ *        in array, which index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [in] frameHeight maximum height of input images
+ * @param [in] frameWidth maximum width of input images
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
+                                                     uint64_t ID,
+                                                     uint64_t frameHeight,
+                                                     uint64_t frameWidth);
+
 /**
 * @brief PDArgsSetSequenceStartPos Set sequence start position vector of one
 *        argument in array, which index is `ID`.

--- a/paddle/capi/examples/model_inference/common/common.h
+++ b/paddle/capi/examples/model_inference/common/common.h
@@ -3,18 +3,21 @@
 #include <stdio.h>
 #include <stdlib.h>

-#define CHECK(stmt)                                                \
-  do {                                                             \
-    paddle_error __err__ = stmt;                                   \
-    if (__err__ != kPD_NO_ERROR) {                                 \
-      fprintf(stderr, "Invoke paddle error %d \n" #stmt, __err__); \
-      exit(__err__);                                               \
-    }                                                              \
+#define CHECK(stmt)                                                      \
+  do {                                                                   \
+    paddle_error __err__ = stmt;                                         \
+    if (__err__ != kPD_NO_ERROR) {                                       \
+      fprintf(stderr, "Invoke paddle error %d in " #stmt "\n", __err__); \
+      exit(__err__);                                                     \
+    }                                                                    \
  } while (0)

 void* read_config(const char* filename, long* size) {
  FILE* file = fopen(filename, "r");
-  if (file == NULL) return NULL;
+  if (file == NULL) {
+    fprintf(stderr, "Open %s error\n", filename);
+    return NULL;
+  }
  fseek(file, 0L, SEEK_END);
  *size = ftell(file);
  fseek(file, 0L, SEEK_SET);

--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
@@ -54,6 +54,31 @@ paddle_error paddle_gradient_machine_create_for_inference(
  return kPD_NO_ERROR;
 }

+paddle_error paddle_gradient_machine_create_for_inference_with_parameters(
+    paddle_gradient_machine* machine, void* mergedModel, uint64_t size) {
+  if (mergedModel == nullptr) return kPD_NULLPTR;
+  std::istringstream is(std::string(static_cast<char*>(mergedModel), size));
+  int64_t modelConfigSize = 0;
+  is.read((char*)(&modelConfigSize), sizeof(modelConfigSize));
+  std::string modelConfigProtobuf;
+  modelConfigProtobuf.resize(modelConfigSize);
+  is.read(&modelConfigProtobuf[0], modelConfigSize);
+  paddle::TrainerConfig config;
+  if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+  auto ptr = new paddle::capi::CGradientMachine();
+  ptr->machine.reset(paddle::GradientMachine::create(
+      config.model_config(), CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
+  std::vector<paddle::ParameterPtr>& parameters = ptr->machine->getParameters();
+  for (auto& para : parameters) {
+    para->load(is);
+  }
+
+  *machine = ptr;
+  return kPD_NO_ERROR;
+}
+
 paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
  delete cast(machine);
  return kPD_NO_ERROR;

--- a/paddle/capi/gradient_machine.h
+++ b/paddle/capi/gradient_machine.h
@@ -36,6 +36,18 @@ typedef void* paddle_gradient_machine;
 PD_API paddle_error paddle_gradient_machine_create_for_inference(
    paddle_gradient_machine* machine, void* modelConfigProtobuf, int size);

+/**
+ * @brief Create a gradient machine used for model inference, using config with
+ *        parameters which is generated by `paddle merge_model`.
+ * @param [out] machine that used for model inference.
+ * @param [in] mergedModel
+ * @param [in] size
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_create_for_inference_with_parameters(
+    paddle_gradient_machine* machine, void* mergedModel, uint64_t size);
+
 /**
 * @brief Load parameter from disk.
 * @param machine Gradient Machine.

--- a/paddle/capi/tests/CMakeLists.txt
+++ b/paddle/capi/tests/CMakeLists.txt
@@ -10,5 +10,5 @@ target_include_directories(capi_test_gradientMachine PUBLIC
  ${PADDLE_CAPI_INC_PATH})
 target_link_libraries(capi_test_gradientMachine paddle_capi)
 add_test(NAME capi_test_gradientMachine
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
-  WORKING_DIRECTORY ${PROJ_ROOT}/paddle/capi/tests)
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests)
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -7,6 +7,9 @@ cc_library(tensor SRCS tensor.cc DEPS ddim place paddle_memory device_context)
 cc_test(tensor_test SRCS tensor_test.cc DEPS tensor)
 cc_test(eigen_test SRCS eigen_test.cc DEPS tensor)

+cc_library(lod_tensor SRCS lod_tensor.cc details/lod_tensor.cc DEPS ddim place tensor)
+cc_test(lod_tensor_test SRCS lod_tensor_test.cc DEPS lod_tensor)
+
 cc_test(variable_test SRCS variable_test.cc)

 cc_library(scope SRCS scope.cc)
@@ -32,6 +35,11 @@ py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
+add_custom_command(TARGET framework_py_proto POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto
+    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto/
+    COMMENT "Copy generated python proto into directory paddle/v2/framework/proto."
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})

 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward)
@@ -40,11 +48,16 @@ if(WITH_PYTHON)
 cc_library(paddle_pybind SHARED
    SRCS pybind.cc
    DEPS pybind python backward
-	fc_op
-	sgd_op
-	add_op
-	mean_op
-	cross_entropy_op
-	fill_zeros_like_op
-	recurrent_op)
+    sgd_op
+    add_op
+    mul_op
+    rowwise_add_op
+    sigmoid_op
+    softmax_op
+    mean_op
+    cross_entropy_op
+    recurrent_op
+    uniform_random_op
+    gaussian_random_op
+    fill_zeros_like_op)
 endif(WITH_PYTHON)
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -14,7 +14,6 @@ limitations under the License. */

 #pragma once

-#include <boost/variant.hpp>
 #include <functional>
 #include <string>
 #include <unordered_map>
@@ -24,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/attribute.pb.h"
 #include "paddle/framework/op_desc.pb.h"
 #include "paddle/platform/enforce.h"
+#include "paddle/platform/variant.h"

 namespace paddle {
 namespace framework {

--- a/paddle/framework/backward.cc
+++ b/paddle/framework/backward.cc
@@ -13,6 +13,7 @@
   limitations under the License. */

 #include "paddle/framework/backward.h"
+
 #include <list>
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
@@ -132,8 +133,9 @@ std::shared_ptr<OperatorBase> BackwardRecursive(
    std::shared_ptr<OperatorBase> grad_op = OpRegistry::CreateGradOp(forwardOp);
    for (std::string& grad_input : grad_op->inputs_) {
      if (no_grad_names.count(grad_input)) {
-        std::string prefix =
-            grad_input.substr(0, grad_input.size() - kGradVarSuffix.size());
+        // +1 for \0
+        std::string prefix = grad_input.substr(
+            0, grad_input.size() - sizeof(kGradVarSuffix) / sizeof(char) + 1);
        grad_input = prefix + kZeroVarSuffix;

        // If part of input gradient of that operator is not calculated, fill
@@ -166,7 +168,7 @@ std::shared_ptr<OperatorBase> Backward(
  std::unordered_set<std::string> no_grad_names;
  no_grad_names.reserve(no_grad_vars.size());

-  no_grad_names.insert(kEmptyVarName + kGradVarSuffix);
+  no_grad_names.insert(std::string(kEmptyVarName) + kGradVarSuffix);

  for (auto& name : no_grad_vars) {
    no_grad_names.insert(name + kGradVarSuffix);

--- a/paddle/framework/backward_test.cc
+++ b/paddle/framework/backward_test.cc
@@ -17,16 +17,23 @@
 #include <gtest/gtest.h>
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
-#include "paddle/operators/type_alias.h"

 namespace paddle {
 namespace framework {

+using OperatorBase = framework::OperatorBase;
+using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
+using OpProto = framework::OpProto;
+using OpAttrChecker = framework::OpAttrChecker;
+using Scope = framework::Scope;
+using DeviceContext = platform::DeviceContext;
+
 class EmptyOp : public OperatorBase {
 public:
+  DEFINE_OPERATOR_CTOR(EmptyOp, OperatorBase)
+
  void InferShape(const Scope &scope) const override {}
-  void Run(const Scope &scope,
-           const platform::DeviceContext &dev_ctx) const override {}
+  void Run(const Scope &scope, const DeviceContext &dev_ctx) const override {}
 };

 class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
@@ -71,7 +78,7 @@ class NoGradOpMaker : public OpProtoAndCheckerMaker {
  }
 };

-class FcOp : public ops::NetOp {
+class FcOp : public operators::NetOp {
 public:
  void Init() override {
    AddOp(OpRegistry::CreateOp("mul", {Input("X"), Input("W")},
@@ -143,6 +150,7 @@ class AddOpMaker : public OpProtoAndCheckerMaker {
 }  // namespace paddle

 namespace f = paddle::framework;
+namespace ops = paddle::operators;
 using EnforceNotMet = paddle::platform::EnforceNotMet;
 REGISTER_OP(rowwise_add, f::EmptyOp, f::RowWiseAddOpMaker);
 REGISTER_GRADIENT_OP(rowwise_add, rowwise_add_grad, f::EmptyOp);
@@ -165,10 +173,10 @@ TEST(Backward, simple_op_grad) {
  ASSERT_EQ(4UL, gop->inputs_.size());
  ASSERT_EQ(f::kEmptyVarName, gop->inputs_[0]);
  ASSERT_EQ("rowwise_add_grad", gop->type_);
-  ASSERT_EQ("X" + f::kGradVarSuffix, gop->outputs_[0]);
-  ASSERT_EQ("b" + f::kGradVarSuffix, gop->outputs_[1]);
+  ASSERT_EQ(f::GradVarName("X"), gop->outputs_[0]);
+  ASSERT_EQ(f::GradVarName("b"), gop->outputs_[1]);

-  ASSERT_EQ("X" + f::kGradVarSuffix, gop->Output("X" + f::kGradVarSuffix));
+  ASSERT_EQ(f::GradVarName("X"), gop->Output(f::GradVarName("X")));
 }

 TEST(Backward, simple_op_not_need_grad) {
@@ -176,7 +184,7 @@ TEST(Backward, simple_op_not_need_grad) {
  ASSERT_NE(fwd, nullptr);
  auto gop = f::Backward(*fwd, {"X"});
  ASSERT_EQ(std::find(gop->outputs_.begin(), gop->outputs_.end(),
-                      "X" + f::kGradVarSuffix),
+                      f::GradVarName("X")),
            gop->outputs_.end());

  auto no_input_gop = f::Backward(*fwd, {"X", "b"});
@@ -244,18 +252,18 @@ TEST(Backward, net_input_of_network_not_need_grad) {
  all_output.erase(f::kEmptyVarName);

  for (auto &out : {"W1", "b1", "hidden0", "W2", "b2"}) {
-    ASSERT_NE(all_output.find(out + f::kGradVarSuffix), all_output.end());
+    ASSERT_NE(all_output.find(f::GradVarName(out)), all_output.end());
  }

  // Not Generated X
-  ASSERT_EQ(all_output.find("X" + f::kGradVarSuffix), all_output.end());
+  ASSERT_EQ(all_output.find(f::GradVarName("X")), all_output.end());

  ASSERT_EQ(2UL, bwd_net->ops_.size());
  ASSERT_TRUE(bwd_net->ops_[1]->IsNetOp());
  auto first_fc_grad = static_cast<ops::NetOp *>(bwd_net->ops_[1].get());
  ASSERT_EQ(3UL, first_fc_grad->ops_.size());
  ASSERT_EQ(f::kEmptyVarName,
-            first_fc_grad->ops_[2]->Output("A" + f::kGradVarSuffix));
+            first_fc_grad->ops_[2]->Output(f::GradVarName("A")));
 }

 TEST(Backward, net_shared_weight) {
@@ -307,15 +315,15 @@ TEST(Backward, op_part_of_output_are_not_need) {
  ASSERT_EQ(1UL, fill_zero.inputs_.size());
  ASSERT_EQ("Z", fill_zero.inputs_[0]);
  ASSERT_EQ(1UL, fill_zero.outputs_.size());
-  ASSERT_EQ("Z" + f::kZeroVarSuffix, fill_zero.outputs_[0]);
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix, fill_zero.outputs_[0]);

  auto &d_many_out = *net->ops_[1];
  ASSERT_EQ("many_output_op_grad", d_many_out.type_);
  ASSERT_EQ(1UL + 2UL + 2UL, d_many_out.inputs_.size());  // I/O/OG
-  ASSERT_EQ("Z" + f::kZeroVarSuffix, d_many_out.Input("z" + f::kGradVarSuffix));
-  ASSERT_EQ("Y" + f::kGradVarSuffix, d_many_out.Input("y" + f::kGradVarSuffix));
-  ASSERT_EQ("X" + f::kGradVarSuffix,
-            d_many_out.Output("x" + f::kGradVarSuffix));
+  ASSERT_EQ(std::string("Z") + f::kZeroVarSuffix,
+            d_many_out.Input(f::GradVarName("z")));
+  ASSERT_EQ(f::GradVarName("Y"), d_many_out.Input(f::GradVarName("y")));
+  ASSERT_EQ(f::GradVarName("X"), d_many_out.Output(f::GradVarName("x")));
 }

 TEST(Backward, op_part_of_input_are_not_need) {
@@ -325,10 +333,9 @@ TEST(Backward, op_part_of_input_are_not_need) {
  ASSERT_EQ(grad_mul.type_, "mul_grad");
  ASSERT_EQ(grad_mul.inputs_.size(), 2UL + 1UL + 1UL);
  ASSERT_EQ(grad_mul.outputs_.size(), 2UL);
-  ASSERT_EQ(grad_mul.Output("A" + f::kGradVarSuffix), f::kEmptyVarName);
-  ASSERT_EQ(grad_mul.Output("B" + f::kGradVarSuffix), "b" + f::kGradVarSuffix);
-  ASSERT_EQ(grad_mul.Input("Out" + f::kGradVarSuffix),
-            "out" + f::kGradVarSuffix);
+  ASSERT_EQ(grad_mul.Output(f::GradVarName("A")), f::kEmptyVarName);
+  ASSERT_EQ(grad_mul.Output(f::GradVarName("B")), f::GradVarName("b"));
+  ASSERT_EQ(grad_mul.Input(f::GradVarName("Out")), f::GradVarName("out"));
  ASSERT_EQ(grad_mul.Input("A"), "a");
  ASSERT_EQ(grad_mul.Input("B"), "b");
  ASSERT_EQ(grad_mul.Input("Out"), "out");

--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -14,13 +14,12 @@ limitations under the License. */

 #pragma once

-#include <boost/variant.hpp>
 #include <initializer_list>
 #include <stdexcept>
 #include <vector>
 #include "paddle/framework/dim.h"
 #include "paddle/platform/enforce.h"
-#include "unsupported/Eigen/CXX11/Tensor"
+#include "paddle/platform/variant.h"

 namespace paddle {
 namespace framework {

--- a/paddle/framework/details/lod_tensor.cc
+++ b/paddle/framework/details/lod_tensor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/lod_tensor.h"
+
+#include <memory>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+using LOD = LODTensor::LOD;
+
+std::shared_ptr<LOD> SliceLOD(const LOD &lod, size_t level_begin,
+                              size_t level_end) {
+  auto new_lod = std::make_shared<LOD>();
+  new_lod->reserve(level_end - level_begin);
+  for (size_t i = level_begin; i < level_end; i++) {
+    new_lod->emplace_back(lod[i]);
+  }
+  return new_lod;
+}
+
+std::shared_ptr<LOD> SliceLOD(const LOD &lod, size_t level, size_t elem_begin,
+                              size_t elem_end, bool tensor_shared) {
+  // slice the lod.
+  auto new_lod = std::make_shared<LOD>();
+  new_lod->reserve(lod.size() - level);
+  auto start = lod.at(level)[elem_begin];
+  auto end = lod.at(level)[elem_end];
+
+  for (auto it = lod.begin() + level; it != lod.end(); it++) {
+    auto it_begin = std::find(it->begin(), it->end(), start);
+    auto it_end = std::find(it_begin, it->end(), end);
+    PADDLE_ENFORCE(it_begin != it->end(), "error in parsing lod info");
+    PADDLE_ENFORCE(it_end != it->end(), "error in parsing lod info");
+    new_lod->emplace_back(it_begin, it_end + 1);
+    if (!tensor_shared) {
+      // reset offset if tensor is copyed and sliced.
+      std::transform(new_lod->back().begin(), new_lod->back().end(),
+                     new_lod->back().begin(),
+                     [start](int v) { return v - start; });
+      PADDLE_ENFORCE(new_lod->back().front() == 0, "error in slice LOD");
+    }
+  }
+  return new_lod;
+}
+
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/details/lod_tensor.h
+++ b/paddle/framework/details/lod_tensor.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <memory>
+
+namespace paddle {
+namespace framework {
+namespace details {
+
+/*
+ * Slice levels from LOD.
+ *
+ * @lod: LOD to slice.
+ * @level_begin: level to begin slice.
+ * @level_end: level to end slice.
+ */
+std::shared_ptr<LODTensor::LOD> SliceLOD(const LODTensor::LOD &lod,
+                                         size_t level_begin, size_t level_end);
+
+/*
+ * Slice elements from a level of LOD.
+ *
+ * @lod: LOD to slice.
+ * @level: which level to slice.
+ * @elem_begin: element's index to begin slice.
+ * @elem_end: element's index to end slice.
+ */
+std::shared_ptr<LODTensor::LOD> SliceLOD(const LODTensor::LOD &lod,
+                                         size_t level, size_t elem_begin,
+                                         size_t elem_end, bool tensor_shared);
+}  // namespace details
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/grad_op_builder.cc
+++ b/paddle/framework/grad_op_builder.cc
@@ -19,45 +19,44 @@ permissions and limitations under the License. */
 namespace paddle {
 namespace framework {

-class OpRegistry;
-
-using VarIndexMap = std::unordered_map<std::string, int>;
+typedef std::vector<int> Ints;

 enum class OpArgType { IN, OUT };

-static std::vector<int>* GetOpFormat(OperatorBase* op, const OpArgType& type) {
-  std::string key = type == OpArgType::IN ? "input_format" : "output_format";
-  return op->attrs_.count(key)
-             ? &boost::get<std::vector<int>>(op->attrs_.at(key))
-             : nullptr;
+const Ints* AttrFormat(const AttributeMap& attrs, const std::string& key) {
+  return (attrs.count(key) > 0) ? &boost::get<Ints>(attrs.at(key)) : nullptr;
 }

-static const std::vector<int>* GetOpFormat(const OperatorBase* op,
-                                           const OpArgType& type) {
-  std::string key = type == OpArgType::IN ? "input_format" : "output_format";
-  return op->attrs_.count(key)
-             ? &boost::get<std::vector<int>>(op->attrs_.at(key))
-             : nullptr;
+Ints* AttrFormat(AttributeMap& attrs, const std::string& key) {
+  return (attrs.count(key) > 0) ? &boost::get<Ints>(attrs.at(key)) : nullptr;
 }

-static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op,
-                       const OpArgType& src_type, const OpArgType& dst_type,
+static void TransOpArg(const OperatorBase* src_op,
+                       std::vector<std::string>& grad_inputs,
+                       std::vector<std::string>& grad_outputs,
+                       AttributeMap& grad_attrs,
+                       std::unordered_map<std::string, int>& grad_idxs,
+                       const std::string& src_type, const std::string& dst_type,
                       int& idx, bool is_grad) {
  const std::vector<std::string>& src_inout =
-      src_type == OpArgType::IN ? src_op->inputs_ : src_op->outputs_;
-  const std::vector<int>* src_format = GetOpFormat(src_op, src_type);
+      (src_type == "input_format") ? src_op->inputs_ : src_op->outputs_;
+
+  const std::vector<int>* src_format = AttrFormat(src_op->Attrs(), src_type);

  std::vector<std::string>& dst_inout =
-      dst_type == OpArgType::IN ? dst_op->inputs_ : dst_op->outputs_;
-  std::vector<int>* dst_format = GetOpFormat(dst_op, dst_type);
+      (dst_type == "input_format") ? grad_inputs : grad_outputs;
+
+  std::vector<int>* dst_format = AttrFormat(grad_attrs, dst_type);
+
  const OpProto& proto = OpRegistry::protos().at(src_op->type_);
+
  const auto& src_arg_list =
-      src_type == OpArgType::IN ? proto.inputs() : proto.outputs();
+      (src_type == "input_format") ? proto.inputs() : proto.outputs();

  for (const auto& arg : src_arg_list) {
    std::string src_name = arg.name();
    std::string dst_name = is_grad ? src_name + kGradVarSuffix : src_name;
-    (*dst_op->in_out_idxs_)[dst_name] = idx++;
+    grad_idxs[dst_name] = idx++;
    int src_arg_idx = src_op->in_out_idxs_->at(src_name);
    int src_begin =
        src_format == nullptr ? src_arg_idx : src_format->at(src_arg_idx);
@@ -76,26 +75,42 @@ static void TransOpArg(const OperatorBase* src_op, OperatorBase* dst_op,
 }

 OperatorBase* BuildGradOp(const OperatorBase* op) {
-  std::string grad_op_type = OpRegistry::grad_ops().at(op->type_);
-  OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)();
-  grad_op->type_ = grad_op_type;
-  grad_op->attrs_ = op->attrs_;
-  grad_op->attrs_.erase("input_format");
-  grad_op->attrs_.erase("output_format");
-  if (GetOpFormat(op, OpArgType::IN) != nullptr) {
-    grad_op->attrs_["output_format"] = std::vector<int>({0});
+  const std::string& grad_op_type = OpRegistry::grad_ops().at(op->Type());
+
+  AttributeMap grad_attrs(op->Attrs());
+  grad_attrs.erase("input_format");
+  grad_attrs.erase("output_format");
+  if (op->Attrs().count("input_format") > 0) {
+    grad_attrs["output_format"] = std::vector<int>({0});
  }
-  if (GetOpFormat(op, OpArgType::IN) != nullptr ||
-      GetOpFormat(op, OpArgType::OUT) != nullptr) {
-    grad_op->attrs_["input_format"] = std::vector<int>({0});
+  if (op->Attrs().count("input_format") > 0 ||
+      op->Attrs().count("output_format") > 0) {
+    grad_attrs["input_format"] = std::vector<int>({0});
  }
-  grad_op->in_out_idxs_.reset(new VarIndexMap());
+
+  std::vector<std::string> grad_inputs, grad_outputs;
+
+  using VarIndexMap = std::unordered_map<std::string, int>;
+  VarIndexMap* grad_idxs = new VarIndexMap;
  int in_idx = 0;
  int out_idx = 0;
-  TransOpArg(op, grad_op, OpArgType::IN, OpArgType::IN, in_idx, false);   // I
-  TransOpArg(op, grad_op, OpArgType::OUT, OpArgType::IN, in_idx, false);  // G
-  TransOpArg(op, grad_op, OpArgType::OUT, OpArgType::IN, in_idx, true);   // OG
-  TransOpArg(op, grad_op, OpArgType::IN, OpArgType::OUT, out_idx, true);  // IG
+  TransOpArg(op, grad_inputs, grad_outputs, grad_attrs, *grad_idxs,
+             "input_format", "input_format", in_idx, false);  // I
+  TransOpArg(op, grad_inputs, grad_outputs, grad_attrs, *grad_idxs,
+             "output_format", "input_format", in_idx, false);  // G
+  TransOpArg(op, grad_inputs, grad_outputs, grad_attrs, *grad_idxs,
+             "output_format", "input_format", in_idx, true);  // OG
+  TransOpArg(op, grad_inputs, grad_outputs, grad_attrs, *grad_idxs,
+             "input_format", "output_format", out_idx, true);  // IG
+
+  OperatorBase* grad_op = OpRegistry::op_creators().at(grad_op_type)();
+
+  grad_op->type_ = grad_op_type;
+  grad_op->inputs_ = grad_inputs;
+  grad_op->outputs_ = grad_outputs;
+  grad_op->attrs_ = grad_attrs;
+  grad_op->in_out_idxs_.reset(grad_idxs);
+
  return grad_op;
 }


--- a/paddle/framework/grad_op_builder_test.cc
+++ b/paddle/framework/grad_op_builder_test.cc
@@ -10,6 +10,8 @@ namespace framework {

 class NOP : public OperatorBase {
 public:
+  DEFINE_OPERATOR_CTOR(NOP, OperatorBase)
+
  void InferShape(const Scope &scope) const override {}
  void Run(const Scope &scope,
           const platform::DeviceContext &dev_ctx) const override {}
@@ -83,21 +85,19 @@ TEST(GradOpBuilder, MutiInOut) {
  EXPECT_EQ(grad_test_op->Input("Out1"), "out1");
  EXPECT_EQ(grad_test_op->Inputs("Out2_mult"),
            std::vector<std::string>({"out2_1", "out2_2"}));
-  EXPECT_EQ(grad_test_op->Input("Out1" + f::kGradVarSuffix),
-            "out1" + f::kGradVarSuffix);
-  EXPECT_EQ(grad_test_op->Inputs("Out2_mult" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out1")),
+            f::GradVarName("out1"));
+  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out2_mult")),
            std::vector<std::string>(
-                {"out2_1" + f::kGradVarSuffix, "out2_2" + f::kGradVarSuffix}));
+                {f::GradVarName("out2_1"), f::GradVarName("out2_2")}));

  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
-  EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
-            "in1" + f::kGradVarSuffix);
-  EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
-            std::vector<std::string>({"in2_1" + f::kGradVarSuffix,
-                                      "in2_2" + f::kGradVarSuffix,
-                                      "in2_3" + f::kGradVarSuffix}));
-  EXPECT_EQ(grad_test_op->Output("In3" + f::kGradVarSuffix),
-            "in3" + f::kGradVarSuffix);
+  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
+  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
+            std::vector<std::string>({f::GradVarName("in2_1"),
+                                      f::GradVarName("in2_2"),
+                                      f::GradVarName("in2_3")}));
+  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In3")), f::GradVarName("in3"));
 }

 TEST(GradOpBuilder, IOIgnoredInGradient) {
@@ -119,19 +119,18 @@ TEST(GradOpBuilder, IOIgnoredInGradient) {
  EXPECT_EQ(grad_test_op->Inputs("Out1_mult"),
            std::vector<std::string>({"out1_1", "out1_2"}));
  EXPECT_EQ(grad_test_op->Input("Out2"), f::kEmptyVarName);
-  EXPECT_EQ(grad_test_op->Inputs("Out1_mult" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Inputs(f::GradVarName("Out1_mult")),
            std::vector<std::string>(
-                {"out1_1" + f::kGradVarSuffix, "out1_2" + f::kGradVarSuffix}));
-  EXPECT_EQ(grad_test_op->Input("Out2" + f::kGradVarSuffix),
-            "out2" + f::kGradVarSuffix);
+                {f::GradVarName("out1_1"), f::GradVarName("out1_2")}));
+  EXPECT_EQ(grad_test_op->Input(f::GradVarName("Out2")),
+            f::GradVarName("out2"));

  ASSERT_EQ(grad_test_op->outputs_.size(), 5UL);
-  EXPECT_EQ(grad_test_op->Output("In1" + f::kGradVarSuffix),
-            "in1" + f::kGradVarSuffix);
-  EXPECT_EQ(grad_test_op->Outputs("In2_mult" + f::kGradVarSuffix),
+  EXPECT_EQ(grad_test_op->Output(f::GradVarName("In1")), f::GradVarName("in1"));
+  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In2_mult")),
            std::vector<std::string>(
-                {"in2_1" + f::kGradVarSuffix, "in2_2" + f::kGradVarSuffix}));
-  EXPECT_EQ(grad_test_op->Outputs("In3_mult" + f::kGradVarSuffix),
+                {f::GradVarName("in2_1"), f::GradVarName("in2_2")}));
+  EXPECT_EQ(grad_test_op->Outputs(f::GradVarName("In3_mult")),
            std::vector<std::string>(
-                {"in3_1" + f::kGradVarSuffix, "in3_2" + f::kGradVarSuffix}));
+                {f::GradVarName("in3_1"), f::GradVarName("in3_2")}));
 }
--- a/paddle/framework/lod_tensor.cc
+++ b/paddle/framework/lod_tensor.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/framework/lod_tensor.h"
+
+#include <glog/logging.h>
+
+namespace paddle {
+namespace framework {
+
+LODTensor LODTensor::SliceShared(size_t level_begin, size_t level_end) const {
+  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
+  auto new_lod = details::SliceLOD(*lod_start_pos_, level_begin, level_end);
+  // slice levels just need to update LOD info, each level will contains the
+  // whole tensor_, so no need to modify tensor_.
+  return LODTensor(tensor_, new_lod);
+}
+
+LODTensor LODTensor::SliceShared(size_t level, size_t elem_begin,
+                                 size_t elem_end) const {
+  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
+  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                 NumLevels());
+  PADDLE_ENFORCE(elem_begin < NumElements(level),
+                 "element begin [%d] out of range [%d]", elem_begin,
+                 NumElements(level));
+  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
+                 "element end [%d] out of range [%d]", elem_end,
+                 NumElements(level));
+
+  auto new_lod = details::SliceLOD(*lod_start_pos_, level, elem_begin, elem_end,
+                                   true /*tensor_shared*/);
+
+  // slice elements just need to update LOD info, because offsets are not
+  // changed, so the original tensor_ can be reused.
+  return LODTensor(tensor_, new_lod);
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/lod_tensor.h
+++ b/paddle/framework/lod_tensor.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <memory>
+#if (!PADDLE_ONLY_CPU)
+#include <thrust/device_vector.h>
+#include <thrust/host_vector.h>
+#endif
+
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/enforce.h"
+
+namespace paddle {
+namespace framework {
+
+/*
+ * LODTensor (Level of details Tensor)
+ * see https://en.wikipedia.org/wiki/Level_of_details for reference.
+ */
+class LODTensor {
+ public:
+// Level save offsets of each unit.
+#ifdef PADDLE_ONLY_CPU
+  using Level = std::vector<size_t>;
+#else
+  using Level = thrust::device_vector<size_t>;
+#endif
+  // LOD stores offsets of each level of units, the largest units level first,
+  // then the smaller units level. Each Level stores the offsets of units in
+  // Tesor.
+  typedef std::vector<Level> LOD;
+
+  LODTensor() {}
+  LODTensor(const std::shared_ptr<Tensor> &tensor,
+            const std::shared_ptr<LOD> &lod) {
+    Reset(tensor, lod);
+  }
+
+  void Reset(const std::shared_ptr<Tensor> &tensor,
+             const std::shared_ptr<LOD> &lod) {
+    tensor_ = tensor;
+    lod_start_pos_ = lod;
+  }
+
+  /*
+   * Get a element from LOD.
+   */
+  size_t lod_element(size_t level, size_t elem) const {
+    PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                   NumLevels());
+    PADDLE_ENFORCE(elem < NumElements(level),
+                   "element begin [%d] out of range [%d]", elem,
+                   NumElements(level));
+    return (*lod_start_pos_)[level][elem];
+  }
+
+  /*
+   * Number of LODTensor's levels, each level has units of data, for example,
+   * in the sentence's view, article, paragraph, sentence are 3 levels.
+   */
+  size_t NumLevels() const {
+    return lod_start_pos_ ? lod_start_pos_->size() : 0UL;
+  }
+  /*
+   * Number of elements in a level.
+   */
+  size_t NumElements(size_t level = 0) const {
+    PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                   NumLevels());
+    // the last offset is the end of last element
+    return lod_start_pos_->at(level).size() - 1;
+  }
+
+  /*
+   * Slice of levels[level_begin:level_end], with tensor copied.
+   */
+  template <typename T>
+  LODTensor SliceCopied(size_t level_begin, size_t level_end,
+                        const platform::Place &dst_place) const;
+
+  /*
+   * Slice of levels[level_begin:level_end], with tensor shared.
+   */
+  LODTensor SliceShared(size_t level_begin, size_t level_end) const;
+
+  /*
+   * Slice of elements of a level, [elem_begin: elem_end], with tensor copied.
+   * @note: low performance in slice lod_start_pos_.
+   */
+  template <typename T>
+  LODTensor SliceCopied(size_t level, size_t elem_begin, size_t elem_end,
+                        const platform::Place &dst_place) const;
+
+  /*
+   * Slice of elements of a level, [elem_begin: elem_end], with tensor shared.
+   * @note: low performance in slice lod_start_pos_.
+   */
+  LODTensor SliceShared(size_t level, size_t elem_begin, size_t elem_end) const;
+
+  /*
+   * Copy other's lod_start_pos_, to share LOD info.
+   * @note: the LOD info should not be changed.
+   */
+  void ShareLOD(const LODTensor &other) {
+    lod_start_pos_ = other.lod_start_pos_;
+  }
+
+  /*
+   * Copy other's lod_start_pos_'s content, free to mutate.
+   */
+  void CopyLOD(const LODTensor &other) {
+    lod_start_pos_ = std::make_shared<LOD>(*other.lod_start_pos_);
+  }
+  /*
+   * Determine whether LODTensor has a valid LOD info.
+   */
+  bool HasLOD() const { return bool(lod_start_pos_); }
+  LOD *lod() const { return lod_start_pos_.get(); }
+
+  std::shared_ptr<Tensor> &tensor() { return tensor_; }
+  Tensor *raw_tensor() { return tensor_.get(); }
+
+ private:
+  std::shared_ptr<LOD> lod_start_pos_;
+  std::shared_ptr<Tensor> tensor_;
+};
+
+}  // namespace framework
+}  // namespace paddle
+
+#include "paddle/framework/lod_tensor_impl.h"
--- a/paddle/framework/lod_tensor_impl.h
+++ b/paddle/framework/lod_tensor_impl.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include "paddle/framework/details/lod_tensor.h"
+
+namespace paddle {
+namespace framework {
+
+template <typename T>
+LODTensor LODTensor::SliceCopied(size_t level_begin, size_t level_end,
+                                 const platform::Place &dst_place) const {
+  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
+  auto new_lod = details::SliceLOD(*lod_start_pos_, level_begin, level_end);
+  auto new_tensor = std::make_shared<Tensor>();
+  new_tensor->CopyFrom<T>(*tensor_, dst_place);
+
+  return LODTensor(new_tensor, new_lod);
+}
+
+template <typename T>
+LODTensor LODTensor::SliceCopied(size_t level, size_t elem_begin,
+                                 size_t elem_end,
+                                 const platform::Place &dst_place) const {
+  PADDLE_ENFORCE(HasLOD(), "has no LOD info, can't be sliced.");
+  PADDLE_ENFORCE(level < NumLevels(), "level [%d] out of range [%d]", level,
+                 NumLevels());
+  PADDLE_ENFORCE(elem_begin < NumElements(level),
+                 "element begin [%d] out of range [%d]", elem_begin,
+                 NumElements(level));
+  PADDLE_ENFORCE(elem_end < NumElements(level) + 1,
+                 "element end [%d] out of range [%d]", elem_end,
+                 NumElements(level));
+
+  auto new_lod = details::SliceLOD(*lod_start_pos_, level, elem_begin, elem_end,
+                                   false /*tensor_shared*/);
+
+  auto start_idx = new_lod->front().front();
+  auto end_idx = new_lod->front().back() - 1 /*the next element's start*/;
+  auto sliced_tensor = tensor_->Slice<T>(start_idx, end_idx);
+  auto new_tensor = std::make_shared<Tensor>();
+  new_tensor->CopyFrom<T>(sliced_tensor, dst_place);
+
+  return LODTensor(new_tensor, new_lod);
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/lod_tensor_test.cc
+++ b/paddle/framework/lod_tensor_test.cc
+/*
+  Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+  Licensed under the Apache License, Version 2.0 (the "License");
+  you may not use this file except in compliance with the License.
+  You may obtain a copy of the License at
+  http://www.apache.org/licenses/LICENSE-2.0
+  Unless required by applicable law or agreed to in writing, software
+  distributed under the License is distributed on an "AS IS" BASIS,
+  WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+  See the License for the specific language governing permissions and
+  limitations under the License.
+*/
+
+#include "paddle/framework/lod_tensor.h"
+
+#include <glog/logging.h>
+#include <gtest/gtest.h>
+#include <memory>
+
+namespace paddle {
+namespace framework {
+
+class LODTensorTester : public ::testing::Test {
+ public:
+  virtual void SetUp() override {
+    lod_tensor.reset(new LODTensor);
+    // tensor's batch_size: 30
+    // 3 levels
+    // 0 10 20
+    // 0 5 10 15 20
+    // 0 2 5 7 10 12 15 20
+    auto lod = std::make_shared<LODTensor::LOD>();
+    lod->push_back(std::vector<size_t>{0, 10, 20});
+    lod->push_back(std::vector<size_t>{0, 5, 10, 15, 20});
+    lod->push_back(std::vector<size_t>{0, 2, 5, 7, 10, 12, 15, 17, 20});
+
+    auto tensor = std::make_shared<Tensor>();
+    tensor->Resize({20 /*batch size*/, 128 /*dim*/});
+    // malloc memory
+    tensor->mutable_data<float>(place);
+
+    lod_tensor->Reset(tensor, lod);
+  }
+
+ protected:
+  std::unique_ptr<LODTensor> lod_tensor;
+  platform::CPUPlace place;
+};
+
+TEST_F(LODTensorTester, NumLevels) { ASSERT_EQ(lod_tensor->NumLevels(), 3UL); }
+
+TEST_F(LODTensorTester, NumElements) {
+  ASSERT_EQ(lod_tensor->NumElements(0), 2UL);
+  ASSERT_EQ(lod_tensor->NumElements(1), 4UL);
+  ASSERT_EQ(lod_tensor->NumElements(2), 8UL);
+}
+
+TEST_F(LODTensorTester, SliceShared_Level) {
+  // slice 1 level
+  for (size_t level = 0; level < 3UL; ++level) {
+    auto new_lod_tensor = lod_tensor->SliceShared(level, level + 1);
+    ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
+    ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level));
+    ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());
+  }
+  // slice 2 level
+  for (size_t level = 0; level < 2UL; ++level) {
+    auto new_lod_tensor = lod_tensor->SliceShared(level, level + 2);
+    ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
+    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level));
+    ASSERT_EQ(new_lod_tensor.NumElements(1),
+              lod_tensor->NumElements(level + 1));
+    ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());
+  }
+}
+
+TEST_F(LODTensorTester, SliceCopied_Level) {
+  // slice 1 level
+  for (size_t level = 0; level < 3UL; ++level) {
+    auto new_lod_tensor =
+        lod_tensor->SliceCopied<float>(level, level + 1, place);
+    ASSERT_EQ(new_lod_tensor.NumLevels(), 1UL);
+    ASSERT_EQ(new_lod_tensor.NumElements(0UL), lod_tensor->NumElements(level));
+    // ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());
+    // TODO(superjom) add tensor comparation here.
+  }
+  // slice 2 level
+  for (size_t level = 0; level < 2UL; ++level) {
+    auto new_lod_tensor =
+        lod_tensor->SliceCopied<float>(level, level + 2, place);
+    ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
+    ASSERT_EQ(new_lod_tensor.NumElements(0), lod_tensor->NumElements(level));
+    ASSERT_EQ(new_lod_tensor.NumElements(1),
+              lod_tensor->NumElements(level + 1));
+    // ASSERT_EQ(new_lod_tensor.tensor(), lod_tensor->tensor());
+    // TODO(superjom) add tensor comparation here.
+  }
+}
+
+TEST_F(LODTensorTester, SliceShared_Element) {
+  size_t level = 0;
+  auto new_lod_tensor = lod_tensor->SliceShared(level, 0, 2);
+  ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(2), 8UL);
+  ASSERT_EQ(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());
+
+  level = 1;
+  new_lod_tensor = lod_tensor->SliceShared(level, 0, 2);
+  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());
+}
+
+TEST_F(LODTensorTester, SliceCopied_Element) {
+  size_t level = 0;
+  auto new_lod_tensor = lod_tensor->SliceCopied<float>(level, 0, 2, place);
+  ASSERT_EQ(new_lod_tensor.NumLevels(), 3UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(2), 8UL);
+  ASSERT_NE(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());
+
+  level = 1;
+  new_lod_tensor = lod_tensor->SliceCopied<float>(level, 0, 2, place);
+  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+  ASSERT_NE(new_lod_tensor.raw_tensor(), lod_tensor->raw_tensor());
+
+  level = 1;
+  // LOD is
+  //    0 5 10
+  //    0 2 5 7 10
+  new_lod_tensor = lod_tensor->SliceCopied<float>(level, 1, 3, place);
+  ASSERT_EQ(new_lod_tensor.NumLevels(), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(0), 2UL);
+  ASSERT_EQ(new_lod_tensor.NumElements(1), 4UL);
+
+  ASSERT_EQ(new_lod_tensor.lod_element(0, 0), 0UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(0, 1), 5UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(1, 0), 0UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(1, 1), 2UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(1, 2), 5UL);
+  ASSERT_EQ(new_lod_tensor.lod_element(1, 3), 7UL);
+
+  // TODO(superjom) compare the content of these tensors
+}
+
+TEST_F(LODTensorTester, ShareLOD) {
+  LODTensor new_lod_tensor;
+  new_lod_tensor.ShareLOD(*lod_tensor);
+  ASSERT_EQ(new_lod_tensor.lod(), lod_tensor->lod());
+}
+
+TEST_F(LODTensorTester, CopyLOD) {
+  LODTensor new_lod_tensor;
+  new_lod_tensor.CopyLOD(*lod_tensor);
+  ASSERT_NE(new_lod_tensor.lod(), lod_tensor->lod());
+}
+
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/framework/op_registry.h
+++ b/paddle/framework/op_registry.h
@@ -69,18 +69,18 @@ class OpProtoAndCheckerMaker {

  VariableBuilder AddInput(const std::string& name,
                           const std::string& comment) {
-    auto input = proto_->mutable_inputs()->Add();
-    *input->mutable_name() = name;
-    *input->mutable_comment() = comment;
+    VarProto* input = proto_->add_inputs();
+    input->set_name(name);
+    input->set_comment(comment);
    return VariableBuilder{input, [=] { this->SetHasMultipleInput(); },
                           nullptr};
  }

  VariableBuilder AddOutput(const std::string& name,
                            const std::string& comment) {
-    auto output = proto_->mutable_outputs()->Add();
-    *output->mutable_name() = name;
-    *output->mutable_comment() = comment;
+    VarProto* output = proto_->add_outputs();
+    output->set_name(name);
+    output->set_comment(comment);
    return VariableBuilder{output, [=] { this->SetHasMultipleOutput(); },
                           [=] { this->SetHasTemporaryOutput(); }};
  }
@@ -89,17 +89,15 @@ class OpProtoAndCheckerMaker {
  TypedAttrChecker<T>& AddAttr(const std::string& name,
                               const std::string& comment,
                               bool generated = false) {
-    auto attr = proto_->mutable_attrs()->Add();
-    *attr->mutable_name() = name;
-    *attr->mutable_comment() = comment;
+    AttrProto* attr = proto_->add_attrs();
+    attr->set_name(name);
+    attr->set_comment(comment);
    attr->set_generated(generated);
    attr->set_type(AttrTypeID<T>());
    return op_checker_->AddAttrChecker<T>(name);
  }

-  void AddComment(const std::string& comment) {
-    *(proto_->mutable_comment()) = comment;
-  }
+  void AddComment(const std::string& comment) { proto_->set_comment(comment); }

 private:
  void SetHasMultiple(const std::string& in_out, bool* flag) {
@@ -187,7 +185,7 @@ class OpRegistry {
    OpProto& op_proto = protos()[op_type];
    auto maker = ProtoMakerType(&op_proto, &op_checker);
    maker.Validate();
-    *op_proto.mutable_type() = op_type;
+    op_proto.set_type(op_type);
    PADDLE_ENFORCE(
        op_proto.IsInitialized(),
        "Fail to initialize %s's OpProto, because %s is not initialized",
@@ -260,12 +258,6 @@ class OpRegistry {
    return CreateOp(op_desc.type(), inputs, outputs, attrs);
  }

-  static bool SupportGPU(const std::string& op_type) {
-    OperatorWithKernel::OpKernelKey key;
-    key.place_ = platform::GPUPlace();
-    return OperatorWithKernel::AllOpKernels().at(op_type).count(key) != 0;
-  }
-
  static std::shared_ptr<OperatorBase> CreateGradOp(const OperatorBase& op) {
    PADDLE_ENFORCE(!op.IsNetOp(),
                   "Use framework::Backward to get backward ops");
@@ -313,22 +305,45 @@ class OpRegistry {
  }
 };

+class Registrar {
+ public:
+  // In our design, various kinds of classes, e.g., operators and kernels, have
+  // their corresponding registry and registrar. The action of registration is
+  // in the constructor of a global registrar variable, which, however, are not
+  // used in the code that calls package framework, and would be removed from
+  // the generated binary file by the linker. To avoid such removal, we add
+  // Touch to all registrar classes and make USE_OP macros to call this
+  // method. So, as long as the callee code calls USE_OP, the global
+  // registrar variable won't be removed by the linker.
+  void Touch() {}
+};
+
 template <typename OpType, typename ProtoMakerType>
-class OpRegisterHelper {
+class OpRegistrar : public Registrar {
 public:
-  explicit OpRegisterHelper(const char* op_type) {
+  explicit OpRegistrar(const char* op_type) {
    OpRegistry::RegisterOp<OpType, ProtoMakerType>(op_type);
  }
 };

 template <typename GradOpType>
-class GradOpRegisterHelper {
+class GradOpRegistrar : public Registrar {
 public:
-  GradOpRegisterHelper(const char* op_type, const char* grad_op_type) {
+  GradOpRegistrar(const char* op_type, const char* grad_op_type) {
    OpRegistry::RegisterGradOp<GradOpType>(op_type, grad_op_type);
  }
 };

+template <typename PlaceType, typename KernelType>
+class OpKernelRegistrar : public Registrar {
+ public:
+  explicit OpKernelRegistrar(const char* op_type) {
+    OperatorWithKernel::OpKernelKey key;
+    key.place_ = PlaceType();
+    OperatorWithKernel::AllOpKernels()[op_type][key].reset(new KernelType);
+  }
+};
+
 /**
 * check if MACRO is used in GLOBAL NAMESPACE.
 */
@@ -339,97 +354,121 @@ class GradOpRegisterHelper {
                msg)

 /**
- * Macro to Register Operator.
+ * Macro to register Operator.
 */
-#define REGISTER_OP(__op_type, __op_class, __op_maker_class)                 \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(__reg_op__##__op_type,                      \
-                                 "REGISTER_OP must be in global namespace"); \
-  static ::paddle::framework::OpRegisterHelper<__op_class, __op_maker_class> \
-      __op_register_##__op_type##__(#__op_type);                             \
-  int __op_register_##__op_type##_handle__() { return 0; }
+#define REGISTER_OP(op_type, op_class, op_maker_class)                        \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                             \
+      __reg_op__##op_type, "REGISTER_OP must be called in global namespace"); \
+  static ::paddle::framework::OpRegistrar<op_class, op_maker_class>           \
+      __op_registrar_##op_type##__(#op_type);                                 \
+  int TouchOpRegistrar_##op_type() {                                          \
+    __op_registrar_##op_type##__.Touch();                                     \
+    return 0;                                                                 \
+  }

 /**
- * Macro to Register Gradient Operator.
+ * Macro to register Gradient Operator.
 */
-#define REGISTER_GRADIENT_OP(__op_type, __grad_op_type, __grad_op_class)       \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                              \
-      __reg_gradient_op__##__op_type##__grad_op_type,                          \
-      "REGISTER_GRADIENT_OP must be in global namespace");                     \
-  static ::paddle::framework::GradOpRegisterHelper<__grad_op_class>            \
-      __op_gradient_register_##__op_type##__grad_op_type##__(#__op_type,       \
-                                                             #__grad_op_type); \
-  int __op_gradient_register_##__op_type##__grad_op_type##_handle__() {        \
-    return 0;                                                                  \
+#define REGISTER_GRADIENT_OP(op_type, grad_op_type, grad_op_class)           \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                            \
+      __reg_gradient_op__##op_type##_##grad_op_type,                         \
+      "REGISTER_GRADIENT_OP must be called in global namespace");            \
+  static ::paddle::framework::GradOpRegistrar<grad_op_class>                 \
+      __op_gradient_registrar_##op_type##_##grad_op_type##__(#op_type,       \
+                                                             #grad_op_type); \
+  int TouchOpGradientRegistrar_##op_type() {                                 \
+    __op_gradient_registrar_##op_type##_##grad_op_type##__.Touch();          \
+    return 0;                                                                \
  }

 /**
- * Macro to Forbid user register Gradient Operator.
+ * Macro to register OperatorKernel.
 */
-#define NO_GRADIENT(__op_type)                          \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                       \
-      __reg_gradient_op__##__op_type##__op_type##_grad, \
-      "NO_GRADIENT must be in global namespace")
+#define REGISTER_OP_KERNEL(op_type, DEVICE_TYPE, place_class, ...)        \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
+      __reg_op_kernel_##op_type##_##DEVICE_TYPE##__,                      \
+      "REGISTER_OP_KERNEL must be called in global namespace");           \
+  static ::paddle::framework::OpKernelRegistrar<place_class, __VA_ARGS__> \
+      __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__(#op_type);      \
+  int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE() {                \
+    __op_kernel_registrar_##op_type##_##DEVICE_TYPE##__.Touch();          \
+    return 0;                                                             \
+  }

 /**
- * Macro to Register OperatorKernel.
+ * Macro to Forbid user register Gradient Operator.
 */
-#define REGISTER_OP_KERNEL(type, DEVICE_TYPE, PlaceType, ...)             \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-      __reg_op_kernel_##type##_##DEVICE_TYPE##__,                         \
-      "REGISTER_OP_KERNEL must be in global namespace");                  \
-  struct __op_kernel_register__##type##__##DEVICE_TYPE##__ {              \
-    __op_kernel_register__##type##__##DEVICE_TYPE##__() {                 \
-      ::paddle::framework::OperatorWithKernel::OpKernelKey key;           \
-      key.place_ = PlaceType();                                           \
-      ::paddle::framework::OperatorWithKernel::AllOpKernels()[#type][key] \
-          .reset(new __VA_ARGS__());                                      \
-    }                                                                     \
-  };                                                                      \
-  static __op_kernel_register__##type##__##DEVICE_TYPE##__                \
-      __reg_kernel_##type##__##DEVICE_TYPE##__;                           \
-  int __op_kernel_register_##type##_handle_##DEVICE_TYPE##__() { return 0; }
-
-// (type, KernelType)
-#define REGISTER_OP_GPU_KERNEL(type, ...) \
-  REGISTER_OP_KERNEL(type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
-
-// (type, KernelType)
-#define REGISTER_OP_CPU_KERNEL(type, ...) \
-  REGISTER_OP_KERNEL(type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)
+#define NO_GRADIENT(op_type)                           \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                      \
+      __reg_gradient_op__##op_type##_##op_type##_grad, \
+      "NO_GRADIENT must be called in global namespace")
+
+#define REGISTER_OP_GPU_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, GPU, ::paddle::platform::GPUPlace, __VA_ARGS__)
+
+#define REGISTER_OP_CPU_KERNEL(op_type, ...) \
+  REGISTER_OP_KERNEL(op_type, CPU, ::paddle::platform::CPUPlace, __VA_ARGS__)

 /**
 * Macro to mark what Operator and Kernel we will use and tell the compiler to
 * link them into target.
 */
-#define USE_OP_WITHOUT_KERNEL(op_type)                      \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                           \
-      __use_op_without_kernel_##op_type,                    \
-      "USE_OP_WITHOUT_KERNEL must be in global namespace"); \
-  extern int __op_register_##op_type##_handle__();          \
-  static int __use_op_ptr_##op_type##_without_kernel__      \
-      __attribute__((unused)) = __op_register_##op_type##_handle__()
-
-#define USE_OP_KERNEL(op_type, DEVICE_TYPE)                               \
-  STATIC_ASSERT_GLOBAL_NAMESPACE(                                         \
-      __use_op_kernel_##op_type##_##DEVICE_TYPE##__,                      \
-      "USE_OP_KERNEL must be in global namespace");                       \
-  extern int __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__(); \
-  static int __use_op_ptr_##op_type##_##DEVICE_TYPE##_kernel__            \
-      __attribute__((unused)) =                                           \
-          __op_kernel_register_##op_type##_handle_##DEVICE_TYPE##__()
-
-// use Operator with only cpu kernel.
-#define USE_OP_CPU(op_type)       \
-  USE_OP_WITHOUT_KERNEL(op_type); \
-  USE_OP_KERNEL(op_type, CPU)
+#define USE_OP_ITSELF(op_type)                                    \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                 \
+      __use_op_itself_##op_type,                                  \
+      "USE_OP_ITSELF must be called in global namespace");        \
+  extern int TouchOpRegistrar_##op_type();                        \
+  static int use_op_itself_##op_type##_ __attribute__((unused)) = \
+      TouchOpRegistrar_##op_type()
+
+// TODO(fengjiayi): Most ops' gradient op have not been compeleted. So we use
+// `NO_GRAD` to disable micro USE_OP_GRADIENT(op_type). Otherwise the code can't
+// be compiled. `NO_GRAD` should be removed after all gradient ops are
+// compeleted.
+#define NO_GRAD
+#ifndef NO_GRAD
+#define USE_OP_GRADIENT(op_type)                                    \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                   \
+      __use_op_gradient_##op_type,                                  \
+      "USE_OP_GRADIENT must be called in global namespace");        \
+  extern int TouchOpGradientRegistrar_##op_type();                  \
+  static int use_op_gradient_##op_type##_ __attribute__((unused)) = \
+      TouchOpGradientRegistrar_##op_type()
+#else
+#define USE_OP_GRADIENT(op_type)
+#endif
+
+#define USE_OP_DEVICE_KERNEL(op_type, DEVICE_TYPE)               \
+  STATIC_ASSERT_GLOBAL_NAMESPACE(                                \
+      __use_op_kernel_##op_type##_##DEVICE_TYPE##__,             \
+      "USE_OP_DEVICE_KERNEL must be in global namespace");       \
+  extern int TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE(); \
+  static int use_op_kernel_##op_type##_##DEVICE_TYPE##_          \
+      __attribute__((unused)) =                                  \
+          TouchOpKernelRegistrar_##op_type##_##DEVICE_TYPE()
+
+// TODO(fengjiayi): The following macros seems ugly, do we have better method?

 #ifdef PADDLE_ONLY_CPU
-#define USE_OP(op_type) USE_OP_CPU(op_type)
+#define USE_OP_KERNEL(op_type) USE_OP_DEVICE_KERNEL(op_type, CPU)
 #else
-#define USE_OP(op_type) \
-  USE_OP_CPU(op_type);  \
-  USE_OP_KERNEL(op_type, GPU)
+#define USE_OP_KERNEL(op_type)        \
+  USE_OP_DEVICE_KERNEL(op_type, CPU); \
+  USE_OP_DEVICE_KERNEL(op_type, GPU)
 #endif

+#define USE_NO_GRAD_OP(op_type) \
+  USE_OP_ITSELF(op_type);       \
+  USE_OP_KERNEL(op_type)
+
+#define USE_CPU_OP(op_type)           \
+  USE_OP_ITSELF(op_type);             \
+  USE_OP_DEVICE_KERNEL(op_type, CPU); \
+  USE_OP_GRADIENT(op_type)
+
+#define USE_OP(op_type)    \
+  USE_NO_GRAD_OP(op_type); \
+  USE_OP_GRADIENT(op_type)
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/op_registry_test.cc
+++ b/paddle/framework/op_registry_test.cc
@@ -7,6 +7,8 @@ namespace paddle {
 namespace framework {
 class CosineOp : public OperatorBase {
 public:
+  DEFINE_OPERATOR_CTOR(CosineOp, OperatorBase)
+
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}
  void InferShape(const Scope& scope) const override {}
@@ -27,6 +29,8 @@ class CosineOpProtoAndCheckerMaker : public OpProtoAndCheckerMaker {

 class MyTestOp : public OperatorBase {
 public:
+  DEFINE_OPERATOR_CTOR(MyTestOp, OperatorBase)
+
  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const override {}

--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once

 #include <algorithm>
-#include <boost/variant.hpp>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -27,25 +26,26 @@ limitations under the License. */
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/place.h"
+#include "paddle/platform/variant.h"
 #include "paddle/utils/Error.h"

 namespace paddle {
 namespace framework {

 /// If a variable is a empty variable, that name will be used.
-const std::string kEmptyVarName = "@EMPTY@";
+constexpr char kEmptyVarName[] = "@EMPTY@";

 /// If a variable is a temporary variable, that name will be set in Python,
 /// but it will be convert to a unique name in scope after OpCreator.
-const std::string kTempVarName = "@TEMP@";
+constexpr char kTempVarName[] = "@TEMP@";

 /// If a variable's name has a certain suffix, it means that the
 /// variable is the gradient of another varibale.
 /// e.g. Variable "x@GRAD" is the gradient of varibale "x".
-const std::string kGradVarSuffix = "@GRAD";
+constexpr char kGradVarSuffix[] = "@GRAD";

 /// Variables with this suffix are supposed to be filled up with zeros.
-const std::string kZeroVarSuffix = "@ZERO";
+constexpr char kZeroVarSuffix[] = "@ZERO";

 inline std::string GradVarName(const std::string& var_name) {
  return var_name + kGradVarSuffix;
@@ -63,6 +63,17 @@ class ExecutionContext;
 */
 class OperatorBase {
 public:
+  OperatorBase() {}  // TODO(yi): This constructor is to be removed.
+  OperatorBase(const std::string& type, const std::vector<std::string>& inputs,
+               const std::vector<std::string>& outputs,
+               const AttributeMap& attrs,
+               std::unordered_map<std::string, int>* in_out_idxs)
+      : type_(type),
+        inputs_(inputs),
+        outputs_(outputs),
+        attrs_(attrs),
+        in_out_idxs_(in_out_idxs) {}
+
  virtual ~OperatorBase() {}

  template <typename T>
@@ -88,21 +99,31 @@ class OperatorBase {

  virtual bool IsNetOp() const { return false; }

+  virtual bool SupportGPU() const { return false; }
+
  /// rename inputs outputs name
  void Rename(const std::string& old_name, const std::string& new_name);

  //! Get a input with argument's name described in `op_proto`
  const std::string& Input(const std::string& name) const;
-
  //! Get a input which has multiple variables.
  //! TODO add a vector_view to prevent memory copy.
  std::vector<std::string> Inputs(const std::string& name) const;
+
  //! Get a output with argument's name described in `op_proto`
  const std::string& Output(const std::string& name) const;
  //! Get an output which has multiple variables.
  //! TODO add a vector_view to prevent memory copy.
  std::vector<std::string> Outputs(const std::string& name) const;

+  const std::string Type() const { return type_; }
+  const std::vector<std::string> Inputs() const { return inputs_; }
+  const std::vector<std::string> Outputs() const { return outputs_; }
+  const AttributeMap& Attrs() const { return attrs_; }
+  const std::unordered_map<std::string, int>* InOutIdx() const {
+    return in_out_idxs_.get();
+  }
+
 public:
  std::string type_;
  // NOTE: in case of OpGrad, inputs_ contains:
@@ -118,10 +139,10 @@ class OperatorBase {
  std::shared_ptr<std::unordered_map<std::string, int>> in_out_idxs_;
 };

-class OperatorContext {
+class InferShapeContext {
 public:
-  OperatorContext(const OperatorBase* op, const Scope& scope)
-      : op_(*op), scope_(scope) {}
+  InferShapeContext(const OperatorBase& op, const Scope& scope)
+      : op_(op), scope_(scope) {}

  size_t InputSize() const { return op_.inputs_.size(); }

@@ -232,12 +253,6 @@ class OperatorContext {
  const Scope& scope_;
 };

-class InferShapeContext : public OperatorContext {
- public:
-  InferShapeContext(const OperatorBase* op, const Scope& scope)
-      : OperatorContext(op, scope) {}
-};
-
 template <typename T>
 struct EigenDeviceConverter;

@@ -253,11 +268,11 @@ struct EigenDeviceConverter<platform::GPUPlace> {
 };
 #endif

-class ExecutionContext : public OperatorContext {
+class ExecutionContext : public InferShapeContext {
 public:
-  ExecutionContext(const OperatorBase* op, const Scope& scope,
+  ExecutionContext(const OperatorBase& op, const Scope& scope,
                   const platform::DeviceContext* device_context)
-      : OperatorContext(op, scope), device_context_(device_context) {}
+      : InferShapeContext(op, scope), device_context_(device_context) {}

  template <typename PlaceType,
            typename DeviceType =
@@ -285,6 +300,14 @@ class OpKernel {

 class OperatorWithKernel : public OperatorBase {
 public:
+  OperatorWithKernel() {}  // TODO(yi): This constructor is to be removed.
+  OperatorWithKernel(const std::string& type,
+                     const std::vector<std::string>& inputs,
+                     const std::vector<std::string>& outputs,
+                     const AttributeMap& attrs,
+                     std::unordered_map<std::string, int>* in_out_idxs)
+      : OperatorBase(type, inputs, outputs, attrs, in_out_idxs) {}
+
  struct OpKernelKey {
    platform::Place place_;

@@ -308,14 +331,14 @@ class OperatorWithKernel : public OperatorBase {
  using OpKernelMap =
      std::unordered_map<OpKernelKey, std::unique_ptr<OpKernel>, OpKernelHash>;

-  void InferShape(const Scope& scope) const {
-    InferShape(InferShapeContext(this, scope));
+  void InferShape(const Scope& scope) const override {
+    InferShape(InferShapeContext(*this, scope));
  }

  void Run(const Scope& scope,
           const platform::DeviceContext& dev_ctx) const final {
    auto& opKernel = AllOpKernels().at(type_).at(OpKernelKey(dev_ctx));
-    opKernel->Compute(ExecutionContext(this, scope, &dev_ctx));
+    opKernel->Compute(ExecutionContext(*this, scope, &dev_ctx));
  }

  static std::unordered_map<std::string /* op_type */, OpKernelMap>&
@@ -324,9 +347,25 @@ class OperatorWithKernel : public OperatorBase {
    return g_all_op_kernels;
  }

+  bool SupportGPU() const override {
+    OperatorWithKernel::OpKernelKey key;
+    key.place_ = platform::GPUPlace();
+    return OperatorWithKernel::AllOpKernels().at(type_).count(key) != 0;
+  }
+
 protected:
  virtual void InferShape(const InferShapeContext& ctx) const = 0;
 };

+#define DEFINE_OPERATOR_CTOR(Class, ParentClass)                         \
+ public:                                                                 \
+  Class() { /* TODO(yi): This constructor is to be removed. */           \
+  }                                                                      \
+  Class(const std::string& type, const std::vector<std::string>& inputs, \
+        const std::vector<std::string>& outputs,                         \
+        const ::paddle::framework::AttributeMap& attrs,                  \
+        std::unordered_map<std::string, int>* in_out_idxs)               \
+      : ParentClass(type, inputs, outputs, attrs, in_out_idxs) {}
+
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/framework/operator_test.cc
+++ b/paddle/framework/operator_test.cc
@@ -23,6 +23,8 @@ static int op_run_num = 0;

 class OpWithoutKernelTest : public OperatorBase {
 public:
+  DEFINE_OPERATOR_CTOR(OpWithoutKernelTest, OperatorBase)
+
  void Init() override { x = 1; }
  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,
@@ -97,6 +99,8 @@ class OpKernelTestProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
 static int cpu_kernel_run_num = 0;

 class OpWithKernelTest : public OperatorWithKernel {
+ public:
+  DEFINE_OPERATOR_CTOR(OpWithKernelTest, OperatorWithKernel)
 protected:
  void InferShape(const framework::InferShapeContext& ctx) const override {}
 };
@@ -116,6 +120,8 @@ class CPUKernelTest : public OpKernel {
 // multiple inputs test
 class OperatorMultiInputsTest : public OperatorBase {
 public:
+  DEFINE_OPERATOR_CTOR(OperatorMultiInputsTest, OperatorBase)
+
  void Init() override { x = 1; }
  void InferShape(const Scope& scope) const override {}
  void Run(const Scope& scope,

--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@@ -18,13 +18,11 @@ limitations under the License. */

 #include "paddle/framework/backward.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/scope.h"
 #include "paddle/framework/tensor_py.h"
 #include "paddle/operators/net_op.h"
-#include "paddle/operators/type_alias.h"
 #include "paddle/platform/enforce.h"
 #include "paddle/platform/place.h"
+#include "paddle/string/to_string.h"
 #include "pybind11/numpy.h"
 #include "pybind11/pybind11.h"
 #include "pybind11/stl.h"
@@ -32,18 +30,23 @@ limitations under the License. */
 namespace py = pybind11;

 USE_OP(add_two);
-USE_OP_CPU(onehot_cross_entropy);
-USE_OP_WITHOUT_KERNEL(fc);
-USE_OP(sgd);
+USE_CPU_OP(onehot_cross_entropy);
+USE_NO_GRAD_OP(sgd);
 USE_OP(mul);
 USE_OP(mean);
 USE_OP(sigmoid);
 USE_OP(softmax);
 USE_OP(rowwise_add);
 USE_OP(fill_zeros_like);
-USE_OP_WITHOUT_KERNEL(recurrent_op);
+USE_OP_ITSELF(recurrent_op);
+USE_OP(gaussian_random);
+USE_OP(uniform_random);
+
 namespace paddle {
 namespace framework {
+
+using Tensor = framework::Tensor;
+
 template <typename ClassType>
 void ExposeOperator(ClassType &m) {
  m.def("infer_shape", &ClassType::type::InferShape)
@@ -56,6 +59,26 @@ void ExposeOperator(ClassType &m) {
           [](const typename ClassType::type &op) -> std::vector<std::string> {
             return op.outputs_;
           })
+      .def("inputs",
+           [](const typename ClassType::type &op) -> std::vector<std::string> {
+             return op.inputs_;
+           })
+      .def("support_gpu", &ClassType::type::SupportGPU)
+      .def("temp_outputs",
+           [](const typename ClassType::type &op) -> std::vector<std::string> {
+             auto iter = op.attrs_.find("temporary_index");
+             std::vector<std::string> ret;
+             if (iter == op.attrs_.end()) {
+               return ret;
+             } else {
+               auto tmp_idx = boost::get<std::vector<int>>(iter->second);
+               ret.reserve(tmp_idx.size());
+               for (auto &index : tmp_idx) {
+                 ret.push_back(op.outputs_.at(index));
+               }
+               return ret;
+             }
+           })
      .def("__str__", &ClassType::type::DebugString);
 }

@@ -129,8 +152,8 @@ All parameter, weight, gradient are variables in Paddle.
           [](Variable &self) -> Tensor * { return self.GetMutable<Tensor>(); },
           py::return_value_policy::reference)
      .def("get_net",
-           [](Variable &self) -> ops::NetOp * {
-             return self.GetMutable<ops::NetOp>();
+           [](Variable &self) -> operators::NetOp * {
+             return self.GetMutable<operators::NetOp>();
           },
           py::return_value_policy::reference);

@@ -184,9 +207,13 @@ All parameter, weight, gradient are variables in Paddle.
                  });
  // clang-format on

-  py::class_<paddle::platform::GPUPlace>(m, "GPUPlace").def(py::init<int>());
+  py::class_<platform::GPUPlace>(m, "GPUPlace")
+      .def(py::init<int>())
+      .def("__str__", string::to_string<const platform::GPUPlace &>);

-  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace").def(py::init<>());
+  py::class_<paddle::platform::CPUPlace>(m, "CPUPlace")
+      .def(py::init<>())
+      .def("__str__", string::to_string<const platform::CPUPlace &>);

  py::class_<OperatorBase, std::shared_ptr<OperatorBase>> operator_base(
      m, "Operator");
@@ -201,8 +228,6 @@ All parameter, weight, gradient are variables in Paddle.
    return OpRegistry::CreateOp(desc);
  });

-  operator_base.def_static("support_gpu", &OpRegistry::SupportGPU);
-
  operator_base.def("backward",
                    [](const OperatorBase &forwardOp,
                       const std::unordered_set<std::string> &no_grad_vars) {
@@ -211,23 +236,24 @@ All parameter, weight, gradient are variables in Paddle.

  ExposeOperator(operator_base);

-  py::class_<ops::NetOp, std::shared_ptr<ops::NetOp>> net(m, "Net");
+  py::class_<operators::NetOp, std::shared_ptr<operators::NetOp>> net(m, "Net");

  net.def_static("create",
-                 []() -> std::shared_ptr<ops::NetOp> {
-                   auto retv = std::make_shared<ops::NetOp>();
+                 []() -> std::shared_ptr<operators::NetOp> {
+                   auto retv = std::make_shared<operators::NetOp>();
                   retv->type_ = "plain_net";
                   return retv;
                 })
-      .def("add_op", &ops::NetOp::AddOp)
-      .def(
-          "add_op",
-          [](ops::NetOp &self, const std::shared_ptr<ops::NetOp> &net) -> void {
-            self.AddOp(std::static_pointer_cast<OperatorBase>(net));
-          })
-      .def("complete_add_op", &ops::NetOp::CompleteAddOp)
-      .def("complete_add_op",
-           [](std::shared_ptr<ops::NetOp> &self) { self->CompleteAddOp(); });
+      .def("add_op", &operators::NetOp::AddOp)
+      .def("add_op",
+           [](operators::NetOp &self,
+              const std::shared_ptr<operators::NetOp> &net) -> void {
+             self.AddOp(std::static_pointer_cast<OperatorBase>(net));
+           })
+      .def("complete_add_op", &operators::NetOp::CompleteAddOp)
+      .def("complete_add_op", [](std::shared_ptr<operators::NetOp> &self) {
+        self->CompleteAddOp();
+      });

  ExposeOperator(net);


--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -18,6 +18,8 @@ limitations under the License. */
 #include <cstring>
 #include <memory>
 #include <typeindex>
+#include <vector>
+
 #include "paddle/framework/ddim.h"
 #include "paddle/memory/memory.h"
 #include "paddle/platform/device_context.h"
@@ -77,11 +79,11 @@ class Tensor {
  inline const DDim& dims() const;

  /*! Resize the dimensions of the memory block. */
-  inline void Resize(const DDim& dims);
+  inline Tensor& Resize(const DDim& dims);

  /*! The internal of two tensors share the same memory block. */
  template <typename T>
-  inline void ShareDataWith(const Tensor& src);
+  inline Tensor& ShareDataWith(const Tensor& src);

  /**
   * @brief   Copy the content of external tensor to a new place.

--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -23,9 +23,11 @@ template <typename T>
 inline void Tensor::check_memory_size() const {
  PADDLE_ENFORCE_NOT_NULL(
      holder_, "Tenosr holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_GE(holder_->size(), product(dims_) * sizeof(T) + offset_,
-                    "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-                    "first to re-allocate memory.");
+  PADDLE_ENFORCE_GE(
+      holder_->size(), product(dims_) * sizeof(T) + offset_,
+      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+      "first to re-allocate memory.\n"
+      "or maybe the required data-type mismatches the data already stored.");
 }

 template <typename T>
@@ -78,9 +80,10 @@ inline T* Tensor::mutable_data(platform::Place place) {
 }

 template <typename T>
-inline void Tensor::ShareDataWith(const Tensor& src) {
+inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
  src.check_memory_size<T>();
  *this = src;
+  return *this;
 }

 template <typename T>
@@ -136,7 +139,10 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
  return dst;
 }

-inline void Tensor::Resize(const DDim& dims) { dims_ = dims; }
+inline Tensor& Tensor::Resize(const DDim& dims) {
+  dims_ = dims;
+  return *this;
+}

 inline const DDim& Tensor::dims() const { return dims_; }


--- a/paddle/framework/tensor_test.cc
+++ b/paddle/framework/tensor_test.cc
@@ -19,7 +19,7 @@ TEST(Tensor, Dims) {
  using namespace paddle::framework;
  using namespace paddle::platform;
  Tensor tt;
-  tt.Resize(make_ddim({2, 3, 4}));
+  tt.Resize({2, 3, 4});
  DDim dims = tt.dims();
  ASSERT_EQ(arity(dims), 3);
  for (int i = 0; i < 3; ++i) {

--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -38,10 +38,11 @@ if(WITH_GPU)
    add_simple_unittest(RowConvOpTest)
    add_simple_unittest(BlockExpandOpTest)
    add_simple_unittest(CropOpTest)
+    add_simple_unittest(DepthwiseConvOpTest)
 endif()

-add_simple_unittest(ConvOpTest)
 add_simple_unittest(Im2ColTest)
+add_simple_unittest(GemmConvOpTest)
 endif()

 add_style_check_target(paddle_function ${h_files})

--- a/paddle/function/ConvOpTest.cpp
+++ b/paddle/function/ConvOpTest.cpp
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <memory>
-#include "Function.h"
-#include "FunctionTest.h"
-
-namespace paddle {
-
-enum TestType {
-  kForwardTest = 0,
-  kBackwardInputTest = 1,
-  kBackwardFilterTest = 2,
-};
-
-template <DeviceType DType1, DeviceType DType2>
-class ConvolutionTest {
-public:
-  ConvolutionTest(const std::string& conv1,
-                  const std::string& conv2,
-                  TestType type,
-                  bool useGroups = true,
-                  std::string algo = "auto") {
-    for (size_t batchSize : {1, 32}) {
-      for (size_t inputSize : {7, 14, 54}) {
-        for (size_t filterSize : {1, 3, 5}) {
-          for (size_t inputChannels : {3, 64}) {
-            for (size_t outputChannels : {3, 64}) {
-              if (inputChannels > outputChannels) break;
-              size_t groups;
-              if (!useGroups) {
-                groups = 1;
-              } else {
-                if (outputChannels % inputChannels != 0) continue;
-                groups = inputChannels;
-              }
-
-              for (size_t stride : {1, 2}) {
-                for (size_t padding : {0, 1}) {
-                  if (padding >= filterSize) break;
-                  size_t outputSize =
-                      (inputSize - filterSize + 2 * padding + stride) / stride;
-                  VLOG(3) << " batchSize=" << batchSize
-                          << " inputChannels=" << inputChannels
-                          << " inputHeight=" << inputSize
-                          << " inputWidth=" << inputSize
-                          << " outputChannels=" << outputChannels
-                          << " filterHeight=" << filterSize
-                          << " filterWidth=" << filterSize
-                          << " outputHeight=" << outputSize
-                          << " outputWidth=" << outputSize
-                          << " stride=" << stride << " padding=" << padding;
-
-                  std::vector<size_t> paddings = {padding, padding};
-                  std::vector<size_t> strides = {stride, stride};
-                  Compare2Function<DType1, DType2> test(
-                      conv1,
-                      conv2,
-                      FuncConfig()
-                          .set("paddings", paddings)
-                          .set("strides", strides)
-                          .set("groups", groups)
-                          .set("algo", algo));
-
-                  TensorShape input{
-                      batchSize, inputChannels, inputSize, inputSize};
-
-                  TensorShape filter;
-                  if (groups > 1)
-                    filter = TensorShape({groups,
-                                          outputChannels / groups,
-                                          inputChannels / groups,
-                                          filterSize,
-                                          filterSize});
-                  else
-                    filter = TensorShape({outputChannels,
-                                          inputChannels,
-                                          filterSize,
-                                          filterSize});
-                  TensorShape output{
-                      batchSize, outputChannels, outputSize, outputSize};
-
-                  if (type == kForwardTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.run();
-                  } else if (type == kBackwardInputTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
-                    test.run();
-                  } else if (type == kBackwardFilterTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
-                                    ADD_TO);
-                    test.run();
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-// Mainly used to test cases where the height and width (input, filter)
-// are not equal.
-template <DeviceType DType1, DeviceType DType2>
-class ConvolutionTest2 {
-public:
-  ConvolutionTest2(const std::string& conv1,
-                   const std::string& conv2,
-                   TestType type,
-                   bool useGroups = true,
-                   std::string algo = "auto") {
-    for (size_t batchSize : {16}) {
-      for (size_t inputHeight : {7, 31}) {
-        for (size_t inputWidth : {10, 54}) {
-          for (size_t filterHeight : {1, 5}) {
-            for (size_t filterWidth : {3, 7}) {
-              for (size_t inputChannels : {7}) {
-                for (size_t outputChannels : {7}) {
-                  size_t groups;
-                  if (!useGroups) {
-                    groups = 1;
-                  } else {
-                    if (outputChannels % inputChannels != 0) continue;
-                    groups = inputChannels;
-                  }
-
-                  size_t stride = 1;
-                  size_t padding = 0;
-                  size_t outputHeight =
-                      (inputHeight - filterHeight + 2 * padding + stride) /
-                      stride;
-                  size_t outputWidth =
-                      (inputWidth - filterWidth + 2 * padding + stride) /
-                      stride;
-                  VLOG(3) << " batchSize=" << batchSize
-                          << " inputChannels=" << inputChannels
-                          << " inputHeight=" << inputHeight
-                          << " inputWidth=" << inputWidth
-                          << " outputChannels=" << outputChannels
-                          << " filterHeight=" << filterHeight
-                          << " filterWidth=" << filterWidth
-                          << " outputHeight=" << outputHeight
-                          << " outputWidth=" << outputWidth
-                          << " stride=" << stride << " padding=" << padding;
-
-                  std::vector<size_t> paddings = {padding, padding};
-                  std::vector<size_t> strides = {stride, stride};
-                  Compare2Function<DType1, DType2> test(
-                      conv1,
-                      conv2,
-                      FuncConfig()
-                          .set("paddings", paddings)
-                          .set("strides", strides)
-                          .set("groups", groups)
-                          .set("algo", algo));
-
-                  TensorShape input{
-                      batchSize, inputChannels, inputHeight, inputWidth};
-
-                  TensorShape filter;
-                  if (groups > 1)
-                    filter = TensorShape({groups,
-                                          outputChannels / groups,
-                                          inputChannels / groups,
-                                          filterHeight,
-                                          filterWidth});
-                  else
-                    filter = TensorShape({outputChannels,
-                                          inputChannels,
-                                          filterHeight,
-                                          filterWidth});
-                  TensorShape output{
-                      batchSize, outputChannels, outputHeight, outputWidth};
-
-                  if (type == kForwardTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.run();
-                  } else if (type == kBackwardInputTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
-                    test.run();
-                  } else if (type == kBackwardFilterTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
-                                    ADD_TO);
-                    test.run();
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-// ======Start Convolution TEST======
-
-TEST(Forward, GEMM) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test2(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
-}
-
-#ifndef PADDLE_ONLY_CPU
-TEST(Forward, GEMM2) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
-}
-
-TEST(BackwardInput, GEMM) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradInput-CPU",
-      "GemmConvGradInput-GPU",
-      kBackwardInputTest,
-      false);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradInput-CPU",
-      "GemmConvGradInput-GPU",
-      kBackwardInputTest,
-      false);
-}
-
-TEST(BackwardFilter, GEMM) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradFilter-CPU",
-      "GemmConvGradFilter-GPU",
-      kBackwardFilterTest,
-      false);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradFilter-CPU",
-      "GemmConvGradFilter-GPU",
-      kBackwardFilterTest,
-      false);
-}
-#endif
-// ======End Convolution TEST======
-
-// ======Start DepthwiseConvolution TEST======
-
-// TODO(zhaolong) The depthwise convolution cpu test will be added when the cpu
-// version of depthwiseConv is implemented.
-
-#ifndef PADDLE_ONLY_CPU
-
-TEST(DepthwiseConvForward, GEMM2) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
-}
-
-TEST(DepthwiseConvBackwardInput, GEMM) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradInput-CPU",
-      "DepthwiseConvGradInput-GPU",
-      kBackwardInputTest);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradInput-CPU",
-      "DepthwiseConvGradInput-GPU",
-      kBackwardInputTest);
-}
-
-TEST(DepthwiseConvBackwardFilter, GEMM) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradFilter-CPU",
-      "DepthwiseConvGradFilter-GPU",
-      kBackwardFilterTest);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradFilter-CPU",
-      "DepthwiseConvGradFilter-GPU",
-      kBackwardFilterTest);
-}
-
-#endif
-// ======End DepthwiseConvolution TEST======
-
-}  // namespace paddle
--- a/paddle/function/ConvOpTest.h
+++ b/paddle/function/ConvOpTest.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "FunctionTest.h"
+
+namespace paddle {
+
+template <DeviceType DType1, DeviceType DType2>
+void forward(Compare2Function<DType1, DType2>& test,
+             const TensorShape& input,
+             const TensorShape& filter,
+             const TensorShape& output) {
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
+  test.run();
+}
+
+template <DeviceType DType1, DeviceType DType2>
+void backward_input(Compare2Function<DType1, DType2>& test,
+                    const TensorShape& input,
+                    const TensorShape& filter,
+                    const TensorShape& output) {
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
+  test.run();
+}
+
+template <DeviceType DType1, DeviceType DType2>
+void backward_filter(Compare2Function<DType1, DType2>& test,
+                     const TensorShape& input,
+                     const TensorShape& filter,
+                     const TensorShape& output) {
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter), ADD_TO);
+  test.run();
+}
+
+template <DeviceType DType1, DeviceType DType2>
+using Function = void (*)(Compare2Function<DType1, DType2>& test,
+                          const TensorShape& input,
+                          const TensorShape& filter,
+                          const TensorShape& output);
+
+/**
+ * \brief A basic convolution function test interface.
+ *
+ * \param conv1         type name of convolution function 1.
+ * \param conv2         type name of convolution function 2.
+ * \param function      test function, can be one of the forward, backward_input
+ *                      backward_filter function.
+ * Example:
+ * 1. Compare GemmConv's CPU and GPU implementation:
+ *   Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+ *      "GemmConv-CPU", "GemmConv-GPU", forward);
+ */
+template <DeviceType DType1, DeviceType DType2>
+void Convolution(const std::string& conv1,
+                 const std::string& conv2,
+                 Function<DType1, DType2> function) {
+  for (size_t batchSize : {1, 5}) {
+    for (size_t inputSize : {7, 14, 31}) {
+      for (size_t filterSize : {1, 3, 5}) {
+        for (size_t inputChannels : {3, 16}) {
+          for (size_t outputChannels : {3, 16}) {
+            if (outputChannels < inputChannels) continue;
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                if (padding >= filterSize) break;
+
+                // NNPACK only supports stride = 1 if batchSize > 1
+                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
+                    batchSize > 1 && stride > 1)
+                  break;
+
+                size_t outputSize =
+                    (inputSize - filterSize + 2 * padding + stride) / stride;
+                VLOG(3) << " batchSize=" << batchSize
+                        << " inputChannels=" << inputChannels
+                        << " inputHeight=" << inputSize
+                        << " inputWidth=" << inputSize
+                        << " outputChannels=" << outputChannels
+                        << " filterHeight=" << filterSize
+                        << " filterWidth=" << filterSize
+                        << " outputHeight=" << outputSize
+                        << " outputWidth=" << outputSize << " stride=" << stride
+                        << " padding=" << padding;
+
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> strides = {stride, stride};
+                Compare2Function<DType1, DType2> test(
+                    conv1,
+                    conv2,
+                    FuncConfig()
+                        .set("paddings", paddings)
+                        .set("strides", strides)
+                        .set("groups", (size_t)1)
+                        .set("algo", (std::string) "auto"));
+
+                TensorShape input{
+                    batchSize, inputChannels, inputSize, inputSize};
+                TensorShape filter{
+                    outputChannels, inputChannels, filterSize, filterSize};
+                TensorShape output{
+                    batchSize, outputChannels, outputSize, outputSize};
+
+                function(test, input, filter, output);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief A convolution function test interface for
+ *        image height is not equal image width.
+ */
+template <DeviceType DType1, DeviceType DType2>
+void Convolution2(const std::string& conv1,
+                  const std::string& conv2,
+                  Function<DType1, DType2> function) {
+  for (size_t batchSize : {4}) {
+    for (size_t inputHeight : {7, 31}) {
+      for (size_t inputWidth : {10, 54}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t inputChannels : {7}) {
+              for (size_t outputChannels : {7}) {
+                size_t stride = 1;
+                size_t padding = 0;
+                size_t outputHeight =
+                    (inputHeight - filterHeight + 2 * padding + stride) /
+                    stride;
+                size_t outputWidth =
+                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
+                VLOG(3) << " batchSize=" << batchSize
+                        << " inputChannels=" << inputChannels
+                        << " inputHeight=" << inputHeight
+                        << " inputWidth=" << inputWidth
+                        << " outputChannels=" << outputChannels
+                        << " filterHeight=" << filterHeight
+                        << " filterWidth=" << filterWidth
+                        << " outputHeight=" << outputHeight
+                        << " outputWidth=" << outputWidth
+                        << " stride=" << stride << " padding=" << padding;
+
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> strides = {stride, stride};
+                Compare2Function<DType1, DType2> test(
+                    conv1,
+                    conv2,
+                    FuncConfig()
+                        .set("paddings", paddings)
+                        .set("strides", strides)
+                        .set("groups", (size_t)1)
+                        .set("algo", (std::string) "auto"));
+
+                TensorShape input{
+                    batchSize, inputChannels, inputHeight, inputWidth};
+                TensorShape filter{
+                    outputChannels, inputChannels, filterHeight, filterWidth};
+                TensorShape output{
+                    batchSize, outputChannels, outputHeight, outputWidth};
+
+                function(test, input, filter, output);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief A convolution function test interface for depthwise convolution.
+ */
+template <DeviceType DType1, DeviceType DType2>
+void DepthwiseConvolution(const std::string& conv1,
+                          const std::string& conv2,
+                          Function<DType1, DType2> function) {
+  for (size_t batchSize : {1, 32}) {
+    for (size_t inputSize : {7, 14, 54}) {
+      for (size_t filterSize : {3, 4}) {
+        for (size_t inputChannels : {32}) {
+          for (size_t outputChannels : {32, 64}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                // NNPACK only supports stride = 1 if batchSize > 1,
+                // and there has some bug when batchSize > 1 and groups != 1
+                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
+                    batchSize > 1)
+                  break;
+
+                size_t outputSize =
+                    (inputSize - filterSize + 2 * padding + stride) / stride;
+                VLOG(3) << " batchSize=" << batchSize
+                        << " inputChannels=" << inputChannels
+                        << " inputHeight=" << inputSize
+                        << " inputWidth=" << inputSize
+                        << " outputChannels=" << outputChannels
+                        << " filterHeight=" << filterSize
+                        << " filterWidth=" << filterSize
+                        << " outputHeight=" << outputSize
+                        << " outputWidth=" << outputSize << " stride=" << stride
+                        << " padding=" << padding;
+
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> strides = {stride, stride};
+                size_t groups = inputChannels;
+                Compare2Function<DType1, DType2> test(
+                    conv1,
+                    conv2,
+                    FuncConfig()
+                        .set("paddings", paddings)
+                        .set("strides", strides)
+                        .set("groups", groups)
+                        .set("algo", (std::string) "auto"));
+
+                TensorShape input{
+                    batchSize, inputChannels, inputSize, inputSize};
+                TensorShape filter{groups,
+                                   outputChannels / groups,
+                                   inputChannels / groups,
+                                   filterSize,
+                                   filterSize};
+                TensorShape output{
+                    batchSize, outputChannels, outputSize, outputSize};
+
+                function(test, input, filter, output);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
--- a/paddle/operators/mean_op_test.cc
+++ b/paddle/operators/mean_op_test.cc
@@ -13,13 +13,25 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <gtest/gtest.h>
+#include "ConvOpTest.h"

-#include <paddle/framework/op_registry.h>
+namespace paddle {

-USE_OP(mean);
+#ifndef PADDLE_ONLY_CPU
+TEST(DepthwiseConv, Forward) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConv-CPU", "DepthwiseConv-GPU", forward);
+}
+
+TEST(DepthwiseConv, BackwardInput) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradInput-CPU", "DepthwiseConvGradInput-GPU", backward_input);
+}

-TEST(MeanOp, GetOpProto) {
-  auto& protos = paddle::framework::OpRegistry::protos();
-  auto it = protos.find("mean");
-  ASSERT_NE(it, protos.end());
+TEST(DepthwiseConv, BackwardFilter) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradFilter-CPU", "DepthwiseConvGradFilter-GPU", backward_filter);
 }
+#endif
+
+}  // namespace paddle
--- a/paddle/function/FunctionTest.cpp
+++ b/paddle/function/FunctionTest.cpp
@@ -93,8 +93,8 @@ TEST(Arguments, Matrix) {
  MatrixPtr matrix = Matrix::create(100, 200);
  CheckBufferArg check = [=](const BufferArg& arg) {
    EXPECT_EQ(arg.shape().ndims(), 2U);
-    EXPECT_EQ(arg.shape()[0], 100);
-    EXPECT_EQ(arg.shape()[1], 200);
+    EXPECT_EQ(arg.shape()[0], 100U);
+    EXPECT_EQ(arg.shape()[1], 200U);
    EXPECT_EQ(arg.data(), matrix->getData());

    EXPECT_EQ(arg.matrix<DEVICE_TYPE_CPU>().getHeight(), matrix->getHeight());
@@ -112,8 +112,8 @@ TEST(Arguments, Matrix) {
 TEST(Arguments, Vector) {
  VectorPtr vector = Vector::create(100, false);
  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 1);
-    EXPECT_EQ(arg.shape()[0], 100);
+    EXPECT_EQ(arg.shape().ndims(), 1U);
+    EXPECT_EQ(arg.shape()[0], 100U);
    EXPECT_EQ(arg.data(), vector->getData());

    CpuVector inVector = arg.vector<real, DEVICE_TYPE_CPU>();
@@ -131,9 +131,9 @@ TEST(Arguments, Vector) {
 TEST(Arguments, CpuSparseMatrix) {
  CpuSparseMatrix sparse(200, 300, 50);
  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 2);
-    EXPECT_EQ(arg.shape()[0], 200);
-    EXPECT_EQ(arg.shape()[1], 300);
+    EXPECT_EQ(arg.shape().ndims(), 2U);
+    EXPECT_EQ(arg.shape()[0], 200U);
+    EXPECT_EQ(arg.shape()[1], 300U);
    EXPECT_EQ(arg.data(), sparse.getData());
    // CHECK_EQ(arg.sparse().nnz(), 50);
    // CHECK_EQ(arg.sparse().dataFormat(), SPARSE_CSR_FORMAT);
@@ -152,10 +152,10 @@ TEST(Arguments, CpuSparseMatrix) {
 TEST(Arguments, BufferArg) {
  BufferArg arg(nullptr, VALUE_TYPE_FLOAT, {1, 2, 3});
  CheckBufferArg check = [=](const BufferArg& arg) {
-    EXPECT_EQ(arg.shape().ndims(), 3);
-    EXPECT_EQ(arg.shape()[0], 1);
-    EXPECT_EQ(arg.shape()[1], 2);
-    EXPECT_EQ(arg.shape()[2], 3);
+    EXPECT_EQ(arg.shape().ndims(), 3U);
+    EXPECT_EQ(arg.shape()[0], 1U);
+    EXPECT_EQ(arg.shape()[1], 2U);
+    EXPECT_EQ(arg.shape()[2], 3U);
  };

  BufferArgs argments;

--- a/paddle/operators/sgd_op_test.cc
+++ b/paddle/operators/sgd_op_test.cc
@@ -13,10 +13,38 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <gtest/gtest.h>
-#include <paddle/framework/op_registry.h>
-USE_OP(sgd);
-TEST(SGDOp, GetOpProto) {
-  auto& protos = paddle::framework::OpRegistry::protos();
-  auto it = protos.find("sgd");
-  ASSERT_NE(it, protos.end());
+#include "ConvOpTest.h"
+
+namespace paddle {
+
+TEST(GemmConv, NaiveConv) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "NaiveConv-CPU", "GemmConv-CPU", forward);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "NaiveConv-CPU", "GemmConv-CPU", forward);
 }
+
+#ifndef PADDLE_ONLY_CPU
+TEST(GemmConv, Forward) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConv-CPU", "GemmConv-GPU", forward);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConv-CPU", "GemmConv-GPU", forward);
+}
+
+TEST(GemmConv, BackwardInput) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input);
+}
+
+TEST(GemmConv, BackwardFilter) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter);
+}
+#endif
+
+}  // namespace paddle
--- a/paddle/function/TensorShapeTest.cpp
+++ b/paddle/function/TensorShapeTest.cpp
@@ -44,7 +44,7 @@ TEST(TensorShape, GetAndSet) {
  EXPECT_EQ(t.ndims(), 3U);
  EXPECT_EQ(t.getElements(), 6U);

-  EXPECT_EQ(t[1], 2);
+  EXPECT_EQ(t[1], 2U);
  t.setDim(1, 100);
  EXPECT_EQ(t.getElements(), 300U);
  EXPECT_EQ(t[1], 100U);

--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -196,30 +196,30 @@ public:
        CHECK_EQ(status, nnp_status_success);
      }
    } else {
-      for (size_t g = 0; g < groups_; g++) {
-        // only supports stride = 1
-        CHECK_EQ(strideH(), 1);
-        CHECK_EQ(strideW(), 1);
-        nnp_status status =
-            nnp_convolution_output(algorithm_,
-                                   batchSize,
-                                   inputChannels / groups_,
-                                   outputChannels / groups_,
-                                   inputSize,
-                                   padding,
-                                   kernelSize,
-                                   inputData + inputOffset * g,
-                                   filterData + filterOffset * g,
-                                   nullptr, /* bias */
-                                   outputData + outputOffset * g,
-                                   bufferPtr,
-                                   sizePtr,
-                                   nnp_activation_identity,
-                                   nullptr,
-                                   threadpool_, /* threadpool */
-                                   nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      }
+      // only supports stride = 1
+      CHECK_EQ(strideH(), 1);
+      CHECK_EQ(strideW(), 1);
+
+      // TODO(hedaoyuan): There has some bug when batchSize > 1 and groups_ > 1.
+      CHECK_EQ(groups_, static_cast<size_t>(1));
+      nnp_status status = nnp_convolution_output(algorithm_,
+                                                 batchSize,
+                                                 inputChannels,
+                                                 outputChannels,
+                                                 inputSize,
+                                                 padding,
+                                                 kernelSize,
+                                                 inputData,
+                                                 filterData,
+                                                 nullptr, /* bias */
+                                                 outputData,
+                                                 bufferPtr,
+                                                 sizePtr,
+                                                 nnp_activation_identity,
+                                                 nullptr,
+                                                 threadpool_, /* threadpool */
+                                                 nullptr);
+      CHECK_EQ(status, nnp_status_success);
    }
  }


--- a/paddle/function/nnpack/NNPACKConvOpTest.cpp
+++ b/paddle/function/nnpack/NNPACKConvOpTest.cpp
@@ -13,87 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include <gtest/gtest.h>
-#include "paddle/function/Function.h"
-#include "paddle/function/FunctionTest.h"
-
-DEFINE_string(algo,
-              "auto",
-              "The algorithm (auto, ft8x8, ft16x16, wt8x8, "
-              "implicit-gemm, or direct) for computing convolution of NNPACK.");
+#include "paddle/function/ConvOpTest.h"

 namespace paddle {

-#define IS_NNPACK_SUPPORT(algo, filterSize, stride)        \
-  if (algo == "direct" && filterSize != 1) continue;       \
-  if (algo == "direct" && batchSize != 1) continue;        \
-  if (algo == "wt8x8" && filterSize != 3) continue;        \
-  if (algo == "implicit-gemm" && batchSize != 1) continue; \
-  if (algo != "auto" && algo != "implicit-gemm" && stride > 1) continue;
-
-class ConvolutionTest {
-public:
-  ConvolutionTest(const std::string& conv1,
-                  const std::string& conv2,
-                  std::string algo = "auto") {
-    for (size_t batchSize : {1, 32}) {
-      for (size_t inputSize : {7, 14, 54}) {
-        for (size_t filterSize : {1, 3, 5}) {
-          for (size_t inputChannels : {3, 64}) {
-            for (size_t outputChannels : {3, 64, 128}) {
-              if (inputChannels < outputChannels) break;
-              for (size_t stride : {1, 2}) {
-                // if batchSize > 1 NNPACKConv only supports stride = 1
-                if (batchSize > 1 && stride > 1) break;
-                for (size_t padding : {0, 1}) {
-                  if (padding >= filterSize) break;
-                  size_t outputSize =
-                      (inputSize - filterSize + 2 * padding + stride) / stride;
-                  IS_NNPACK_SUPPORT(algo, filterSize, stride);
-                  LOG(INFO) << " batchSize=" << batchSize
-                            << " inputChannels=" << inputChannels
-                            << " inputHeight=" << inputSize
-                            << " inputWidth=" << inputSize
-                            << " outputChannels=" << outputChannels
-                            << " filterHeight=" << filterSize
-                            << " filterWidth=" << filterSize
-                            << " outputHeight=" << outputSize
-                            << " outputWidth=" << outputSize
-                            << " stride=" << stride << " padding=" << padding;
-
-                  std::vector<size_t> paddings = {padding, padding};
-                  std::vector<size_t> strides = {stride, stride};
-                  Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
-                      conv1,
-                      conv2,
-                      FuncConfig()
-                          .set("paddings", paddings)
-                          .set("strides", strides)
-                          .set("groups", (size_t)1)
-                          .set("algo", algo));
-
-                  TensorShape shape0{
-                      batchSize, inputChannels, inputSize, inputSize};
-                  TensorShape shape1{
-                      outputChannels, inputChannels, filterSize, filterSize};
-                  TensorShape shape2{
-                      batchSize, outputChannels, outputSize, outputSize};
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape0));
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape1));
-                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape2));
-                  test.run();
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
+TEST(NNPACK, Forward) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NNPACKConv-CPU", forward);
+}

-TEST(Convolution, NNPACK) {
-  // NNPACK only supports stride = 1
-  ConvolutionTest test("GemmConv-CPU", "NNPACKConv-CPU", FLAGS_algo);
+TEST(NNPACK, Depthwise) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NNPACKConv-CPU", forward);
 }

 }  // namespace paddle
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -23,6 +23,17 @@ endmacro()

 filter_test(GSERVER_HEADER)
 filter_test(GSERVER_SOURCES)
+
+if(NOT WITH_MKLDNN)
+    file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h")
+    file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp")
+    list(REMOVE_ITEM GSERVER_HEADER ${DNN_HEADER})
+    list(REMOVE_ITEM GSERVER_SOURCES ${DNN_SOURCES})
+    message(STATUS "Skip compiling with MKLDNNLayers and MKLDNNActivations")
+else()
+    message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
+endif()
+
 if(NOT WITH_GPU)
    list(REMOVE_ITEM GSERVER_HEADER
        layers/CudnnConvBaseLayer.h

--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -112,7 +112,6 @@ BEGIN_DEFINE_ACTIVATION(softmax)
 private:
 MatrixPtr sftMaxSum_;
 MatrixPtr sftMaxDot_;
-MatrixPtr one_;

 public:
 Error __must_check forward(Argument& act) {
@@ -138,14 +137,6 @@ Error __must_check backward(Argument& act) {
                           1,
                           /* trans */ false,
                           useGpu(act.deviceId));
-    if (!one_ || one_->getWidth() != outputG->getWidth()) {
-      Matrix::resizeOrCreate(one_,
-                             1,
-                             outputG->getWidth(),
-                             /* trans */ false,
-                             useGpu(act.deviceId));
-      one_->one();
-    }

    sftMaxDot_->dotMul(*outputG, *outputV);
    sftMaxSum_->colMerge(*sftMaxDot_);

--- a/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+++ b/paddle/gserver/layers/KmaxSeqScoreLayer.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "Layer.h"
+
+namespace paddle {
+
+class KmaxSeqScoreLayer : public Layer {
+private:
+  MatrixPtr scores_;
+  size_t beamSize_;
+  void kmaxScorePerSeq(const real* score,
+                       real* sortedRes,
+                       const ICpuGpuVectorPtr seqStartPos);
+
+public:
+  explicit KmaxSeqScoreLayer(const LayerConfig& config) : Layer(config) {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void forward(PassType passType) override;
+  void backward(const UpdateCallback& callback = nullptr) override;
+};
+
+REGISTER_LAYER(kmax_seq_score, KmaxSeqScoreLayer);
+
+bool KmaxSeqScoreLayer::init(const LayerMap& layerMap,
+                             const ParameterMap& parameterMap) {
+  bool ret = Layer::init(layerMap, parameterMap);
+  CHECK_EQ(1U, inputLayers_.size());
+
+  beamSize_ = config_.beam_size();
+  CHECK_GE(beamSize_, 1U);
+
+  setNeedSequenceInfo(false);
+  setNeedGradient(false);
+  return ret;
+}
+
+void KmaxSeqScoreLayer::kmaxScorePerSeq(const real* scores,
+                                        real* sortedIds,
+                                        const ICpuGpuVectorPtr seqStartPos) {
+  int* starts = seqStartPos->getMutableData(false);
+  std::vector<real> indices;
+  for (size_t i = 0; i < seqStartPos->getSize() - 1; ++i) {
+    int seqLen = starts[i + 1] - starts[i];
+    int k = std::min(static_cast<int>(beamSize_), seqLen);
+
+    indices.resize(seqLen, 0);
+    std::iota(begin(indices), end(indices), 0.);
+    std::vector<real> tmpScore(scores + starts[i], scores + starts[i + 1]);
+    std::partial_sort(
+        begin(indices),
+        begin(indices) + k,
+        end(indices),
+        [&](size_t a, size_t b) { return tmpScore[a] > tmpScore[b]; });
+    memcpy(sortedIds + (i * beamSize_), indices.data(), k * sizeof(real));
+  }
+}
+
+void KmaxSeqScoreLayer::forward(PassType passType) {
+  Layer::forward(passType);
+
+  const Argument& input = getInput(0);
+  const MatrixPtr inputScore = getInputValue(0);
+
+  CHECK(input.hasSeq() || input.hasSubseq())
+      << "input of " << getName()
+      << " must be a sequence or a nested sequence.";
+  CHECK_EQ(input.value->getWidth(), 1UL)
+      << "input of " << getName()
+      << " is score over a sequence or a nested sequence, so its width "
+      << " must be 1.";
+
+  if (useGpu_) {
+    // this Layer runs only in CPU, if the model is runing on GPU,
+    // then copy the input to this layer from GPU to CPU.
+    Matrix::resizeOrCreate(scores_,
+                           inputScore->getHeight(),
+                           1,
+                           false /* trans */,
+                           false /* useGpu */);
+    scores_->copyFrom(*inputScore);
+  } else {
+    scores_ = inputScore;
+  }
+
+  Matrix::resizeOrCreate(
+      output_.value,
+      input.hasSubseq() ? input.getNumSubSequences() : input.getNumSequences(),
+      beamSize_,
+      false,
+      false);
+  output_.value->one();
+  output_.value->mulScalar(-1.);
+
+  kmaxScorePerSeq(scores_->getData(),
+                  output_.value->getData(),
+                  input.hasSubseq() ? input.subSequenceStartPositions
+                                    : input.sequenceStartPositions);
+}
+
+void KmaxSeqScoreLayer::backward(const UpdateCallback& callback) {}
+
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNBase.h
+++ b/paddle/gserver/layers/MKLDNNBase.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+typedef enum {
+  MKLDNN_BASE = 1,   // basical info of MKLDNN
+  MKLDNN_TESTS = 1,  // gtest info of MKLDNN
+  MKLDNN_SIZES = 2,  // size info of MKLDNN
+  MKLDNN_FMTS = 3,   // format info of MKLDNN
+  MKLDNN_ALL = 4,    // show all info of MKLDNN
+} MKLDNN_LOG_LEVEL;
+
+/**
+ * @brief MKLDNN CPU engine.
+ *
+ */
+class CPUEngine {
+public:
+  static CPUEngine& Instance() {
+    // Thread-safe in C++11.
+    static CPUEngine myInstance;
+    return myInstance;
+  }
+
+  // Disallow copy or move
+  CPUEngine(const CPUEngine&) = delete;             // Copy constructor
+  CPUEngine(CPUEngine&&) = delete;                  // Move constructor
+  CPUEngine& operator=(const CPUEngine&) = delete;  // Copy assignment
+  CPUEngine& operator=(CPUEngine&&) = delete;       // Move assignment
+
+  mkldnn::engine& getEngine() { return cpuEngine_; }
+
+protected:
+  CPUEngine() : cpuEngine_(mkldnn::engine::cpu, 0) {}
+  //    CPUEngine() : cpuEngine_(mkldnn::engine::cpu_lazy, 0) {}
+  ~CPUEngine() {}
+
+private:
+  mkldnn::engine cpuEngine_;
+};
+
+/**
+ * @brief MKLDNN Stream.
+ *
+ */
+class MKLDNNStream {
+public:
+  MKLDNNStream() : ready_(false) { resetState(); }
+
+  virtual ~MKLDNNStream() {}
+
+  /**
+   * @brief Submit stream
+   * @param prims The primitives vector
+   * @param block Waiting for the stream to complete
+   */
+  void submit(std::vector<mkldnn::primitive>& prims, bool block = true) {
+    resetState();
+    stream_->submit(prims).wait(block);
+    ready_ = false;
+  }
+
+  /**
+   * @brief Reset the mkldnn stream
+   */
+  void resetState() {
+    if (ready_) {
+      return;
+    }
+    // TODO(TJ): change me when mkldnn have method to reset this state
+    // stream_.reset(new mkldnn::stream(mkldnn::stream::kind::lazy));
+    stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
+    ready_ = true;
+  }
+
+private:
+  bool ready_;
+  std::shared_ptr<mkldnn::stream> stream_;
+};
+
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNFcLayer.cpp
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNFcLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+typedef inner_product_forward fc_fwd;
+typedef inner_product_backward_weights fc_bwdWgt;
+typedef inner_product_backward_data fc_bwdData;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_fc, MKLDNNFcLayer);
+
+bool MKLDNNFcLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+
+  CHECK_EQ(inputLayers_.size(), 1) << "Only support one input layer yet";
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK(!parameters_[0]->isSparse()) << "Do not support sparse yet";
+
+  // output size, cat not be changed
+  oc_ = getSize();
+  oh_ = 1;
+  ow_ = 1;
+
+  // input size can not change in FC
+  iLayerSize_ = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), iLayerSize_ * oc_);
+
+  // create weight
+  weight_ =
+      std::unique_ptr<Weight>(new Weight(oc_, iLayerSize_, parameters_[0], 0));
+
+  // create biases
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+  }
+  return true;
+}
+
+void MKLDNNFcLayer::convertWeightsFromPaddle() {
+  if (FLAGS_use_mkldnn_wgt) {
+    return;
+  }
+
+  if (hasInitedWgt_) {
+    return;
+  }
+
+  // The weight_ is transposed from initial paddle weight
+  MatrixPtr paddleWgt = Matrix::create(
+      weight_->getW()->getData(), iLayerSize_, oc_, false, false);
+
+  // TODO(TJ): remove this print when do not need differ weights
+  std::ostringstream ostr;
+  paddleWgt->print(ostr);
+  VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();
+
+  // The mkldnn weight is transposed from initial paddle matrix
+  MatrixPtr paddleWgtT;
+  paddleWgt->transpose(paddleWgtT, true);
+  weight_->getW()->copyFrom(*paddleWgtT);
+  hasInitedWgt_ = true;
+}
+
+void MKLDNNFcLayer::convertWeightsToPaddle() {
+  MatrixPtr dnnWgt = weight_->getW();
+  MatrixPtr paddleWgt;
+  dnnWgt->transpose(paddleWgt, true);
+
+  // copy paddle weight and override on weight_
+  MatrixPtr dnnWgtT = Matrix::create(
+      dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
+  dnnWgtT->copyFrom(*paddleWgt);
+}
+
+void MKLDNNFcLayer::reshape() {
+  const Argument& input = getInput(0);
+  int batchSize = input.getBatchSize();
+  if (bs_ == batchSize) {
+    return;
+  }
+  bs_ = batchSize;
+  ih_ = input.getFrameHeight();
+  iw_ = input.getFrameWidth();
+  if (ih_ == 0) {
+    ih_ = 1;
+  }
+  if (iw_ == 0) {
+    iw_ = 1;
+  }
+  hasSpatial_ = true;
+  if (ih_ == 1 && iw_ == 1) {
+    hasSpatial_ = false;
+  }
+  CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
+  ic_ = iLayerSize_ / (ih_ * iw_);
+  CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
+  CHECK_EQ(size_t(oc_), getSize());
+  printSizeInfo();
+
+  // reset output
+  output_.setFrameHeight(oh_);
+  output_.setFrameWidth(ow_);
+  resetOutput(bs_, oc_);
+
+  // reset mkldnn forward
+  resetFwd();
+  needResetBwd_ = true;
+
+  convertWeightsFromPaddle();
+}
+
+void MKLDNNFcLayer::resetFwd() {
+  bool hasBias = biases_ && biases_->getW();
+  real* iData = getInputValue(0)->getData();
+  real* oData = getOutputValue()->getData();
+  real* wData = weight_->getW()->getData();
+  real* bData = hasBias ? biases_->getW()->getData() : NULL;
+
+  // TODO(TJ): below create should be covered in MkldnnMatrix
+  // create memory desc
+  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
+                                 : createMD({bs_, ic_}, format::nc);
+  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
+                                 : createMD({oc_, ic_}, format::oi);
+  memory::desc bMD = bData != NULL ? createMD({oc_}, format::x)
+                                   : createMD({}, format::format_undef);
+  memory::desc oMD = createMD({bs_, oc_}, format::nc);
+
+  // create memory primitive desc and memory self
+  inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
+  wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData));
+  outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData));
+
+  prop_kind pk = prop_kind::forward;
+  fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD)
+                                       : fc_fwd::desc(pk, iMD, wMD, oMD);
+  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
+
+  if (bData != NULL) {
+    biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
+    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
+  } else {
+    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
+  }
+  pipelineFwd_.clear();
+  pipelineFwd_.push_back(*fwd_);
+}
+
+void MKLDNNFcLayer::resetBwd() {
+  if (!needResetBwd_) {
+    return;
+  }
+  needResetBwd_ = false;
+
+  bool hasBias = biases_ && biases_->getWGrad();
+  real* iData = getInputValue(0)->getData();
+  real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
+  real* oDiff = getOutputGrad()->getData();
+  real* wDiff = weight_->getWGrad()->getData();
+  real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;
+
+  /// backward weight
+  // create memory desc for backward memory
+  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
+                                 : createMD({bs_, ic_}, format::nc);
+  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
+                                 : createMD({oc_, ic_}, format::oi);
+  memory::desc oMD = createMD({bs_, oc_}, format::nc);
+  memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x)
+                                   : createMD({}, format::format_undef);
+
+  if (inVal_) {
+    // update data
+    inVal_->set_data_handle(iData);
+  } else {
+    inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
+  }
+
+  // create memory primitive desc and memory self
+  wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff));
+  outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));
+
+  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD);
+  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
+  fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL
+                                   ? fc_bwdWgt::desc(iMD, wMD, bMD, oMD)
+                                   : fc_bwdWgt::desc(iMD, wMD, oMD);
+  fc_bwdWgt::primitive_desc bwdWgtPD =
+      fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
+
+  if (bDiff != NULL) {
+    biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
+    bwdWgt_.reset(
+        new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
+  } else {
+    bwdWgt_.reset(new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_));
+  }
+  pipelineBwd_.clear();
+  pipelineBwd_.push_back(*bwdWgt_);
+
+  /// backward data
+  if (iDiff == NULL) {
+    return;
+  }
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD);
+  fc_bwdData::primitive_desc bwdDataPD =
+      fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
+  inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));
+  CHECK(wgtVal_) << "Should have weight memory";
+  bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+  pipelineBwd_.push_back(*bwdData_);
+}
+
+void MKLDNNFcLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  reshape();
+
+  {
+    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
+
+    // update input data
+    // since it might be changed if this is after data layer
+    real* iData = getInputValue(0)->getData();
+    inVal_->set_data_handle(iData);
+
+    // just submit forward pipeline
+    stream_->submit(pipelineFwd_);
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  {
+    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
+    resetBwd();
+
+    // update diff
+    real* oDiff = getOutputGrad()->getData();
+    outGrad_->set_data_handle(oDiff);
+
+    // just sumbmit backward pipeline
+    stream_->submit(pipelineBwd_);
+  }
+
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weight_->getParameterPtr()->incUpdate(callback);
+    if (biases_ && biases_->getWGrad()) {
+      biases_->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNFcLayer.h
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of MKLDNNLayer fc layer.
+ *
+ * The config file api is mkldnn_fc
+ */
+class MKLDNNFcLayer : public MKLDNNLayer {
+protected:
+  // input layer size, can not be change after init
+  size_t iLayerSize_;  // == ic * ih * iw
+
+  // if has already init the weight
+  bool hasInitedWgt_;
+
+  // if input layer has image size info (ih>1 && iw>1)
+  bool hasSpatial_;
+
+  // fc weight and bias
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+
+public:
+  explicit MKLDNNFcLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {}
+
+  ~MKLDNNFcLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void convertWeightsFromPaddle() override;
+
+  void convertWeightsToPaddle() override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+protected:
+  /**
+   * reshape the input image sizes
+   * and reset output buffer size
+   * and reset mkldnn forward
+   */
+  void reshape();
+
+  /**
+   * reset the forward primitve and memory
+   * only would be called when input size changes
+   */
+  void resetFwd();
+
+  /**
+   * reset the backward primitve and memory for mkldnn fc
+   * only would be called when needed
+   */
+  void resetBwd();
+};
+
+}  // namespace paddle
--- a/paddle/gserver/layers/MKLDNNLayer.h
+++ b/paddle/gserver/layers/MKLDNNLayer.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "MKLDNNBase.h"
+#include "mkldnn.hpp"
+
+DECLARE_bool(use_mkldnn);
+DECLARE_bool(use_mkldnn_wgt);
+
+namespace paddle {
+
+class MKLDNNLayer;
+typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
+
+/**
+ * @brief Base class of MKLDNNlayer.
+ *
+ */
+class MKLDNNLayer : public Layer {
+protected:
+  // batch size
+  int bs_;
+  // input image channel, height and width
+  int ic_, ih_, iw_;
+  // output image channel, height and width
+  int oc_, oh_, ow_;
+
+  // backward also need reset after reset forward handle
+  bool needResetBwd_;
+
+  // mkldnn engine, stream and primivtives
+  mkldnn::engine engine_;
+  std::shared_ptr<MKLDNNStream> stream_;
+  std::shared_ptr<mkldnn::primitive> fwd_;
+  std::shared_ptr<mkldnn::primitive> bwdWgt_;
+  std::shared_ptr<mkldnn::primitive> bwdData_;
+  std::vector<mkldnn::primitive> pipelineFwd_;
+  std::vector<mkldnn::primitive> pipelineBwd_;
+
+  // TODO(TJ): change below memory as MKLDNNMatrixPtr type
+  std::shared_ptr<mkldnn::memory> inVal_;
+  std::shared_ptr<mkldnn::memory> inGrad_;
+  std::shared_ptr<mkldnn::memory> outVal_;
+  std::shared_ptr<mkldnn::memory> outGrad_;
+  std::shared_ptr<mkldnn::memory> wgtVal_;
+  std::shared_ptr<mkldnn::memory> wgtGrad_;
+  std::shared_ptr<mkldnn::memory> biasVal_;
+  std::shared_ptr<mkldnn::memory> biasGrad_;
+
+public:
+  explicit MKLDNNLayer(const LayerConfig& config)
+      : Layer(config),
+        bs_(0),
+        ic_(0),
+        ih_(0),
+        iw_(0),
+        oc_(0),
+        oh_(0),
+        ow_(0),
+        needResetBwd_(true),
+        engine_(mkldnn::engine::cpu, 0),
+        stream_(nullptr),
+        fwd_(nullptr),
+        bwdWgt_(nullptr),
+        bwdData_(nullptr) {}
+
+  ~MKLDNNLayer() {}
+
+  virtual bool init(const LayerMap& layerMap,
+                    const ParameterMap& parameterMap) {
+    if (!Layer::init(layerMap, parameterMap)) {
+      return false;
+    }
+
+    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn."
+                            << "Please set WITH_MKLDNN=ON "
+                            << "and set use_mkldnn=True";
+    stream_.reset(new MKLDNNStream());
+    engine_ = CPUEngine::Instance().getEngine();
+
+    // TODO(TJ): deivecId
+    return true;
+  }
+
+  /**
+   * convert weight from paddle format to mkldnn format
+   * weight_ will be override
+   */
+  virtual void convertWeightsFromPaddle() {}
+
+  /**
+   * convert mkldnn weight to paddle format
+   * weight_ will be override
+   */
+  virtual void convertWeightsToPaddle() {}
+
+  /**
+   * print info about sizes
+   */
+  virtual void printSizeInfo() {
+    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
+                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
+                       << ", oh: " << oh_ << ", ow: " << ow_;
+  }
+
+  // TODO(TJ): move to MkldnnMatrix
+  // create memory desc
+  inline mkldnn::memory::desc createMD(
+      mkldnn::memory::dims dims,
+      mkldnn::memory::format fmt,
+      mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
+    // TODO(TJ): isFmtSuppoted(fmt)
+    return mkldnn::memory::desc(dims, type, fmt);
+  }
+};
+
+}  // namespace paddle
--- a/paddle/gserver/layers/SubNestedSequenceLayer.cpp
+++ b/paddle/gserver/layers/SubNestedSequenceLayer.cpp
@@ -96,7 +96,7 @@ void SubNestedSequenceLayer::calSelectedCols(
  for (size_t i = 0; i < seqNum; ++i) {
    for (size_t j = 0; j < beamSize; ++j) {
      if (selectedIndices->getElement(i, j) == -1.) break;
-      int selSubSeqIdx = selectedIndices->getElement(i, j);
+      size_t selSubSeqIdx = selectedIndices->getElement(i, j);
      CHECK_GT(inputSeqInfoVec_[i].size() - 1, selSubSeqIdx);

      size_t subSeqLen = inputSeqInfoVec_[i][selSubSeqIdx + 1] -
@@ -135,7 +135,7 @@ void SubNestedSequenceLayer::forward(PassType passType) {
  CHECK(inputSeq.hasSubseq()) << "The first input of SubNestSequence layer "
                              << "must be a nested sequence.";
  const MatrixPtr selectedIndices = getInputValue(1);
-  CHECK_EQ(inputSeq.getNumSequences(), selectedIndices->getHeight());
+  CHECK_EQ(size_t(inputSeq.getNumSequences()), selectedIndices->getHeight());

  if (dynamic_cast<GpuMatrix*>(selectedIndices.get())) {
    /*

--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -9,7 +9,7 @@ add_unittest_without_exec(test_ProtoDataProvider
 # mkdir will get error.
 add_test(NAME test_ProtoDataProvider
    COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)

 ################# test_LayerGrad #######################
 add_unittest_without_exec(test_LayerGrad
@@ -18,6 +18,15 @@ add_unittest_without_exec(test_LayerGrad
 add_test(NAME test_LayerGrad
    COMMAND test_LayerGrad)

+########## test_Mkldnn layers and activations ##########
+if(WITH_MKLDNN)
+    add_unittest_without_exec(test_MKLDNN
+        test_MKLDNN.cpp
+        MKLDNNTester.cpp
+        LayerGradUtil.cpp)
+    add_test(NAME test_MKLDNN COMMAND test_MKLDNN)
+endif()
+
 ################ test_CRFLayerGrad ####################
 add_unittest_without_exec(test_CRFLayerGrad
    test_CRFLayerGrad.cpp
@@ -66,6 +75,16 @@ add_unittest_without_exec(test_BatchNorm

 add_test(NAME test_BatchNorm
    COMMAND test_BatchNorm)
+
+
+################# test_KmaxSeqScore #######################
+add_unittest_without_exec(test_KmaxSeqScore
+    test_KmaxSeqScore.cpp
+    LayerGradUtil.cpp)
+
+add_test(NAME test_KmaxSeqScore
+    COMMAND test_KmaxSeqScore)
+
 ################## test_Evaluator #######################
 add_unittest(test_Evaluator
    test_Evaluator.cpp)
@@ -82,8 +101,8 @@ if(WITH_PYTHON)
        test_PyDataProvider.cpp)

    add_test(NAME test_PyDataProvider
-        COMMAND .set_python_path.sh -d ./gserver/tests:${PROJ_ROOT}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+        COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()

 ############### test_RecurrentLayer #######################
@@ -96,7 +115,7 @@ if(NOT WITH_DOUBLE)

    add_test(NAME test_WarpCTCLayer
        COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()

 ############### test_RecurrentGradientMachine ###############
@@ -106,20 +125,20 @@ add_unittest_without_exec(test_RecurrentGradientMachine
    test_RecurrentGradientMachine.cpp)
 add_test(NAME test_RecurrentGradientMachine
    COMMAND .set_python_path.sh -d
-            ${PROJ_ROOT}/python:${PROJ_ROOT}/paddle/gserver/tests
+            ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
            ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)

 add_unittest_without_exec(test_NetworkCompare
    test_NetworkCompare.cpp)
 if(WITH_GPU)
    add_test(NAME test_NetworkCompare
-        COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 else()
    add_test(NAME test_NetworkCompare
-        COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()


@@ -127,6 +146,6 @@ add_unittest_without_exec(test_PyDataProvider2
        test_PyDataProvider2.cpp)

 add_test(NAME test_PyDataProvider2
-   COMMAND .set_python_path.sh -d ${PROJ_ROOT}/paddle/gserver/tests:${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+   COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
 )
--- a/paddle/gserver/tests/MKLDNNTester.cpp
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNTester.h"
+#include "paddle/gserver/layers/MKLDNNBase.h"
+#include "paddle/gserver/layers/MKLDNNLayer.h"
+
+namespace paddle {
+
+// init data layer and test layer of both dnn and reference
+void MKLDNNTester::reset(const TestConfig& dnn,
+                         const TestConfig& ref,
+                         size_t batchSize) {
+  const bool trans = false;
+  const bool useGpu = false;
+
+  // clear
+  configs_.clear();
+  layerNames_.clear();
+  dataLayers_.clear();
+  datas_.clear();
+  layerMaps_.clear();
+  parameters_.clear();
+  testLayers_.clear();
+
+  // resize
+  configs_.resize(NUM);
+  layerNames_.resize(NUM);
+  dataLayers_.resize(NUM);
+  datas_.resize(NUM);
+  layerMaps_.resize(NUM);
+  parameters_.resize(NUM);
+  testLayers_.resize(NUM);
+
+  // reset configs and layer names
+  configs_[DNN] = dnn;
+  configs_[REF] = ref;
+  layerNames_[DNN] = "mkldnn";     // the first is mkldnn layer
+  layerNames_[REF] = "reference";  // second is reference layer
+
+  // reset others
+  for (size_t i = 0; i < NUM; ++i) {
+    configs_[i].layerConfig.set_name(layerNames_[i]);
+    initDataLayer(configs_[i],
+                  &(dataLayers_[i]),
+                  &(datas_[i]),
+                  &(layerMaps_[i]),
+                  layerNames_[i],
+                  batchSize,
+                  trans,
+                  useGpu);
+    initTestLayer(
+        configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i]));
+  }
+  dnnLayer_ = testLayers_[DNN];
+  refLayer_ = testLayers_[REF];
+  EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size());
+  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+
+  setInputImgSize();
+}
+
+void MKLDNNTester::setInputImgSize() {
+  for (size_t n = 0; n < dataLayers_.size(); ++n) {
+    for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
+      // TODO(TJ): fix me when concat and elewise ready
+      dataLayers_[n][i]->getOutput().setFrameHeight(ih_);
+      dataLayers_[n][i]->getOutput().setFrameWidth(iw_);
+    }
+  }
+}
+
+// init randome parameters of ref, and copy to mkldnn
+void MKLDNNTester::randomWgtDatas() {
+  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  for (size_t i = 0; i < parameters_[REF].size(); ++i) {
+    const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
+    parameters_[REF][i]->randomize();
+    dnnValue->copyFrom(*refValue);
+
+    VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName();
+    printVector(dnnValue);
+  }
+}
+
+// random botdata of ref layer and copy same to mkldnn
+void MKLDNNTester::randomBotDatas() {
+  CHECK_EQ(dataLayers_.size(), NUM);
+  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
+    dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
+    dataLayers_[DNN][i]->getOutputValue()->copyFrom(
+        *(dataLayers_[REF][i]->getOutputValue()));
+    VLOG(lvl_) << "Input " << i << " data:";
+    printMatrix(dataLayers_[REF][i]->getOutputValue());
+  }
+}
+
+void MKLDNNTester::randomTopDiffs() {
+  refLayer_->getOutputGrad()->randomizeUniform();
+  dnnLayer_->getOutputGrad()->copyFrom(*(refLayer_->getOutputGrad()));
+  VLOG(lvl_) << "Random dom Backward Input, TopDiff: ";
+  printMatrix(refLayer_->getOutputGrad());
+}
+
+void MKLDNNTester::checkForward() {
+  printTopDatas();
+  double delta = compareMatrix(testLayers_[DNN]->getOutputValue(),
+                               testLayers_[REF]->getOutputValue());
+  VLOG(MKLDNN_ALL) << "Check Forward";
+  EXPECT_LE(fabs(delta), eps_);
+}
+
+void MKLDNNTester::checkBackwardData() {
+  // TODO(TJ): uncomment me when batch norm ready
+  // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
+  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
+    const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
+    const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
+    VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i;
+    printMatrix(dnnDiff);
+    VLOG(lvl_) << "Reference Backward Output BotDiff " << i;
+    printMatrix(refDiff);
+
+    double delta = compareMatrix(dnnDiff, refDiff);
+    EXPECT_LE(fabs(delta), eps_);
+    // TODO(TJ): uncomment me when batch norm ready
+    // if (isBN) {
+    //  // the other two inputs in batch norm are for moving mean and var
+    //  break;
+    // }
+  }
+}
+
+void MKLDNNTester::checkBackwardWgts() {
+  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  vector<VectorPtr> dnnWgts;  // used to temply save mkldnn weights
+  saveWgt(parameters_[DNN], dnnWgts);
+
+  const MKLDNNLayerPtr dnnlayer =
+      std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
+  CHECK(dnnlayer);
+  dnnlayer->convertWeightsToPaddle();
+  for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
+    const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
+    VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName();
+    printVector(dnn);
+    VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName();
+    printVector(ref);
+
+    double delta = compareVector(dnn, ref);
+    EXPECT_LE(fabs(delta), eps_);
+  }
+
+  VLOG(MKLDNN_ALL) << "Restore dnn weights before comapre";
+  restoreWgt(dnnWgts, parameters_[DNN]);
+}
+
+void MKLDNNTester::saveWgt(const vector<ParameterPtr>& from,
+                           vector<VectorPtr>& to) {
+  const bool useGpu = false;
+  to.resize(from.size());
+  for (size_t i = 0; i < to.size(); ++i) {
+    const VectorPtr& wgt = from[i]->getBuf(PARAMETER_VALUE);
+    to[i] = Vector::create(wgt->getSize(), useGpu);
+    to[i]->copyFrom(*wgt);
+  }
+}
+
+void MKLDNNTester::restoreWgt(const vector<VectorPtr>& from,
+                              vector<ParameterPtr>& to) {
+  CHECK_EQ(from.size(), to.size());
+  for (size_t i = 0; i < from.size(); ++i) {
+    const VectorPtr& wgt = to[i]->getBuf(PARAMETER_VALUE);
+    wgt->copyFrom(*from[i]);
+  }
+}
+
+// clear parameters grad
+void MKLDNNTester::clearWgtDiffs() {
+  for (size_t n = 0; n < parameters_.size(); ++n) {
+    for (size_t i = 0; i < parameters_[n].size(); ++i) {
+      const VectorPtr& grad = parameters_[n][i]->getBuf(PARAMETER_GRADIENT);
+      if (grad) {
+        grad->zeroMem();
+      }
+    }
+  }
+}
+
+void MKLDNNTester::clearBotDiffs() {
+  // dnn and ref
+  for (size_t n = 0; n < dataLayers_.size(); ++n) {
+    // all inputs layers
+    for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
+      dataLayers_[n][i]->getOutputGrad()->zeroMem();
+    }
+  }
+}
+
+void MKLDNNTester::clearBotDiffs(int n) {
+  CHECK_LT(n, NUM);
+  // all inputs layers
+  for (size_t i = 0; i < dataLayers_[n].size(); ++i) {
+    dataLayers_[n][i]->getOutputGrad()->zeroMem();
+  }
+}
+
+void MKLDNNTester::clearTopDatas() {
+  for (size_t i = 0; i < testLayers_.size(); ++i) {
+    testLayers_[i]->getOutputValue()->zeroMem();
+  }
+}
+
+void MKLDNNTester::printTopDatas() {
+  if (!log_) {
+    return;
+  }
+
+  for (int n = 0; n < NUM; ++n) {
+    VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: ";
+    printMatrix(testLayers_[n]->getOutputValue());
+  }
+}
+
+void MKLDNNTester::printMatrix(const MatrixPtr& m) {
+  if (!log_) {
+    return;
+  }
+
+  std::ostringstream ostr;
+  m->print(ostr);
+  VLOG(lvl_) << std::endl << ostr.str();
+}
+
+void MKLDNNTester::printVector(const VectorPtr& v) {
+  if (!log_) {
+    return;
+  }
+
+  std::ostringstream ostr;
+  v->print(ostr, v->getSize());
+  VLOG(lvl_) << std::endl << ostr.str();
+}
+
+double MKLDNNTester::getDelta(const real* d1,
+                              const real* d2,
+                              size_t len,
+                              const float failRate,
+                              const float thres) {
+  double delta = 0, sum = 0;
+  int failCnt = 0;
+  const double eps = 1e-5;
+  double maxOut = 0;
+  for (size_t i = 0; i < len; ++i) {
+    double ref = fabs(d2[i]);
+    double diff = fabs(d1[i] - d2[i]);
+    delta += diff;
+    sum += ref;
+    if (ref > eps && fabs(d1[i]) > eps && diff / ref > thres) {
+      maxOut = std::max(maxOut, diff / ref);
+      failCnt++;
+    }
+  }
+  EXPECT_TRUE(std::isnormal(sum));
+  EXPECT_FALSE(std::isinf(sum));
+  EXPECT_FALSE(std::isnan(delta));
+  VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len
+                   << ", delta: " << delta / sum << ", failCnt:" << failCnt;
+  return (failCnt / (float)len) > failRate ? maxOut : delta / sum;
+}
+
+double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) {
+  CHECK_EQ(m1->getElementCnt(), m2->getElementCnt());
+  return getDelta(m1->getData(), m2->getData(), m1->getElementCnt());
+}
+
+double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) {
+  CHECK_EQ(v1->getSize(), v2->getSize());
+  return getDelta(v1->getData(), v2->getData(), v1->getSize());
+}
+
+void MKLDNNTester::runOnce() {
+  // test forward
+  randomBotDatas();
+  dnnLayer_->forward(PASS_TRAIN);
+  refLayer_->forward(PASS_TRAIN);
+  checkForward();
+
+  // test backward
+  randomTopDiffs();
+  dnnLayer_->backward(nullptr);
+  refLayer_->backward(nullptr);
+  checkBackwardData();
+  checkBackwardWgts();
+
+  // clear buffers
+  // ref code will addto the diff, dnn code will writeto it
+  // and clearTopDatas() and clearWgtDiffs() should be coverd by test layers
+  clearBotDiffs(REF);
+}
+
+void MKLDNNTester::run(const TestConfig& dnn,
+                       const TestConfig& ref,
+                       size_t batchSize,
+                       size_t inputImgH,
+                       size_t inputImgW,
+                       size_t iter,
+                       float epsilon,
+                       bool log,
+                       int level) {
+  VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " << dnn.layerConfig.type()
+                     << " vs " << ref.layerConfig.type();
+  ih_ = inputImgH;
+  iw_ = inputImgW;
+  iter_ = iter;
+  eps_ = epsilon;
+  log_ = log;
+  lvl_ = level;
+
+  // Firstly test FLAGS_use_mkldnn_wgt = false
+  FLAGS_use_mkldnn_wgt = false;
+  // reset and run once
+  reset(dnn, ref, batchSize);
+  randomWgtDatas();
+  clearWgtDiffs();
+  clearBotDiffs();
+  for (size_t i = 0; i < iter_; ++i) {
+    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
+    runOnce();
+  }
+
+  // Then test FLAGS_use_mkldnn_wgt = true
+  FLAGS_use_mkldnn_wgt = true;
+  // after run once the mkldnn weight has been stored in dnnlayer
+  // then save the weights and restart again
+  vector<VectorPtr> dnnWgts, refWgts;
+  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  saveWgt(parameters_[DNN], dnnWgts);
+  saveWgt(parameters_[REF], refWgts);
+
+  // restart again with flag true
+  reset(dnn, ref, batchSize);
+
+  // restore wgt
+  restoreWgt(dnnWgts, parameters_[DNN]);
+  restoreWgt(refWgts, parameters_[REF]);
+  clearWgtDiffs();
+  clearBotDiffs();
+
+  for (size_t i = 0; i < iter_; ++i) {
+    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
+    runOnce();
+  }
+}
+
+}  //  namespace paddle
--- a/paddle/gserver/tests/MKLDNNTester.h
+++ b/paddle/gserver/tests/MKLDNNTester.h
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "LayerGradUtil.h"
+#include "paddle/gserver/layers/MKLDNNBase.h"
+
+namespace paddle {
+
+/**
+ * @brief test the functionality of Mkldnnlayers
+ * refer to paddle original function
+ */
+class MKLDNNTester {
+  enum {
+    DNN = 0,  // MKLDNN layer
+    REF = 1,  // Reference layer
+    NUM = 2,  // Number of total
+  };
+
+protected:
+  std::vector<TestConfig> configs_;
+  vector<string> layerNames_;
+  vector<vector<DataLayerPtr>> dataLayers_;
+  vector<vector<Argument>> datas_;
+  vector<LayerMap> layerMaps_;
+  vector<vector<ParameterPtr>> parameters_;
+  vector<LayerPtr> testLayers_;
+  LayerPtr dnnLayer_, refLayer_;
+
+  /// run some iterations, all the result should pass
+  size_t iter_;
+  /// whether to print out the details
+  bool log_;
+  /// vlog level to print the matrix details datas
+  int lvl_;
+  /// epsilon
+  float eps_;
+  /// input image size, default 1
+  size_t ih_, iw_;
+
+public:
+  explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4) {
+    iter_ = iter;
+    eps_ = epsilon;
+    log_ = false;
+    lvl_ = MKLDNN_ALL;
+  }
+
+  ~MKLDNNTester() {}
+
+public:
+  void run(const TestConfig& dnn,
+           const TestConfig& ref,
+           size_t batchSize,
+           size_t inputImgH = 1,
+           size_t inputImgW = 1,
+           size_t iter = 3,
+           float epsilon = 1e-4,
+           bool log = false,
+           int level = MKLDNN_ALL);
+  void setLogLevel(int lvl) { lvl_ = lvl; }
+
+private:
+  void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
+  void setInputImgSize();
+  void runOnce();
+
+  void randomWgtDatas();
+  void randomBotDatas();
+  void randomTopDiffs();
+
+  void checkForward();
+  void checkBackwardData();
+  void checkBackwardWgts();
+
+  void clearWgtDiffs();
+  void clearBotDiffs();
+  void clearBotDiffs(int n);  // clear specific layer
+  void clearTopDatas();
+
+  void printTopDatas();
+  void printMatrix(const MatrixPtr& m);
+  void printVector(const VectorPtr& v);
+
+  void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
+  void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
+
+  double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
+  double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+
+  /**
+   * Get delta percent
+   * if many(>failRate) wrong(abs(dnn-ref)/abs(ref)>thres) points return the
+   * max(diff/ref)
+   * else return sum(abs(a-b)) / sum(abs(b))
+   * The return value should smaller than eps when passing.
+   */
+  double getDelta(const real* d1,
+                  const real* d2,
+                  size_t len,
+                  const float failRate = 1e-3,
+                  const float thres = 0.1);
+};
+
+}  //  namespace paddle
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <algorithm>
+#include <string>
+#include <vector>
+#include "ModelConfig.pb.h"
+#include "paddle/gserver/layers/DataLayer.h"
+#include "paddle/trainer/Trainer.h"
+#include "paddle/utils/GlobalConstants.h"
+
+#include "LayerGradUtil.h"
+#include "paddle/testing/TestUtil.h"
+
+using namespace paddle;  // NOLINT
+using namespace std;     // NOLINT
+
+DECLARE_bool(use_gpu);
+DECLARE_int32(gpu_id);
+DECLARE_bool(thread_local_rand_use_global_seed);
+
+vector<int> randSampling(int range, int n) {
+  CHECK_GE(range, n);
+  vector<int> num(range);
+  iota(begin(num), end(num), 0);
+  if (range == n) return num;
+
+  random_shuffle(begin(num), end(num));
+  num.resize(n);
+  return num;
+}
+
+void genRandomSeqInfo(vector<int>& seqStartPosition,
+                      vector<int>& subSeqStartPosition) {
+  const int maxSeqNum = 100;
+  // generate random start position information
+  int seqNum = 1 + (rand() % maxSeqNum);
+  seqStartPosition.resize(seqNum + 1, 0);
+  subSeqStartPosition.resize(1, 0);
+
+  for (int i = 0; i < seqNum; ++i) {
+    int subSeqLen = 1 + (rand() % maxSeqNum);
+    for (int j = 0; j < subSeqLen; ++j)
+      subSeqStartPosition.push_back(subSeqStartPosition.back() + subSeqLen);
+    seqStartPosition[i + 1] = subSeqStartPosition.back();
+  }
+}
+
+void genRandomGroundTruth(real* values,
+                          vector<vector<int>>& groundTruth,
+                          vector<int>& startPos,
+                          size_t beamSize) {
+  groundTruth.resize(startPos.size() - 1, vector<int>(beamSize, -1));
+  for (size_t i = 0; i < startPos.size() - 1; ++i) {
+    int seqLen = startPos[i + 1] - startPos[i];
+    vector<int> pos =
+        randSampling(seqLen, min(static_cast<int>(beamSize), seqLen));
+    for (size_t j = 0; j < pos.size(); ++j) {
+      groundTruth[i][j] = pos[j];
+      values[startPos[i] + pos[j]] = 1.;
+    }
+  }
+}
+
+void checkLayerOut(vector<vector<int>> groundTruth,
+                   real* layerOut,
+                   size_t beamSize) {
+  for (size_t i = 0; i < groundTruth.size(); ++i) {
+    int begPos = i * beamSize;
+    vector<real> tmp(layerOut + begPos, layerOut + begPos + beamSize);
+    sort(begin(tmp), end(tmp));
+    sort(begin(groundTruth[i]), end(groundTruth[i]));
+    for (size_t j = 0; j < beamSize; ++j) CHECK_EQ(tmp[j], groundTruth[i][j]);
+  }
+}
+
+TEST(Layer, kmaxSeqScoreLayer) {
+  const size_t maxBeamSize = 100;
+  size_t beamSize = 1 + (rand() % maxBeamSize);
+
+  vector<int> seqStartPosition;
+  vector<int> subSeqStartPosition;
+  genRandomSeqInfo(seqStartPosition, subSeqStartPosition);
+  MatrixPtr inValue =
+      Matrix::create(subSeqStartPosition.back(), 1, false, false);
+
+  std::vector<bool> mode = {false};
+#ifndef PADDLE_ONLY_CPU
+  mode.push_back(true);
+#endif
+
+  for (auto hasSubseq : {false, true}) {
+    vector<vector<int>> groundTruth;
+    inValue->randomizeUniform();
+    genRandomGroundTruth(inValue->getData(),
+                         groundTruth,
+                         hasSubseq ? subSeqStartPosition : seqStartPosition,
+                         beamSize);
+
+    for (auto useGpu : mode) {
+      TestConfig config;
+      config.layerConfig.set_type("kmax_seq_score");
+      config.layerConfig.set_beam_size(beamSize);
+
+      if (hasSubseq) {
+        config.inputDefs.push_back({INPUT_SELF_DEFINE_DATA,
+                                    "scores",
+                                    inValue,
+                                    seqStartPosition,
+                                    subSeqStartPosition});
+      } else {
+        config.inputDefs.push_back(
+            {INPUT_SELF_DEFINE_DATA, "scores", inValue, seqStartPosition});
+      }
+      config.layerConfig.add_inputs();
+
+      // data layer initialize
+      std::vector<DataLayerPtr> dataLayers;
+      LayerMap layerMap;
+      vector<Argument> datas;
+      initDataLayer(
+          config,
+          &dataLayers,
+          &datas,
+          &layerMap,
+          "kmax_seq_score",
+          100 /* actually this parameter is unused in self-defined input*/,
+          false,
+          useGpu);
+      // test layer initialize
+      std::vector<ParameterPtr> parameters;
+      LayerPtr kmaxSeqScoreLayer;
+      FLAGS_use_gpu = useGpu;
+      initTestLayer(config, &layerMap, &parameters, &kmaxSeqScoreLayer);
+      kmaxSeqScoreLayer->forward(PASS_TRAIN);
+
+      const MatrixPtr outValue = kmaxSeqScoreLayer->getOutputValue();
+      CHECK_EQ(outValue->getHeight(),
+               hasSubseq ? subSeqStartPosition.size() - 1
+                         : seqStartPosition.size() - 1);
+      CHECK_EQ(outValue->getWidth(), beamSize);
+      checkLayerOut(groundTruth, outValue->getData(), beamSize);
+    }
+  }
+}
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand((size_t)(time(NULL)));
+  return RUN_ALL_TESTS();
+}
--- a/paddle/gserver/tests/test_MKLDNN.cpp
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "MKLDNNTester.h"
+#include "ModelConfig.pb.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_bool(use_mkldnn);
+
+struct testFCDesc {
+  int bs;
+  int ic;
+  int oc;
+  int ih, iw;  // oh == ow == 1
+};
+
+void testFcLayer(const testFCDesc& pm) {
+  const std::string compareTypes[] = {"mkldnn_fc", "fc"};
+  TestConfig cfg;
+  cfg.layerConfig.set_type(compareTypes[0]);
+  cfg.layerConfig.set_size(pm.oc);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)});
+  cfg.layerConfig.add_inputs();
+
+  MKLDNNTester tester;
+  for (auto biasSize : {pm.oc, 0}) {
+    cfg.biasSize = biasSize;
+    TestConfig ref = cfg;
+    ref.layerConfig.set_type(compareTypes[1]);
+    for (auto bs : {pm.bs, 1}) {
+      tester.run(cfg, ref, bs, pm.ih, pm.iw);
+    }
+  }
+}
+
+TEST(MKLDNNLayer, FcLayer) {
+  testFcLayer({/*bs*/ 2, /*ic*/ 2, /*oc*/ 3, /*ih*/ 1, /*iw*/ 1});
+  testFcLayer({/*bs*/ 3, /*ic*/ 7, /*oc*/ 19, /*ih*/ 1, /*iw*/ 1});
+  testFcLayer({/*bs*/ 8, /*ic*/ 16, /*oc*/ 32, /*ih*/ 13, /*iw*/ 13});
+  testFcLayer({/*bs*/ 4, /*ic*/ 12, /*oc*/ 18, /*ih*/ 13, /*iw*/ 11});
+  testFcLayer({/*bs*/ 2, /*ic*/ 64, /*oc*/ 32, /*ih*/ 16, /*iw*/ 16});
+  testFcLayer({/*bs*/ 15, /*ic*/ 3, /*oc*/ 6, /*ih*/ 16, /*iw*/ 16});
+}
+
+// TODO(TJ): add branch test
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  FLAGS_use_gpu = false;
+  FLAGS_use_mkldnn = true;
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);
+  return RUN_ALL_TESTS();
+}
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -15,13 +15,13 @@
 file(GLOB MATH_HEADERS . *.h)
 file(GLOB MATH_SOURCES . *.cpp)
 set(MATH_SOURCES
-    "${PROJ_ROOT}/paddle/math/BaseMatrix.cu"
-    "${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu"
+    "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
+    "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
    ${MATH_SOURCES})
 if(NOT WITH_GPU)
    # then compile BaseMatrix.cu as c++ file
-    compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu")
-    compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu")
+    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu")
+    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu")
    add_library(paddle_math STATIC
        ${MATH_SOURCES})
 else()

--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -302,6 +302,10 @@ public:
  bool isSparse() const { return true; }

 private:
+  using Matrix::mul;
  using Matrix::copyFrom;
+  using Matrix::rowMax;
+  using Matrix::print;
+  using Matrix::subMatrix;
 };
 }  // namespace paddle
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -231,6 +231,9 @@ public:
 private:
  using Matrix::mul;
  using Matrix::copyFrom;
+  using Matrix::rowMax;
+  using Matrix::print;
+  using Matrix::subMatrix;
 };

 }  // namespace paddle
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -41,28 +41,28 @@ function(op_library TARGET)
    endif()
 endfunction()

+cc_test(gather_test SRCS gather_test.cc DEPS tensor)
+
 cc_library(net_op SRCS net_op.cc DEPS op_registry)
 cc_test(net_op_test SRCS net_op_test.cc DEPS net_op)

 op_library(add_op SRCS add_op.cc add_op.cu)
-cc_test(add_op_test SRCS add_op_test.cc DEPS add_op)

 op_library(mean_op SRCS mean_op.cc mean_op.cu)
-cc_test(mean_op_test SRCS mean_op_test.cc DEPS mean_op)

 op_library(mul_op SRCS mul_op.cc mul_op.cu)
 op_library(rowwise_add_op SRCS rowwise_add_op.cu rowwise_add_op.cc)

 op_library(sigmoid_op SRCS sigmoid_op.cc sigmoid_op.cu)
 op_library(softmax_op SRCS softmax_op.cc softmax_op.cu)
+op_library(gaussian_random_op SRCS gaussian_random_op.cc gaussian_random_op.cu)
 op_library(cross_entropy_op SRCS cross_entropy_op.cc cross_entropy_op.cu)
 op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu)

 op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)

-op_library(fc_op
-    SRCS fc_op.cc
-    DEPS mul_op rowwise_add_op sigmoid_op softmax_op net_op)
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
    DEPS op_desc tensor op_registry operator net_op)
 cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
+op_library(uniform_random_op
+        SRCS uniform_random_op.cc uniform_random_op.cu)
--- a/paddle/operators/add_op.cc
+++ b/paddle/operators/add_op.cc
@@ -17,9 +17,10 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-class AddOp : public OperatorWithKernel {
+class AddOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(AddOp, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 2);
    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1);
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), "Inputs of AddOp must all be set");
@@ -31,9 +32,9 @@ class AddOp : public OperatorWithKernel {
  }
 };

-class AddOpMaker : public OpProtoAndCheckerMaker {
+class AddOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  AddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  AddOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of add op");
    AddInput("Y", "The second input of add op");
@@ -46,14 +47,18 @@ The equation is: Out = X + Y
  }
 };

-class AddOpGrad : public OperatorWithKernel {
+class AddOpGrad : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(AddOpGrad, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {}
+  void InferShape(const framework::InferShapeContext &ctx) const override {}
 };

 }  // namespace operators
 }  // namespace paddle

+namespace ops = paddle::operators;
 REGISTER_OP(add_two, ops::AddOp, ops::AddOpMaker);
 REGISTER_GRADIENT_OP(add_two, add_two_grad, ops::AddOpGrad);
-REGISTER_OP_CPU_KERNEL(add_two, ops::AddKernel<ops::CPUPlace, float>);
+
+REGISTER_OP_CPU_KERNEL(add_two,
+                       ops::AddKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/add_op.cu
+++ b/paddle/operators/add_op.cu
@@ -16,4 +16,6 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/add_op.h"

-REGISTER_OP_GPU_KERNEL(add_two, ops::AddKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(add_two,
+                       ops::AddKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/add_op.h
+++ b/paddle/operators/add_op.h
@@ -13,15 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class AddKernel : public OpKernel {
+class AddKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto input0 = context.Input<Tensor>(0);
    auto input1 = context.Input<Tensor>(1);
    auto output = context.Output<Tensor>(0);

--- a/paddle/operators/cross_entropy_op.cc
+++ b/paddle/operators/cross_entropy_op.cc
@@ -17,9 +17,10 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-class OnehotCrossEntropyOp : public OperatorWithKernel {
+class OnehotCrossEntropyOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(OnehotCrossEntropyOp, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 2,
                      "Input size of OnehotCrossEntropyOp must be two");
    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1,
@@ -37,9 +38,11 @@ class OnehotCrossEntropyOp : public OperatorWithKernel {
  }
 };

-class OnehotCrossEntropyGradientOp : public OperatorWithKernel {
+class OnehotCrossEntropyGradientOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(OnehotCrossEntropyGradientOp,
+                       framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    auto X_grad = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto X = ctx.Input<Tensor>("X");

@@ -48,9 +51,10 @@ class OnehotCrossEntropyGradientOp : public OperatorWithKernel {
  }
 };

-class OnehotCrossEntropyOpMaker : public OpProtoAndCheckerMaker {
+class OnehotCrossEntropyOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  OnehotCrossEntropyOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  OnehotCrossEntropyOpMaker(framework::OpProto *proto,
+                            framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of OnehotCrossEntropyOp");
    AddInput("label", "The second input of OnehotCrossEntropyOp");
@@ -66,11 +70,14 @@ OnehotCrossEntropy Operator.
 }  // namespace operators
 }  // namespace paddle

+namespace ops = paddle::operators;
 REGISTER_OP(onehot_cross_entropy, ops::OnehotCrossEntropyOp,
            ops::OnehotCrossEntropyOpMaker);
-REGISTER_OP_CPU_KERNEL(onehot_cross_entropy,
-                       ops::OnehotCrossEntropyOpKernel<ops::CPUPlace, float>);
-
+REGISTER_OP_CPU_KERNEL(
+    onehot_cross_entropy,
+    ops::OnehotCrossEntropyOpKernel<paddle::platform::CPUPlace, float>);
+REGISTER_GRADIENT_OP(onehot_cross_entropy, onehot_cross_entropy_grad,
+                     ops::OnehotCrossEntropyGradientOp);
 REGISTER_OP_CPU_KERNEL(
    onehot_cross_entropy_grad,
-    ops::OnehotCrossEntropyGradientOpKernel<ops::CPUPlace, float>);
+    ops::OnehotCrossEntropyGradientOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/cross_entropy_op.cu
+++ b/paddle/operators/cross_entropy_op.cu
@@ -14,3 +14,8 @@

 #define EIGEN_USE_GPU
 #include "paddle/operators/cross_entropy_op.h"
+
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    onehot_cross_entropy,
+    ops::OnehotCrossEntropyOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/cross_entropy_op.h
+++ b/paddle/operators/cross_entropy_op.h
@@ -13,11 +13,13 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+
 template <typename T>
 T tolerable_value(T x) {
  static_assert(std::is_floating_point<T>::value,
@@ -38,9 +40,9 @@ T tolerable_value(T x) {
 }

 template <typename Place, typename T>
-class OnehotCrossEntropyOpKernel : public OpKernel {
+class OnehotCrossEntropyOpKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
    auto X = ctx.Input<Tensor>("X");
    const T* Xdata = X->data<T>();
    const int* label_data = ctx.Input<Tensor>(1)->data<int>();
@@ -61,9 +63,9 @@ class OnehotCrossEntropyOpKernel : public OpKernel {
 };

 template <typename Place, typename T>
-class OnehotCrossEntropyGradientOpKernel : public OpKernel {
+class OnehotCrossEntropyGradientOpKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
    auto X = ctx.Input<Tensor>("X");
    auto dX = ctx.Output<Tensor>(framework::GradVarName("X"));
    auto dY = ctx.Input<Tensor>(framework::GradVarName("Y"));

--- a/paddle/operators/fc_op.cc
+++ b/paddle/operators/fc_op.cc
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "type_alias.h"
-
-namespace paddle {
-namespace operators {
-
-class FullyConnectedOp : public NetOp {
- public:
-  void Init() override {
-    AddOp(OpRegistry::CreateOp("mul",
-                               {
-                                   Input("X"), Input("W"),
-                               },
-                               {Output("before_act")}, {}));
-    auto b = Input("b");
-    if (b != framework::kEmptyVarName) {
-      AddOp(OpRegistry::CreateOp("rowwise_add",
-                                 {Output("before_act"), Input("b")},
-                                 {Output("before_act")}, {}));
-    }
-
-    auto activation = GetAttr<std::string>("activation");
-    AddOp(OpRegistry::CreateOp(activation, {Output("before_act")},
-                               {Output("Y")}, {}));
-    CompleteAddOp(false);
-  }
-};
-
-class FullyConnectedOpMaker : public OpProtoAndCheckerMaker {
- public:
-  FullyConnectedOpMaker(OpProto *proto, OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input of fc operator");
-    AddInput("W", "the weight of fc operator");
-    AddInput("b", "the bias of fc operator");
-
-    AddOutput("Y", "the output of fc operator");
-    AddOutput("before_act", "the before activation output of fc operator")
-        .SetTemporary();
-    AddAttr<std::string>("activation", "The activation key for fc layer")
-        .SetDefault("sigmoid")
-        .InEnum({"sigmoid", "softmax"});
-
-    //! TODO(yuyang18): Complete comment;
-    AddComment("FullyConnected Operator");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-USE_OP(mul);
-USE_OP(rowwise_add);
-USE_OP(sigmoid);
-USE_OP(softmax);
-
-REGISTER_OP(fc, ops::FullyConnectedOp, ops::FullyConnectedOpMaker);
--- a/paddle/operators/fill_zeros_like_op.cc
+++ b/paddle/operators/fill_zeros_like_op.cc
@@ -18,6 +18,7 @@ namespace paddle {
 namespace operators {

 class FillZerosLikeOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(FillZerosLikeOp, framework::OperatorWithKernel)
 protected:
  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 1UL,
@@ -50,8 +51,8 @@ The output will have the same size with input.
 }  // namespace operators
 }  // namespace paddle

-REGISTER_OP(fill_zeros_like, paddle::operators::FillZerosLikeOp,
-            paddle::operators::FillZerosLikeOpMaker);
+namespace ops = paddle::operators;
+REGISTER_OP(fill_zeros_like, ops::FillZerosLikeOp, ops::FillZerosLikeOpMaker);
 REGISTER_OP_CPU_KERNEL(
    fill_zeros_like,
-    paddle::operators::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
+    ops::FillZerosLikeKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/fill_zeros_like_op.cu
+++ b/paddle/operators/fill_zeros_like_op.cu
@@ -16,6 +16,7 @@
 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/fill_zeros_like_op.h"

+namespace ops = paddle::operators;
 REGISTER_OP_GPU_KERNEL(
    fill_zeros_like,
-    paddle::operators::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
+    ops::FillZerosLikeKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/fill_zeros_like_op.h
+++ b/paddle/operators/fill_zeros_like_op.h
@@ -13,7 +13,8 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {

--- a/paddle/operators/gather.h
+++ b/paddle/operators/gather.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+#include <memory.h>
+#include <cstring>
+
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
+namespace paddle {
+namespace operators {
+
+// Implementation of CPU copy
+template <typename T>
+void CPUGather(const T* params, const int* indices, const int slice_size,
+               const int index_size, T* output) {
+  const size_t slice_bytes = slice_size * sizeof(T);
+
+  for (size_t i = 0; i < index_size; ++i) {
+    int index_ = indices[i];
+    memcpy(output + i * slice_size, params + index_ * slice_size, slice_bytes);
+  }
+}
+
+// Implementation of GPU copy:
+template <typename T>
+void GPUGather(const T* src, const int* index, const int slice_size,
+               const int index_size, T* output);
+
+/**
+ * Return a new tensor from source tensor, gathered according to index
+ * input[src]: type-T source Tensor
+ * input[index]: type-int index Tensor (1-D)
+ * return: output tensor
+ */
+template <typename T>
+void Gather(const platform::Place& place, const paddle::framework::Tensor* src,
+            const paddle::framework::Tensor* index,
+            paddle::framework::Tensor* output) {
+  // check index of shape 1-D
+  PADDLE_ENFORCE(index->dims().size() == 1);
+  int index_size = index->dims()[0];
+
+  auto src_dims = src->dims();
+  paddle::framework::DDim output_dims(src_dims);
+  output_dims[0] = index_size;
+
+  // slice size
+  int slice_size = 1;
+  for (size_t i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i];
+
+  // Gathering
+  if (platform::is_cpu_place(place)) {
+    CPUGather<T>(src->data<T>(), index->data<int>(), slice_size, index_size,
+                 output->data<T>());
+  }
+}
+
+}  // namespace operators
+}  // namespace paddle
--- a/paddle/operators/add_op_test.cc
+++ b/paddle/operators/add_op_test.cc
@@ -12,17 +12,37 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include "paddle/operators/gather.h"
+#include "paddle/framework/ddim.h"
+#include "paddle/framework/tensor.h"
+#include "paddle/platform/place.h"
+
 #include <gtest/gtest.h>
-#define private public
-#include <paddle/framework/op_registry.h>
-USE_OP(add_two);
-// USE_OP(add_two_grad);
-
-TEST(AddOp, GetOpProto) {
-  auto& protos = paddle::framework::OpRegistry::protos();
-  auto it = protos.find("add_two");
-  ASSERT_NE(it, protos.end());
-  auto& op_creators = paddle::framework::OpRegistry::op_creators();
-  auto it1 = op_creators.find("add_two_grad");
-  ASSERT_NE(it1, op_creators.end());
+#include <iostream>
+#include <string>
+
+TEST(Gather, GatherData) {
+  using namespace paddle::framework;
+  using namespace paddle::platform;
+  using namespace paddle::operators;
+
+  Tensor* src = new Tensor();
+  Tensor* index = new Tensor();
+  Tensor* output = new Tensor();
+
+  int* p_src = nullptr;
+  int* p_index = nullptr;
+  p_src = src->mutable_data<int>(make_ddim({3, 4}), CPUPlace());
+  p_index = index->mutable_data<int>(make_ddim({2}), CPUPlace());
+
+  for (size_t i = 0; i < 12; ++i) p_src[i] = i;
+  p_index[0] = 1;
+  p_index[1] = 0;
+
+  int* p_output = output->mutable_data<int>(make_ddim({2, 4}), CPUPlace());
+
+  Gather<int>(CPUPlace(), src, index, output);
+
+  for (size_t i = 0; i < 4; ++i) EXPECT_EQ(p_output[i], i + 4);
+  for (size_t i = 4; i < 8; ++i) EXPECT_EQ(p_output[i], i - 4);
 }
--- a/paddle/operators/gaussian_random_op.cc
+++ b/paddle/operators/gaussian_random_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <random>
+#include "paddle/framework/op_registry.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+class GaussianRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    float mean = context.op_.GetAttr<float>("mean");
+    float std = context.op_.GetAttr<float>("std");
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+
+    // TODO(dzh): attribute does not support unsigned int.
+    // And we need a global random seed configuration.
+    int seed = context.op_.GetAttr<int>("seed");
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    std::mt19937 g(seed);
+    std::normal_distribution<T> distribution(mean, std);
+    ssize_t size = framework::product(tensor->dims());
+    for (int i = 0; i < size; ++i) {
+      data[i] = distribution(g);
+    }
+  }
+};
+
+class GaussianRandomOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(GaussianRandomOp, framework::OperatorWithKernel)
+ protected:
+  void InferShape(const framework::InferShapeContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>(0);
+    auto dims = GetAttr<std::vector<int>>("dims");
+    PADDLE_ENFORCE(dims.size() > 0UL,
+                   "dims can be one int or array. dims must be set.");
+    tensor->Resize(framework::make_ddim(dims));
+  }
+};
+
+class GaussianRandomOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  GaussianRandomOpMaker(framework::OpProto* proto,
+                        framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "output matrix of random op");
+    AddComment(R"DOC(
+GaussianRandom operator.
+Use to initialize tensor with gaussian random generator.
+)DOC");
+
+    AddAttr<std::vector<int>>("dims", "The dimension of random tensor.");
+    AddAttr<float>("mean", "mean value of random.").SetDefault(.0f);
+    AddAttr<float>("std", "minimum value of random value.").SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "Random seed of generator."
+                 "0 means use system wide seed")
+        .SetDefault(0);
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+REGISTER_OP(gaussian_random, ops::GaussianRandomOp, ops::GaussianRandomOpMaker);
+REGISTER_OP_CPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
--- a/paddle/operators/type_alias.h
+++ b/paddle/operators/type_alias.h
@@ -12,44 +12,42 @@
   See the License for the specific language governing permissions and
   limitations under the License. */

-#pragma once
+#include <memory>
+#include <random>
+#include "paddle/platform/dynload/curand.h"
+#include "paddle/platform/gpu_info.h"

-#include "paddle/framework/eigen.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/operators/net_op.h"

 namespace paddle {
 namespace operators {

-using OpKernel = framework::OpKernel;
-using OperatorBase = framework::OperatorBase;
-using InferShapeContext = framework::InferShapeContext;
-using ExecutionContext = framework::ExecutionContext;
-using Variable = framework::Variable;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
-template <typename T, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
-template <typename T, size_t D, int MajorType = Eigen::RowMajor,
-          typename IndexType = Eigen::DenseIndex>
-using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-using Tensor = framework::Tensor;
-using Scope = framework::Scope;
-using OperatorWithKernel = framework::OperatorWithKernel;
-using OperatorBase = framework::OperatorBase;
-using OpProtoAndCheckerMaker = framework::OpProtoAndCheckerMaker;
-using OpProto = framework::OpProto;
-using OpAttrChecker = framework::OpAttrChecker;
-using CPUPlace = platform::CPUPlace;
-using GPUPlace = platform::GPUPlace;
-using OpRegistry = framework::OpRegistry;
+template <typename T>
+class GaussianRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    float mean = context.op_.GetAttr<float>("mean");
+    float std = context.op_.GetAttr<float>("std");
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+
+    int seed = context.op_.GetAttr<int>("seed");
+    if (seed == 0) {
+      std::random_device rd;
+      seed = rd();
+    }
+    curandGenerator_t g;
+    PADDLE_ENFORCE(platform::dynload::curandCreateGenerator(
+        &g, CURAND_RNG_PSEUDO_DEFAULT));
+    PADDLE_ENFORCE(
+        platform::dynload::curandSetPseudoRandomGeneratorSeed(g, seed));
+    platform::dynload::curandGenerateNormal(
+        g, data, framework::product(tensor->dims()), mean, std);
+  }
+};

 }  // namespace operators
 }  // namespace paddle

 namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
--- a/paddle/operators/mean_op.cc
+++ b/paddle/operators/mean_op.cc
@@ -17,9 +17,10 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-class MeanOp : public OperatorWithKernel {
+class MeanOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(MeanOp, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 1, "Input size of AddOp must be one");
    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, "Output size of AddOp must be one");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), "input should be set");
@@ -28,9 +29,9 @@ class MeanOp : public OperatorWithKernel {
  }
 };

-class MeanOpMaker : public OpProtoAndCheckerMaker {
+class MeanOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MeanOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  MeanOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The input of mean op");
    AddOutput("Out", "The output of mean op").IgnoreGradient();
@@ -38,10 +39,11 @@ class MeanOpMaker : public OpProtoAndCheckerMaker {
  }
 };

-class MeanGradOp : public OperatorWithKernel {
+class MeanGradOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(MeanGradOp, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
-    ctx.Output<Tensor>("X" + framework::kGradVarSuffix)
+  void InferShape(const framework::InferShapeContext &ctx) const override {
+    ctx.Output<Tensor>(framework::GradVarName("X"))
        ->Resize(ctx.Input<Tensor>("X")->dims());
  }
 };
@@ -49,7 +51,10 @@ class MeanGradOp : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle

+namespace ops = paddle::operators;
 REGISTER_OP(mean, ops::MeanOp, ops::MeanOpMaker);
-REGISTER_OP_CPU_KERNEL(mean, ops::MeanKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mean,
+                       ops::MeanKernel<paddle::platform::CPUPlace, float>);
 REGISTER_GRADIENT_OP(mean, mean_grad, ops::MeanGradOp);
-REGISTER_OP_CPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mean_grad,
+                       ops::MeanGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/mean_op.cu
+++ b/paddle/operators/mean_op.cu
@@ -16,5 +16,8 @@

 #include "paddle/operators/mean_op.h"

-REGISTER_OP_GPU_KERNEL(mean, ops::MeanKernel<ops::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(mean_grad, ops::MeanGradKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(mean,
+                       ops::MeanKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(mean_grad,
+                       ops::MeanGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/mean_op.h
+++ b/paddle/operators/mean_op.h
@@ -13,15 +13,24 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenScalar = framework::EigenScalar<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class MeanKernel : public OpKernel {
+class MeanKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto input = context.Input<Tensor>(0);
    auto output = context.Output<Tensor>(0);

@@ -36,13 +45,13 @@ class MeanKernel : public OpKernel {
 };

 template <typename Place, typename T>
-class MeanGradKernel : public OpKernel {
+class MeanGradKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
-    auto OG = context.Input<Tensor>("Out" + framework::kGradVarSuffix);
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto OG = context.Input<Tensor>(framework::GradVarName("Out"));
    PADDLE_ENFORCE(framework::product(OG->dims()) == 1,
                   "Mean Gradient should be scalar");
-    auto IG = context.Output<Tensor>("X" + framework::kGradVarSuffix);
+    auto IG = context.Output<Tensor>(framework::GradVarName("X"));
    IG->mutable_data<T>(context.GetPlace());

    T ig_size = (T)framework::product(IG->dims());

--- a/paddle/operators/mul_op.cc
+++ b/paddle/operators/mul_op.cc
@@ -17,9 +17,10 @@
 namespace paddle {
 namespace operators {

-class MulOp : public OperatorWithKernel {
+class MulOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(MulOp, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 2, "The mul op must take two inputs");
    auto dim0 = ctx.Input<Tensor>(0)->dims();
    auto dim1 = ctx.Input<Tensor>(1)->dims();
@@ -37,9 +38,9 @@ class MulOp : public OperatorWithKernel {
  }
 };

-class MulOpMaker : public OpProtoAndCheckerMaker {
+class MulOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  MulOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  MulOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The first input of mul op");
    AddInput("Y", "The second input of mul op");
@@ -52,9 +53,10 @@ The equation is: Out = X * Y
  }
 };

-class MulOpGrad : public OperatorWithKernel {
+class MulOpGrad : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(MulOpGrad, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {}
+  void InferShape(const framework::InferShapeContext &ctx) const override {}
  std::string DebugString() const override {
    LOG(INFO) << "MulGrad";
    return "";
@@ -64,7 +66,8 @@ class MulOpGrad : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle

+namespace ops = paddle::operators;
 REGISTER_OP(mul, ops::MulOp, ops::MulOpMaker);
 REGISTER_GRADIENT_OP(mul, mul_grad, ops::MulOpGrad);

-REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(mul, ops::MulKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/mul_op.cu
+++ b/paddle/operators/mul_op.cu
@@ -15,4 +15,6 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/mul_op.h"

-REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(mul, ops::MulKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/mul_op.h
+++ b/paddle/operators/mul_op.h
@@ -13,16 +13,21 @@
   limitations under the License. */

 #pragma once
-
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class MulKernel : public OpKernel {
+class MulKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    Eigen::array<Eigen::IndexPair<Eigen::DenseIndex>, 1> dim_pair = {
        {Eigen::IndexPair<Eigen::DenseIndex>(1, 0)}};

@@ -40,5 +45,6 @@ class MulKernel : public OpKernel {
    Z.device(place) = X.contract(Y, dim_pair);
  }
 };
+
 }  // namespace operators
 }  // namespace paddle
--- a/paddle/operators/net_op.cc
+++ b/paddle/operators/net_op.cc
@@ -15,7 +15,6 @@
 */

 #include "paddle/operators/net_op.h"
-#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {

--- a/paddle/operators/net_op.h
+++ b/paddle/operators/net_op.h
@@ -14,13 +14,7 @@ limitations under the License. */

 #pragma once

-#include "paddle/framework/op_desc.pb.h"
-#include "paddle/framework/op_proto.pb.h"
 #include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/scope.h"
-#include "paddle/operators/type_alias.h"
-#include "paddle/platform/device_context.h"

 namespace paddle {
 namespace operators {
@@ -41,6 +35,8 @@ namespace operators {
 */
 class NetOp : public framework::OperatorBase {
 public:
+  DEFINE_OPERATOR_CTOR(NetOp, framework::OperatorBase)
+
  /**
   * Infer all the operators' input and output variables' shapes, will be called
   * before every mini-batch
@@ -65,6 +61,15 @@ class NetOp : public framework::OperatorBase {
    }
  }

+  bool SupportGPU() const override {
+    for (auto& op : ops_) {
+      if (!op->SupportGPU()) {
+        return false;
+      }
+    }
+    return true;
+  }
+
  /**
   * @brief Add an operator by ptr
   */

--- a/paddle/operators/net_op_test.cc
+++ b/paddle/operators/net_op_test.cc
@@ -2,31 +2,31 @@

 #include <gtest/gtest.h>

-#include "paddle/framework/op_registry.h"
-#include "paddle/framework/operator.h"
-
 namespace paddle {
 namespace operators {
+using Scope = framework::Scope;
+using DeviceContext = platform::DeviceContext;

 static int infer_shape_cnt = 0;
 static int run_cnt = 0;

-class TestOp : public OperatorBase {
+class TestOp : public framework::OperatorBase {
 public:
-  void InferShape(const framework::Scope& scope) const override {
-    ++infer_shape_cnt;
-  }
-  void Run(const framework::Scope& scope,
-           const paddle::platform::DeviceContext& dev_ctx) const override {
+  DEFINE_OPERATOR_CTOR(TestOp, framework::OperatorBase)
+
+  void InferShape(const Scope& scope) const override { ++infer_shape_cnt; }
+  void Run(const Scope& scope,
+           const platform::DeviceContext& dev_ctx) const override {
    ++run_cnt;
  }
 };

-class EmptyOp : public OperatorBase {
+class EmptyOp : public framework::OperatorBase {
 public:
+  DEFINE_OPERATOR_CTOR(EmptyOp, framework::OperatorBase)
+
  void InferShape(const Scope& scope) const override {}
-  void Run(const Scope& scope,
-           const platform::DeviceContext& dev_ctx) const override {}
+  void Run(const Scope& scope, const DeviceContext& dev_ctx) const override {}
 };

 template <typename T>
@@ -72,7 +72,7 @@ TEST(OpKernel, all) {
  net->Run(scope, dev_ctx);
  ASSERT_EQ(2, infer_shape_cnt);
  ASSERT_EQ(2, run_cnt);
-  ASSERT_THROW(net->AddOp(op2), paddle::platform::EnforceNotMet);
+  ASSERT_THROW(net->AddOp(op2), platform::EnforceNotMet);
 }

 TEST(NetOp, insert_op) {

--- a/paddle/operators/recurrent_op.cc
+++ b/paddle/operators/recurrent_op.cc
@@ -14,17 +14,19 @@

 #include "paddle/operators/recurrent_op.h"

-#include <glog/logging.h>
 #include <cstring>
 #include <sstream>

 #include "paddle/framework/op_registry.h"
 #include "paddle/operators/net_op.h"
-#include "paddle/platform/enforce.h"

 namespace paddle {
 namespace operators {

+using Scope = framework::Scope;
+using Variable = framework::Variable;
+using Tensor = framework::Tensor;
+
 void RecurrentAlgorithm::InferShape(const Scope& scope) const {
  seq_len_ = scope.FindVar((arg_->inlinks[0]).external)
                 ->GetMutable<Tensor>()
@@ -135,10 +137,11 @@ void RecurrentOp::Init() {
  alg_.Init(std::move(arg));
 }

-class RecurrentAlgorithmProtoAndCheckerMaker : public OpProtoAndCheckerMaker {
+class RecurrentAlgorithmProtoAndCheckerMaker
+    : public framework::OpProtoAndCheckerMaker {
 public:
-  RecurrentAlgorithmProtoAndCheckerMaker(OpProto* proto,
-                                         OpAttrChecker* op_checker)
+  RecurrentAlgorithmProtoAndCheckerMaker(framework::OpProto* proto,
+                                         framework::OpAttrChecker* op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    const auto& name = RecurrentOp::kArgName;
    // inputs and outputs stored in proto

--- a/paddle/operators/recurrent_op.h
+++ b/paddle/operators/recurrent_op.h
@@ -100,6 +100,7 @@ class RecurrentGradientAlgorithm {
 };

 class RecurrentOp final : public framework::OperatorBase {
+  DEFINE_OPERATOR_CTOR(RecurrentOp, framework::OperatorBase)
 public:
  void Init() override;


--- a/paddle/operators/recurrent_op_test.cc
+++ b/paddle/operators/recurrent_op_test.cc
@@ -27,6 +27,10 @@ namespace operators {

 using framework::make_ddim;
 using framework::DDim;
+using framework::Tensor;
+using framework::Variable;
+using framework::Scope;
+using framework::OpRegistry;

 class RecurrentOpTest : public ::testing::Test {
 protected:
@@ -164,7 +168,7 @@ class RecurrentOpTest : public ::testing::Test {

  // father scope
  Scope scope_;
-  std::shared_ptr<OperatorBase> rnn_op_;
+  std::shared_ptr<framework::OperatorBase> rnn_op_;
 };

 TEST_F(RecurrentOpTest, Run) {
@@ -391,4 +395,4 @@ TEST(RecurrentOp, LinkMemories) {

 USE_OP(add_two);
 USE_OP(mul);
-USE_OP_WITHOUT_KERNEL(recurrent_op);
+USE_OP_ITSELF(recurrent_op);
--- a/paddle/operators/rnn/recurrent_op_utils.cc
+++ b/paddle/operators/rnn/recurrent_op_utils.cc
@@ -18,7 +18,9 @@ namespace paddle {
 namespace operators {
 namespace rnn {

-namespace fmw = paddle::framework;
+namespace f = paddle::framework;
+
+using Tensor = framework::Tensor;

 void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   const std::vector<Link>& inlinks, const size_t seq_len,
@@ -30,10 +32,10 @@ void SegmentInputs(const std::vector<Scope*>& step_scopes,
                   inlinks[i].external);

    Tensor* input = input_var->GetMutable<Tensor>();
-    fmw::DDim dims = input->dims();
+    f::DDim dims = input->dims();
    PADDLE_ENFORCE(static_cast<size_t>(dims[0]) == seq_len,
                   "all the inlinks must have same length");
-    fmw::DDim step_dims = slice_ddim(dims, 1, dims.size());
+    f::DDim step_dims = slice_ddim(dims, 1, dims.size());
    for (size_t j = 0; j < seq_len; j++) {
      Tensor* step_input =
          step_scopes[j]->NewVar(inlinks[i].internal)->GetMutable<Tensor>();
@@ -58,11 +60,10 @@ void ConcatOutputs(const std::vector<Scope*>& step_scopes,
      auto step_scope_var = step_scopes[0]->FindVar(outlinks[i].internal);
      PADDLE_ENFORCE(step_scope_var != nullptr, "%s not in scope",
                     outlinks[i].internal);
-      fmw::DDim step_dims =
-          step_scope_var->template GetMutable<Tensor>()->dims();
+      f::DDim step_dims = step_scope_var->template GetMutable<Tensor>()->dims();
      std::vector<int> dims_vec = vectorize(step_dims);
      dims_vec.insert(dims_vec.begin(), seq_len);
-      output->Resize(fmw::make_ddim(dims_vec));
+      output->Resize(f::make_ddim(dims_vec));
    } else {
      output->mutable_data<float>(platform::CPUPlace());
      for (size_t j = 0; j < seq_len; j++) {
@@ -104,7 +105,7 @@ void LinkMemories(const std::vector<Scope*>& scopes,
 }

 void InitArgument(const ArgumentName& name, Argument* arg,
-                  const OperatorBase& op) {
+                  const framework::OperatorBase& op) {
  arg->step_net = op.Input(name.step_net);
  arg->step_scopes = op.Output(name.step_scopes);


--- a/paddle/operators/rnn/recurrent_op_utils.h
+++ b/paddle/operators/rnn/recurrent_op_utils.h
@@ -17,12 +17,13 @@
 #include <string>

 #include "paddle/framework/operator.h"
-#include "paddle/operators/type_alias.h"

 namespace paddle {
 namespace operators {
 namespace rnn {

+using Scope = framework::Scope;
+
 /**
 * Memory of a RNN (same as the role of `Momory` in PaddlePaddle).
 *
@@ -86,7 +87,7 @@ void LinkMemories(const std::vector<Scope*>& step_scopes,
                  const int offset, bool infer_shape_mode);

 void InitArgument(const ArgumentName& name, Argument* arg,
-                  const OperatorBase& op);
+                  const framework::OperatorBase& op);

 }  // namespace rnn
 }  // namespace operators

--- a/paddle/operators/rowwise_add_op.cc
+++ b/paddle/operators/rowwise_add_op.cc
@@ -13,12 +13,14 @@
   limitations under the License. */

 #include "paddle/operators/rowwise_add_op.h"
+
 namespace paddle {
 namespace operators {

-class RowWiseAddOp : public OperatorWithKernel {
+class RowWiseAddOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(RowWiseAddOp, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 2UL,
                   "Two inputs is needed by rowwise add");
    auto dim0 = ctx.Input<Tensor>(0)->dims();
@@ -32,9 +34,10 @@ class RowWiseAddOp : public OperatorWithKernel {
  }
 };

-class RowWiseAddOpMaker : public OpProtoAndCheckerMaker {
+class RowWiseAddOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  RowWiseAddOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  RowWiseAddOpMaker(framework::OpProto *proto,
+                    framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "The left input of row-wise add op, must be matrix");
    AddInput("b", "The right input of row-wise add op, must be vector");
@@ -50,6 +53,7 @@ for i in xrange(X.shape[0]):
 }  // namespace operators
 }  // namespace paddle

+namespace ops = paddle::operators;
 REGISTER_OP(rowwise_add, ops::RowWiseAddOp, ops::RowWiseAddOpMaker);
-REGISTER_OP_CPU_KERNEL(rowwise_add,
-                       ops::RowWiseAddKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    rowwise_add, ops::RowWiseAddKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.cu
+++ b/paddle/operators/rowwise_add_op.cu
@@ -15,5 +15,6 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/rowwise_add_op.h"

-REGISTER_OP_GPU_KERNEL(rowwise_add,
-                       ops::RowWiseAddKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(
+    rowwise_add, ops::RowWiseAddKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/rowwise_add_op.h
+++ b/paddle/operators/rowwise_add_op.h
@@ -13,15 +13,24 @@
   limitations under the License. */

 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class RowWiseAddKernel : public OpKernel {
+class RowWiseAddKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto out = context.Output<Tensor>(0);
    out->mutable_data<T>(context.GetPlace());


--- a/paddle/operators/sgd_op.cc
+++ b/paddle/operators/sgd_op.cc
@@ -17,9 +17,10 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-class SGDOp : public OperatorWithKernel {
+class SGDOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(SGDOp, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 2, "Input size of SGDOp must be two");
    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1, "Output size of SGDOp must be one");
    PADDLE_ENFORCE_NOT_NULL(ctx.InputVar(0), "inputs[0] mast be set");
@@ -31,9 +32,9 @@ class SGDOp : public OperatorWithKernel {
  }
 };

-class SGDOpMaker : public OpProtoAndCheckerMaker {
+class SGDOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SGDOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SGDOpMaker(framework::OpProto *proto, framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("param", "input parameter");
    AddInput("grad", "input gradient");
@@ -51,5 +52,7 @@ param_out = param - learning_rate * grad;
 }  // namespace operators
 }  // namespace paddle

+namespace ops = paddle::operators;
 REGISTER_OP(sgd, ops::SGDOp, ops::SGDOpMaker);
-REGISTER_OP_CPU_KERNEL(sgd, ops::SGDOpKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sgd,
+                       ops::SGDOpKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/sgd_op.cu
+++ b/paddle/operators/sgd_op.cu
@@ -15,4 +15,6 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/sgd_op.h"

-REGISTER_OP_GPU_KERNEL(sgd, ops::SGDOpKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+REGISTER_OP_GPU_KERNEL(sgd,
+                       ops::SGDOpKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/sgd_op.h
+++ b/paddle/operators/sgd_op.h
@@ -13,15 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class SGDOpKernel : public OpKernel {
+class SGDOpKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& ctx) const override {
+  void Compute(const framework::ExecutionContext& ctx) const override {
    auto param = ctx.Input<Tensor>("param");
    auto grad = ctx.Input<Tensor>("grad");
    auto param_out = ctx.Output<Tensor>(0);

--- a/paddle/operators/sigmoid_op.cc
+++ b/paddle/operators/sigmoid_op.cc
@@ -13,21 +13,24 @@
   limitations under the License. */

 #include "paddle/operators/sigmoid_op.h"
+
 namespace paddle {
 namespace operators {

-class SigmoidOp : public OperatorWithKernel {
+class SigmoidOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(SigmoidOp, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE(ctx.InputSize() == 1, "Sigmoid Op only have one input");
    PADDLE_ENFORCE(ctx.OutputSize() == 1, "Sigmoid Op only have one output");
    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
  }
 };

-class SigmoidOpMaker : public OpProtoAndCheckerMaker {
+class SigmoidOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SigmoidOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SigmoidOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "sigmoid input");
    AddOutput("Y", "sigmoid output");
@@ -35,9 +38,10 @@ class SigmoidOpMaker : public OpProtoAndCheckerMaker {
  }
 };

-class SigmoidOpGrad : public OperatorWithKernel {
+class SigmoidOpGrad : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(SigmoidOpGrad, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    ctx.Output<Tensor>(0)->Resize(ctx.Input<Tensor>(0)->dims());
  }
 };
@@ -45,9 +49,11 @@ class SigmoidOpGrad : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle

+namespace ops = paddle::operators;
 REGISTER_OP(sigmoid, ops::SigmoidOp, ops::SigmoidOpMaker);
 REGISTER_GRADIENT_OP(sigmoid, sigmoid_grad, ops::SigmoidOpGrad);

-REGISTER_OP_CPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::CPUPlace, float>);
-REGISTER_OP_CPU_KERNEL(sigmoid_grad,
-                       ops::SigmoidGradKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(sigmoid,
+                       ops::SigmoidKernel<paddle::platform::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    sigmoid_grad, ops::SigmoidGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/sigmoid_op.cu
+++ b/paddle/operators/sigmoid_op.cu
@@ -15,6 +15,9 @@
 #define EIGEN_USE_GPU
 #include "paddle/operators/sigmoid_op.h"

-REGISTER_OP_GPU_KERNEL(sigmoid, ops::SigmoidKernel<ops::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(sigmoid_grad,
-                       ops::SigmoidGradKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(sigmoid,
+                       ops::SigmoidKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    sigmoid_grad, ops::SigmoidGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/sigmoid_op.h
+++ b/paddle/operators/sigmoid_op.h
@@ -13,16 +13,21 @@
   limitations under the License. */

 #pragma once
-
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenVector = framework::EigenVector<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class SigmoidKernel : public OpKernel {
+class SigmoidKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto input = context.Input<Tensor>(0);
    auto output = context.Output<Tensor>(0);
    output->mutable_data<T>(context.GetPlace());
@@ -37,9 +42,9 @@ class SigmoidKernel : public OpKernel {
 };

 template <typename Place, typename T>
-class SigmoidGradKernel : public OpKernel {
+class SigmoidGradKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto Y_t = context.Input<Tensor>("Y");
    auto dY_t = context.Input<Tensor>(framework::GradVarName("Y"));
    auto dX_t = context.Output<Tensor>(framework::GradVarName("X"));

--- a/paddle/operators/softmax_op.cc
+++ b/paddle/operators/softmax_op.cc
@@ -17,9 +17,10 @@ limitations under the License. */
 namespace paddle {
 namespace operators {

-class SoftmaxOp : public OperatorWithKernel {
+class SoftmaxOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(SoftmaxOp, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 1UL,
                      "Only one input is need for softmax");
    PADDLE_ENFORCE_EQ(ctx.Input<Tensor>("X")->dims().size(), 2UL,
@@ -30,9 +31,10 @@ class SoftmaxOp : public OperatorWithKernel {
  }
 };

-class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
+class SoftmaxOpMaker : public framework::OpProtoAndCheckerMaker {
 public:
-  SoftmaxOpMaker(OpProto *proto, OpAttrChecker *op_checker)
+  SoftmaxOpMaker(framework::OpProto *proto,
+                 framework::OpAttrChecker *op_checker)
      : OpProtoAndCheckerMaker(proto, op_checker) {
    AddInput("X", "input of softmax");
    AddOutput("Y", "output of softmax");
@@ -40,9 +42,10 @@ class SoftmaxOpMaker : public OpProtoAndCheckerMaker {
  }
 };

-class SoftmaxOpGrad : public OperatorWithKernel {
+class SoftmaxOpGrad : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(SoftmaxOpGrad, framework::OperatorWithKernel)
 protected:
-  void InferShape(const InferShapeContext &ctx) const override {
+  void InferShape(const framework::InferShapeContext &ctx) const override {
    PADDLE_ENFORCE_EQ(ctx.InputSize(), 3UL,
                      "Input of SoftmaxOpGrad should be 3, X, Y, YG");
    PADDLE_ENFORCE_EQ(ctx.OutputSize(), 1UL,
@@ -61,8 +64,11 @@ class SoftmaxOpGrad : public OperatorWithKernel {
 }  // namespace operators
 }  // namespace paddle

+namespace ops = paddle::operators;
+
 REGISTER_OP(softmax, ops::SoftmaxOp, ops::SoftmaxOpMaker);
-REGISTER_OP_CPU_KERNEL(softmax, ops::SoftmaxKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(softmax,
+                       ops::SoftmaxKernel<paddle::platform::CPUPlace, float>);
 REGISTER_GRADIENT_OP(softmax, softmax_grad, ops::SoftmaxOpGrad);
-REGISTER_OP_CPU_KERNEL(softmax_grad,
-                       ops::SoftmaxGradKernel<ops::CPUPlace, float>);
+REGISTER_OP_CPU_KERNEL(
+    softmax_grad, ops::SoftmaxGradKernel<paddle::platform::CPUPlace, float>);
--- a/paddle/operators/softmax_op.cu
+++ b/paddle/operators/softmax_op.cu
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+/* Copyright (c) 2016 PaddlePaddle Authors All Rights Reserve.

   Licensed under the Apache License, Version 2.0 (the "License");
   you may not use this file except in compliance with the License.
@@ -13,9 +13,11 @@
   limitations under the License. */

 #define EIGEN_USE_GPU
-#include "paddle/framework/op_registry.h"
 #include "paddle/operators/softmax_op.h"

-REGISTER_OP_GPU_KERNEL(softmax, ops::SoftmaxKernel<ops::GPUPlace, float>);
-REGISTER_OP_GPU_KERNEL(softmax_grad,
-                       ops::SoftmaxGradKernel<ops::GPUPlace, float>);
+namespace ops = paddle::operators;
+
+REGISTER_OP_GPU_KERNEL(softmax,
+                       ops::SoftmaxKernel<paddle::platform::GPUPlace, float>);
+REGISTER_OP_GPU_KERNEL(
+    softmax_grad, ops::SoftmaxGradKernel<paddle::platform::GPUPlace, float>);
--- a/paddle/operators/softmax_op.h
+++ b/paddle/operators/softmax_op.h
@@ -13,19 +13,21 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #pragma once
-
-#include "paddle/framework/ddim.h"
-#include "paddle/framework/operator.h"
-#include "paddle/framework/tensor.h"
-#include "paddle/operators/type_alias.h"
+#include "paddle/framework/eigen.h"
+#include "paddle/framework/op_registry.h"

 namespace paddle {
 namespace operators {

+using Tensor = framework::Tensor;
+template <typename T, int MajorType = Eigen::RowMajor,
+          typename IndexType = Eigen::DenseIndex>
+using EigenMatrix = framework::EigenMatrix<T, MajorType, IndexType>;
+
 template <typename Place, typename T>
-class SoftmaxKernel : public OpKernel {
+class SoftmaxKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    auto input = context.Input<Tensor>("X");
    auto output = context.Output<Tensor>("Y");
    output->mutable_data<T>(context.GetPlace());
@@ -62,9 +64,9 @@ class SoftmaxKernel : public OpKernel {
 };

 template <typename Place, typename T>
-class SoftmaxGradKernel : public OpKernel {
+class SoftmaxGradKernel : public framework::OpKernel {
 public:
-  void Compute(const ExecutionContext& context) const override {
+  void Compute(const framework::ExecutionContext& context) const override {
    std::shared_ptr<Tensor> scale_ = std::make_shared<Tensor>();

    auto Y = context.Input<Tensor>("Y");

--- a/paddle/operators/uniform_random_op.cc
+++ b/paddle/operators/uniform_random_op.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <random>
+#include <type_traits>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
+// Use std::random and thrust::random(thrust is a std library in CUDA) to
+// implement uniform random.
+template <typename T>
+class CPUUniformRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+    unsigned int seed =
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
+    std::minstd_rand engine;
+    if (seed == 0) {
+      seed = std::random_device()();
+    }
+    engine.seed(seed);
+    std::uniform_real_distribution<T> dist(
+        static_cast<T>(context.op_.GetAttr<float>("min")),
+        static_cast<T>(context.op_.GetAttr<float>("max")));
+    for (ssize_t i = 0; i < framework::product(tensor->dims()); ++i) {
+      data[i] = dist(engine);
+    }
+  }
+};
+
+class UniformRandomOp : public framework::OperatorWithKernel {
+  DEFINE_OPERATOR_CTOR(UniformRandomOp, framework::OperatorWithKernel)
+ protected:
+  void InferShape(const framework::InferShapeContext& ctx) const override {
+    PADDLE_ENFORCE(GetAttr<float>("min") < GetAttr<float>("max"),
+                   "uniform_random's min must less then max");
+    auto* tensor = ctx.Output<framework::Tensor>(0);
+    auto dims = GetAttr<std::vector<int>>("dims");
+    tensor->Resize(framework::make_ddim(dims));
+  }
+};
+
+class UniformRandomOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  UniformRandomOpMaker(framework::OpProto* proto,
+                       framework::OpAttrChecker* op_checker)
+      : framework::OpProtoAndCheckerMaker(proto, op_checker) {
+    AddOutput("Out", "The output tensor of uniform random op");
+    AddComment(R"DOC(Uniform random operator.
+
+Used to initialize tensor with uniform random generator.
+)DOC");
+    AddAttr<std::vector<int>>("dims", "the dimension of random tensor");
+    AddAttr<float>("min", "Minimum value of uniform random").SetDefault(-1.0f);
+    AddAttr<float>("max", "Maximun value of uniform random").SetDefault(1.0f);
+    AddAttr<int>("seed",
+                 "Random seed of uniform random. "
+                 "0 means generate a seed by system")
+        .SetDefault(0);
+  }
+};
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP(uniform_random, paddle::operators::UniformRandomOp,
+            paddle::operators::UniformRandomOpMaker);
+REGISTER_OP_CPU_KERNEL(uniform_random,
+                       paddle::operators::CPUUniformRandomKernel<float>);
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include <thrust/device_ptr.h>
+#include <thrust/iterator/counting_iterator.h>
+#include <thrust/random.h>
+#include <thrust/transform.h>
+#include "paddle/framework/op_registry.h"
+#include "paddle/framework/operator.h"
+
+namespace paddle {
+namespace operators {
+
+template <typename T>
+struct UniformGenerator {
+  T min_, max_;
+  unsigned int seed_;
+
+  __host__ __device__ UniformGenerator(T min, T max, int seed)
+      : min_(min), max_(max), seed_(seed) {}
+
+  __host__ __device__ T operator()(const unsigned int n) const {
+    thrust::minstd_rand rng;
+    rng.seed(seed_);
+    thrust::uniform_real_distribution<T> dist(min_, max_);
+    rng.discard(n);
+    return dist(rng);
+  }
+};
+
+// It seems that Eigen::Tensor::random in GPU will SEGFAULT.
+// Use std::random and thrust::random(thrust is a std library in CUDA) to
+// implement uniform random.
+template <typename T>
+class GPUUniformRandomKernel : public framework::OpKernel {
+ public:
+  void Compute(const framework::ExecutionContext& context) const override {
+    auto* tensor = context.Output<framework::Tensor>(0);
+    T* data = tensor->mutable_data<T>(context.GetPlace());
+    unsigned int seed =
+        static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
+    if (seed == 0) {
+      std::random_device rd;
+      seed = rd();
+    }
+    T min = static_cast<T>(context.op_.GetAttr<float>("min"));
+    T max = static_cast<T>(context.op_.GetAttr<float>("max"));
+    thrust::counting_iterator<unsigned int> index_sequence_begin(0);
+    ssize_t N = framework::product(tensor->dims());
+    thrust::transform(index_sequence_begin, index_sequence_begin + N,
+                      thrust::device_ptr<T>(data),
+                      UniformGenerator<T>(min, max, seed));
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+REGISTER_OP_GPU_KERNEL(uniform_random,
+                       paddle::operators::GPUUniformRandomKernel<float>);
--- a/paddle/platform/CMakeLists.txt
+++ b/paddle/platform/CMakeLists.txt
@@ -8,7 +8,7 @@ cc_test(place_test SRCS place_test.cc DEPS place glog gflags)

 add_subdirectory(dynload)

-cc_test(enforce_test SRCS enforce_test.cc)
+cc_test(enforce_test SRCS enforce_test.cc DEPS stringpiece)

 IF(WITH_GPU)
    set(GPU_CTX_DEPS dynload_cuda dynamic_loader)

--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/platform/dynload/cublas.h
@@ -48,13 +48,13 @@ extern void *cublas_dso_handle;
  };                                                                \
  extern DynLoad__##__name __name
 #else
-#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
-  struct DynLoad__##__name {                     \
-    inline template <typename... Args>           \
-    cublasStatus_t operator()(Args... args) {    \
-      return __name(args...);                    \
-    }                                            \
-  };                                             \
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)     \
+  struct DynLoad__##__name {                         \
+    template <typename... Args>                      \
+    inline cublasStatus_t operator()(Args... args) { \
+      return __name(args...);                        \
+    }                                                \
+  };                                                 \
  extern DynLoad__##__name __name
 #endif


--- a/paddle/platform/dynload/curand.h
+++ b/paddle/platform/dynload/curand.h
@@ -55,6 +55,7 @@ extern void *curand_dso_handle;
  __macro(curandSetPseudoRandomGeneratorSeed); \
  __macro(curandGenerateUniform);              \
  __macro(curandGenerateUniformDouble);        \
+  __macro(curandGenerateNormal);               \
  __macro(curandDestroyGenerator);

 CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);

--- a/paddle/platform/enforce.h
+++ b/paddle/platform/enforce.h
@@ -15,11 +15,12 @@ limitations under the License. */
 #pragma once

 #include <execinfo.h>
-#include <paddle/string/printf.h>
 #include <iomanip>
 #include <sstream>
 #include <stdexcept>
 #include <string>
+#include "paddle/string/printf.h"
+#include "paddle/string/to_string.h"

 #ifndef PADDLE_ONLY_CPU

@@ -194,8 +195,8 @@ inline void throw_on_error(T e) {
 #define __PADDLE_BINARY_COMPARE(__VAL0, __VAL1, __CMP, __INV_CMP, ...)        \
  PADDLE_ENFORCE(__VAL0 __CMP __VAL1,                                         \
                 "enforce %s " #__CMP " %s failed, %s " #__INV_CMP " %s\n%s", \
-                 #__VAL0, #__VAL1, std::to_string(__VAL0),                    \
-                 std::to_string(__VAL1),                                      \
+                 #__VAL0, #__VAL1, paddle::string::to_string(__VAL0),         \
+                 paddle::string::to_string(__VAL1),                           \
                 paddle::string::Sprintf("" __VA_ARGS__));

 }  // namespace platform

--- a/paddle/platform/enforce_test.cc
+++ b/paddle/platform/enforce_test.cc
@@ -9,10 +9,16 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */

+#include <array>
+#include <iostream>
 #include <memory>

 #include "gtest/gtest.h"
 #include "paddle/platform/enforce.h"
+#include "paddle/string/piece.h"
+
+using StringPiece = paddle::string::Piece;
+using paddle::string::HasPrefix;

 TEST(ENFORCE, OK) {
  PADDLE_ENFORCE(true, "Enforce is ok %d now %f", 123, 0.345);
@@ -22,19 +28,15 @@ TEST(ENFORCE, OK) {
 }

 TEST(ENFORCE, FAILED) {
-  bool in_catch = false;
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE(false, "Enforce is not ok %d at all", 123);
  } catch (paddle::platform::EnforceNotMet error) {
-    // your error handling code here
-    in_catch = true;
-    std::string msg = "Enforce is not ok 123 at all";
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
+    caught_exception = true;
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()), "Enforce is not ok 123 at all"));
  }
-  ASSERT_TRUE(in_catch);
+  EXPECT_TRUE(caught_exception);
 }

 TEST(ENFORCE, NO_ARG_OK) {
@@ -47,41 +49,27 @@ TEST(ENFORCE, NO_ARG_OK) {

 TEST(ENFORCE_EQ, NO_EXTRA_MSG_FAIL) {
  int a = 2;
-  bool in_catch = false;
-
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE_EQ(a, 1 + 3);
-
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
-    const std::string msg = "enforce a == 1 + 3 failed, 2 != 4";
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
+    caught_exception = true;
+    HasPrefix(StringPiece(error.what()), "enforce a == 1 + 3 failed, 2 != 4");
  }
-
-  ASSERT_TRUE(in_catch);
+  EXPECT_TRUE(caught_exception);
 }

 TEST(ENFORCE_EQ, EXTRA_MSG_FAIL) {
  int a = 2;
-  bool in_catch = false;
-
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE_EQ(a, 1 + 3, "%s size not match", "their");
-
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
-    const std::string msg =
-        "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match";
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
+    caught_exception = true;
+    HasPrefix(StringPiece(error.what()),
+              "enforce a == 1 + 3 failed, 2 != 4\ntheir size not match");
  }
-
-  ASSERT_TRUE(in_catch);
+  EXPECT_TRUE(caught_exception);
 }

 TEST(ENFORCE_NE, OK) {
@@ -89,42 +77,32 @@ TEST(ENFORCE_NE, OK) {
  PADDLE_ENFORCE_NE(1.0, 2UL);
 }
 TEST(ENFORCE_NE, FAIL) {
-  bool in_catch = false;
+  bool caught_exception = false;

  try {
    // 2UL here to check data type compatible
    PADDLE_ENFORCE_NE(1.0, 1UL);
-
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
-    const std::string msg = "enforce 1.0 != 1UL failed, 1.000000 == 1";
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
+    caught_exception = true;
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
+                          "enforce 1.0 != 1UL failed, 1 == 1"))
+        << error.what() << " does not have expected prefix";
  }
-
-  ASSERT_TRUE(in_catch);
+  EXPECT_TRUE(caught_exception);
 }

 TEST(ENFORCE_GT, OK) { PADDLE_ENFORCE_GT(2, 1); }
 TEST(ENFORCE_GT, FAIL) {
-  bool in_catch = false;
-
+  bool caught_exception = false;
  try {
-    // 2UL here to check data type compatible
    PADDLE_ENFORCE_GT(1, 2UL);

  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
-    const std::string msg = "enforce 1 > 2UL failed, 1 <= 2";
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
+    caught_exception = true;
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2"));
  }
-
-  ASSERT_TRUE(in_catch);
+  EXPECT_TRUE(caught_exception);
 }

 TEST(ENFORCE_GE, OK) {
@@ -134,21 +112,16 @@ TEST(ENFORCE_GE, OK) {
  PADDLE_ENFORCE_GE(3.21, 2UL);
 }
 TEST(ENFORCE_GE, FAIL) {
-  bool in_catch = false;
-
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE_GE(1, 2UL);

  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
-    const std::string msg = "enforce 1 >= 2UL failed, 1 < 2";
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
+    caught_exception = true;
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()), "enforce 1 >= 2UL failed, 1 < 2"));
  }
-
-  ASSERT_TRUE(in_catch);
+  EXPECT_TRUE(caught_exception);
 }

 TEST(ENFORCE_LE, OK) {
@@ -159,21 +132,16 @@ TEST(ENFORCE_LE, OK) {
  PADDLE_ENFORCE_LE(2UL, 3.2);
 }
 TEST(ENFORCE_LE, FAIL) {
-  bool in_catch = false;
-
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE_GT(1, 2UL);

  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
-    const std::string msg = "enforce 1 > 2UL failed, 1 <= 2";
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
+    caught_exception = true;
+    EXPECT_TRUE(
+        HasPrefix(StringPiece(error.what()), "enforce 1 > 2UL failed, 1 <= 2"));
  }
-
-  ASSERT_TRUE(in_catch);
+  EXPECT_TRUE(caught_exception);
 }

 TEST(ENFORCE_LT, OK) {
@@ -182,21 +150,15 @@ TEST(ENFORCE_LT, OK) {
  PADDLE_ENFORCE_LT(2UL, 3);
 }
 TEST(ENFORCE_LT, FAIL) {
-  bool in_catch = false;
-
+  bool caught_exception = false;
  try {
    PADDLE_ENFORCE_LT(1UL, 0.12);
-
  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
-    const std::string msg = "enforce 1UL < 0.12 failed, 1 >= 0.12";
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
-    }
+    caught_exception = true;
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()),
+                          "enforce 1UL < 0.12 failed, 1 >= 0.12"));
  }
-
-  ASSERT_TRUE(in_catch);
+  EXPECT_TRUE(caught_exception);
 }

 TEST(ENFORCE_NOT_NULL, OK) {
@@ -205,20 +167,50 @@ TEST(ENFORCE_NOT_NULL, OK) {
  delete a;
 }
 TEST(ENFORCE_NOT_NULL, FAIL) {
-  bool in_catch = false;
-  int* a{nullptr};
-
+  bool caught_exception = false;
  try {
+    int* a = nullptr;
    PADDLE_ENFORCE_NOT_NULL(a);

  } catch (paddle::platform::EnforceNotMet error) {
-    in_catch = true;
-    const std::string msg = "a should not be null";
-    const char* what = error.what();
-    for (size_t i = 0; i < msg.length(); ++i) {
-      ASSERT_EQ(what[i], msg[i]);
+    caught_exception = true;
+    EXPECT_TRUE(HasPrefix(StringPiece(error.what()), "a should not be null"));
+  }
+  EXPECT_TRUE(caught_exception);
+}
+
+struct Dims {
+  size_t dims_[4];
+
+  bool operator==(const Dims& o) const {
+    for (size_t i = 0; i < 4; ++i) {
+      if (dims_[i] != o.dims_[i]) return false;
    }
+    return true;
  }
+};

-  ASSERT_TRUE(in_catch);
+std::ostream& operator<<(std::ostream& os, const Dims& d) {
+  for (size_t i = 0; i < 4; ++i) {
+    if (i == 0) {
+      os << "[";
+    }
+    os << d.dims_[i];
+    if (i == 4 - 1) {
+      os << "]";
+    } else {
+      os << ", ";
+    }
+  }
+  return os;
 }
+
+TEST(ENFORCE_USER_DEFINED_CLASS, EQ) {
+  Dims a{{1, 2, 3, 4}}, b{{1, 2, 3, 4}};
+  PADDLE_ENFORCE_EQ(a, b);
+}
+
+TEST(ENFORCE_USER_DEFINED_CLASS, NE) {
+  Dims a{{1, 2, 3, 4}}, b{{5, 6, 7, 8}};
+  ASSERT_THROW(PADDLE_ENFORCE_EQ(a, b), paddle::platform::EnforceNotMet);
+}
\ No newline at end of file
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -14,8 +14,8 @@ limitations under the License. */

 #pragma once

-#include <boost/variant.hpp>
 #include <iostream>
+#include "paddle/platform/variant.h"

 namespace paddle {
 namespace platform {

--- a/paddle/platform/variant.h
+++ b/paddle/platform/variant.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <boost/config.hpp>
+
+#ifndef PADDLE_ONLY_CPU
+
+// Because boost's variadic templates has bug on nvcc, boost will disable
+// variadic template support when GPU enabled on nvcc.
+// Define BOOST_NO_CXX11_VARIADIC_TEMPLATES on gcc/clang to generate same
+// function symbols.
+//
+// https://github.com/PaddlePaddle/Paddle/issues/3386
+#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
+#define BOOST_NO_CXX11_VARIADIC_TEMPLATES
+#endif
+#endif
+
+#include <boost/variant.hpp>
--- a/paddle/pserver/test/CMakeLists.txt
+++ b/paddle/pserver/test/CMakeLists.txt
@@ -3,7 +3,7 @@ add_unittest_without_exec(socket_test
    SocketTest.cpp)

 add_test(NAME socket_test
-    COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
        ${CMAKE_CURRENT_BINARY_DIR}/socket_test --loop_time=10)

 ####################### test_ProtoServer ####################
@@ -12,7 +12,7 @@ add_unittest_without_exec(test_ProtoServer

 IF(NOT ON_TRAVIS)
    add_test(NAME test_ProtoServer
-        COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port
+        COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
            ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer)
 ENDIF(NOT ON_TRAVIS)

@@ -24,5 +24,5 @@ ENDIF(NOT ON_TRAVIS)
 add_unittest_without_exec(test_ParameterServer2
    test_ParameterServer2.cpp)
 add_test(NAME test_ParameterServer2
-    COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port -n 4
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 4
        ${CMAKE_CURRENT_BINARY_DIR}/test_ParameterServer2)
--- a/paddle/pybind/CMakeLists.txt
+++ b/paddle/pybind/CMakeLists.txt
-cc_library(paddle_pybind SHARED
-    SRCS pybind.cc
-    DEPS pybind python backward
-	fc_op
-	sgd_op
-	add_op
-	mean_op
-	cross_entropy_op
-	recurrent_op
-	fill_zeros_like_op)
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -31,7 +31,7 @@ Configuring cmake in /paddle/build ...
      -DWITH_DOC=OFF
      -DWITH_GPU=${WITH_GPU:-OFF}
      -DWITH_AVX=${WITH_AVX:-OFF}
-      -DWITH_GOLANG=${WITH_GOLANG:-OFF}
+      -DWITH_GOLANG=${WITH_GOLANG:-ON}
      -DWITH_SWIG_PY=ON
      -DWITH_C_API=${WITH_C_API:-OFF}
      -DWITH_PYTHON=${WITH_PYTHON:-ON}
@@ -51,7 +51,7 @@ cmake .. \
      -DWITH_DOC=OFF \
      -DWITH_GPU=${WITH_GPU:-OFF} \
      -DWITH_AVX=${WITH_AVX:-OFF} \
-      -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
+      -DWITH_GOLANG=${WITH_GOLANG:-ON} \
      -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
      -DWITH_C_API=${WITH_C_API:-OFF} \
      -DWITH_PYTHON=${WITH_PYTHON:-ON} \
@@ -74,11 +74,11 @@ cat <<EOF
 Running unit tests ...
 ========================================
 EOF
+    ctest --output-on-failure
    # make install should also be test when unittest
    make install -j `nproc`
    pip install /usr/local/opt/paddle/share/wheels/*.whl
    paddle version
-    ctest --output-on-failure
 fi


@@ -130,7 +130,7 @@ fi

 # generate deb package for current build
 # FIXME(typhoonzero): should we remove paddle/scripts/deb ?
-if [[ ${WITH_DEB:-OFF} == "ON" ]]; then
+if [[ ${WITH_DEB:-ON} == "ON" ]]; then
    cat <<EOF
 ========================================
 Generating .deb package ...

--- a/paddle/scripts/travis/build_doc.sh
+++ b/paddle/scripts/travis/build_doc.sh
@@ -5,15 +5,9 @@ set -e
 mkdir -p $TRAVIS_BUILD_DIR/build
 cd $TRAVIS_BUILD_DIR/build

-# Compile paddle binaries first
-cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_DOC=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_GOLANG=ON -DWITH_STYLE_CHECK=OFF
-
-mkdir output
-make -j `nproc`
-find .. -name '*whl' | xargs pip install  # install all wheels.
-rm -rf *
 # Compile Documentation only.
 cmake .. -DCMAKE_BUILD_TYPE=Debug -DWITH_GPU=OFF -DWITH_MKLDNN=OFF -DWITH_MKLML=OFF -DWITH_DOC=ON
+make -j `nproc` gen_proto_py
 make -j `nproc` paddle_docs paddle_docs_cn

 # check websites for broken links
@@ -35,6 +29,7 @@ TARGET_BRANCH="gh-pages"
 SOURCE_BRANCH="master"

 # Clone the repo to output directory
+mkdir output
 git clone $REPO output
 cd output


--- a/paddle/string/CMakeLists.txt
+++ b/paddle/string/CMakeLists.txt
@@ -2,3 +2,4 @@ cc_library(stringpiece SRCS piece.cc)
 cc_test(stringpiece_test SRCS piece_test.cc DEPS stringpiece glog gflags)

 cc_test(stringprintf_test SRCS printf_test.cc DEPS glog gflags)
+cc_test(to_string_test SRCS to_string_test.cc)
--- a/paddle/string/to_string.h
+++ b/paddle/string/to_string.h
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+#include <sstream>
+#include <string>
+
+namespace paddle {
+namespace string {
+template <typename T>
+inline std::string to_string(T v) {
+  std::ostringstream sout;
+  sout << v;
+  return sout.str();
+}
+
+// Faster std::string/const char* type
+template <>
+inline std::string to_string(std::string v) {
+  return v;
+}
+
+template <>
+inline std::string to_string(const char* v) {
+  return std::string(v);
+}
+
+}  // namespace string
+}  // namespace paddle
--- a/paddle/string/to_string_test.cc
+++ b/paddle/string/to_string_test.cc
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#include "paddle/string/to_string.h"
+#include <gtest/gtest.h>
+
+constexpr char kOutputString[] = "User Defined Output";
+class UserDefinedClass {
+public:
+};
+
+std::ostream& operator<<(std::ostream& s, const UserDefinedClass& ins) {
+  s << kOutputString;
+  return s;
+}
+
+TEST(to_string, normal) {
+  using namespace paddle::string;
+  ASSERT_EQ("10", to_string(10));
+  ASSERT_EQ("abc", to_string("abc"));
+  ASSERT_EQ("1.2", to_string(1.2));
+}
+
+TEST(to_string, user_defined) {
+  using namespace paddle::string;
+  UserDefinedClass instance;
+  ASSERT_EQ(kOutputString, to_string(instance));
+}
\ No newline at end of file
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -50,8 +50,8 @@ void NewRemoteParameterUpdater::init(

  // create parameter server client.
  if (useEtcd_) {
-    parameterClient_ = paddle_new_etcd_pserver_client(
-        (char *)pserverSpec_.c_str(), FLAGS_trainer_id == 0);
+    parameterClient_ =
+        paddle_new_etcd_pserver_client((char *)pserverSpec_.c_str());
  } else {
    parameterClient_ = paddle_new_pserver_client((char *)pserverSpec_.c_str(),
                                                 FLAGS_trainer_id == 0);
@@ -66,28 +66,92 @@ void NewRemoteParameterUpdater::init(
  // from parameter server
  if (paddle_begin_init_params(parameterClient_)) {
    LOG(INFO) << "paddle_begin_init_params start";
+    // NOTE: convert V1 OptimizatioinConfig proto to V2 OptimizerConfig.
+    // This makes golang pserver compatible with handy V1 demos.
+    // TODO(wuyi): Refine or remove these ugly converting lines
+    OptimizerConfig optimizerConfigV2;
+    if (trainerConfig_.learning_method() == "momentum") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+    } else if (trainerConfig_.learning_method() == "adagrad") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
+      optimizerConfigV2.mutable_adagrad()->set_epsilon(
+          trainerConfig_.ada_epsilon());
+    } else if (trainerConfig_.learning_method() == "adadelta") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
+      optimizerConfigV2.mutable_adadelta()->set_epsilon(
+          trainerConfig_.ada_epsilon());
+      optimizerConfigV2.mutable_adadelta()->set_rho(trainerConfig_.ada_rou());
+    } else if (trainerConfig_.learning_method() == "adam") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adam);
+      optimizerConfigV2.mutable_adam()->set_beta_1(trainerConfig_.adam_beta1());
+      optimizerConfigV2.mutable_adam()->set_beta_2(trainerConfig_.adam_beta2());
+      optimizerConfigV2.mutable_adam()->set_epsilon(
+          trainerConfig_.adam_epsilon());
+    } else {
+      LOG(ERROR) << "got unsupported v1 optimizer config: "
+                 << trainerConfig_.learning_method();
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+    }
+
+    if (trainerConfig_.learning_rate_schedule() == "constant") {
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
+          trainerConfig_.learning_rate());
+    } else if (trainerConfig_.learning_rate_schedule() == "linear") {
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Linear);
+      optimizerConfigV2.mutable_linear_lr()->set_learning_rate(
+          trainerConfig_.learning_rate());
+      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_a(
+          trainerConfig_.learning_rate_decay_a());
+      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_b(
+          trainerConfig_.learning_rate_decay_b());
+    } else {
+      LOG(ERROR) << "got unsupported v1 learning_rate_schedule config: "
+                 << trainerConfig_.learning_rate_schedule() << ", set to const";
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+    }
+
+    // overwrite optimizerConfigV2 for per-parameter(layer) configs
    for (int i = 0; i < parameterSize(); ++i) {
      auto paramConfig = parameters_[i]->getConfig();
-      LOG(INFO) << "old param config: " << paramConfig.DebugString();
-      // FIXME(typhoonzero): convert old paramConfig to optimizerConfig
-      OptimizerConfig optimizeConfigV2;
-      auto sgdConfigV2 = optimizeConfigV2.mutable_sgd();
-      sgdConfigV2->set_momentum(paramConfig.momentum());
-      sgdConfigV2->set_decay(paramConfig.decay_rate());
-      optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
-      auto constlr = optimizeConfigV2.mutable_const_lr();
+      if (paramConfig.has_momentum() &&
+          trainerConfig_.learning_method() == "momentum") {
+        optimizerConfigV2.mutable_sgd()->set_momentum(paramConfig.momentum());
+      }
      if (paramConfig.has_learning_rate()) {
-        constlr->set_learning_rate(paramConfig.learning_rate());
-      } else {
-        constlr->set_learning_rate(trainerConfig_.learning_rate());
+        switch (optimizerConfigV2.lr_policy()) {
+          case 0:
+            optimizerConfigV2.mutable_const_lr()->set_learning_rate(
+                paramConfig.learning_rate());
+            break;
+          case 1:
+            optimizerConfigV2.mutable_linear_lr()->set_learning_rate(
+                paramConfig.learning_rate());
+            break;
+        }
      }
-      if (trainerConfig_.algorithm() == "sgd") {
-        optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
-        // FIXME: config all algorithms
-      } else {
-        optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+      if (paramConfig.has_decay_rate()) {
+        switch (optimizerConfigV2.optimizer()) {
+          case 1:  // SGD
+            optimizerConfigV2.mutable_sgd()->set_decay(
+                paramConfig.decay_rate());
+            break;
+          case 2:  // Adadelta
+            optimizerConfigV2.mutable_adadelta()->set_decay(
+                paramConfig.decay_rate());
+            break;
+          case 3:  // Adagrad
+            optimizerConfigV2.mutable_adagrad()->set_decay(
+                paramConfig.decay_rate());
+            break;
+          case 4:  // Adam
+            optimizerConfigV2.mutable_adam()->set_decay(
+                paramConfig.decay_rate());
+            break;
+        }
      }
-      std::string bytes = optimizeConfigV2.SerializeAsString();
+      // send param and config to pserver
+      std::string bytes = optimizerConfigV2.SerializeAsString();
      const char *array = bytes.data();
      int size = (int)bytes.size();
      paddle_init_param(

--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -28,6 +28,8 @@ DECLARE_bool(with_cost);
 DECLARE_bool(with_gpu);
 DECLARE_bool(parallel_nn);
 DECLARE_string(config_args);
+DECLARE_bool(use_mkldnn);
+DECLARE_bool(use_mkldnn_wgt);

 const char *kConfigParserModuleName = "paddle.trainer.config_parser";
 const char *kConfigParserFuncName = "parse_config_and_serialize";
@@ -44,6 +46,8 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
  configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local
             << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
             << ",parallel_nn=" << FLAGS_parallel_nn
+             << ",use_mkldnn=" << FLAGS_use_mkldnn
+             << ",use_mkldnn_wgt=" << FLAGS_use_mkldnn_wgt
             << ",cudnn_version=" << hl_get_cudnn_lib_version();
  if (!FLAGS_config_args.empty()) {
    configArgs << "," << FLAGS_config_args;

--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -2,19 +2,19 @@
 add_unittest_without_exec(test_Compare
    test_Compare.cpp)
 add_test(NAME test_Compare
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
        ${CMAKE_CURRENT_BINARY_DIR}/test_Compare
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)

 ################# test_Trainer ###########################
 add_unittest_without_exec(test_Trainer
    test_Trainer.cpp)
 add_test(NAME test_Trainer
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/gen_proto_data.py &&
-        ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/gen_proto_data.py &&
+        ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
        ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)

 ############### test_TrainerOnePass ##########################
 if(WITH_PYTHON)
@@ -23,60 +23,60 @@ if(WITH_PYTHON)
  add_unittest_without_exec(test_TrainerOnePass
      test_TrainerOnePass.cpp)
  add_test(NAME test_TrainerOnePass
-    COMMAND  ${PROJ_ROOT}/paddle/.set_python_path.sh -d
-          ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
-          ${PROJ_ROOT}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
-      WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    COMMAND  ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+          ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
+          ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
 ################ test_CompareTwoNets ######################
 add_unittest_without_exec(test_CompareTwoNets
    test_CompareTwoNets.cpp)
 add_test(NAME test_CompareTwoNets
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
            --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)

 ############### test_CompareTwoOpts ###################
 add_unittest_without_exec(test_CompareTwoOpts
    test_CompareTwoOpts.cpp)
 add_test(NAME test_CompareTwoOpts
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
        ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts
            --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf
            --num_passes=1 --need_high_accuracy=0
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)

 ################# test_CompareSparse ##################
 add_unittest_without_exec(test_CompareSparse
    test_CompareSparse.cpp)
 if(NOT ON_TRAVIS)
  add_test(NAME test_CompareSparse
-    COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
              ./.set_port.sh -p port -n 6
                  ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
 ################# test_recurrent_machine_generation ###############
 add_unittest_without_exec(test_recurrent_machine_generation
    test_recurrent_machine_generation.cpp)
 add_test(NAME test_recurrent_machine_generation
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
        ${CMAKE_CURRENT_BINARY_DIR}/test_recurrent_machine_generation
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)

 #################### test_PyDataProviderWrapper #########################
 add_unittest_without_exec(test_PyDataProviderWrapper
    test_PyDataProviderWrapper.cpp)

 add_test(NAME test_PyDataProviderWrapper
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d
-        ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+        ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
        ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProviderWrapper
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)

 #################### test_config_parser #########################
 add_test(NAME test_config_parser
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -20,6 +20,14 @@ DEFINE_bool(use_gpu, false, "Only support CPU training");
 DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
 #endif

+#ifdef PADDLE_USE_MKLDNN
+// TODO(TJ): change to true when MKLDNN layers support multi-inputs
+DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training");
+#else
+DEFINE_bool(use_mkldnn, false, "Only support CPU training");
+#endif
+
+DEFINE_bool(use_mkldnn_wgt, false, "Init weight from CPU weight");
 DEFINE_bool(parallel_nn,
            false,
            "Whether to use multi-threads to calculate one neural network."

--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
@@ -40,3 +40,5 @@ DECLARE_bool(show_layer_stat);
 DECLARE_string(predict_file);
 DECLARE_bool(prev_batch_state);
 DECLARE_string(init_model_path);
+DECLARE_bool(use_mkldnn);
+DECLARE_bool(use_mkldnn_wgt);
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -13,6 +13,6 @@ add_executable(
 link_paddle_exe(test_CustomStackTracePrint)
 if(NOT APPLE)
    add_test(NAME test_CustomStackTracePrint
-        COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh
+        COMMAND ${PADDLE_SOURCE_DIR}/paddle/utils/tests/test_CustomStackTracePrint.sh
        WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif()
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -9,15 +9,15 @@ foreach(filename ${proto_filenames})
    get_filename_component(ABS_FIL ${filename} ABSOLUTE)
    get_filename_component(FIL_WE ${filename} NAME_WE)
    set(CUR_PROTO_GEN_PY
-            ${PROJ_ROOT}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
+            ${PADDLE_SOURCE_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
    set(PROTO_GEN_PY
            ${CUR_PROTO_GEN_PY}
            ${PROTO_GEN_PY})
    add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
            COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-            ARGS "--python_out=${PROJ_ROOT}/python/paddle/proto"
+            ARGS "--python_out=${PADDLE_SOURCE_DIR}/python/paddle/proto"
            "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
-            DEPENDS ${ABS_FIL} ${external_project_dependencies})
+            DEPENDS ${ABS_FIL} protoc)
 endforeach()

 add_custom_target(gen_proto_py ALL DEPENDS ${PROTO_GEN_PY})
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
-set(OUTPUT_DIR
-    "${CMAKE_CURRENT_BINARY_DIR}/build")

 file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
 file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
@@ -18,7 +16,7 @@ SET(COPY_PADDLE_MASTER "")
 if(WITH_GOLANG)
  SET(COPY_PADDLE_MASTER "copy_paddle_master")
  add_custom_command(TARGET ${COPY_PADDLE_MASTER}
-    COMMAND cp ${paddle_master_LIB_PATH} ${PROJ_ROOT}/python/paddle/v2/master/
+    COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
    )
  add_dependencies(copy_paddle_master paddle_master)
 endif(WITH_GOLANG)
@@ -27,19 +25,21 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
    ${CMAKE_CURRENT_BINARY_DIR}/setup.py)


-add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/framework/core.so
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PROJ_ROOT}/python/paddle/v2/framework/core.so
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
        DEPENDS paddle_pybind)
-add_custom_target(copy_paddle_pybind ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/framework/core.so)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so)


-add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
+add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
    COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-    COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
+    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+    COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
+    COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
    DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})

 add_custom_target(paddle_python ALL DEPENDS
-    ${OUTPUT_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model python_api_wheel)
+    ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model python_api_wheel)

 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)


--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1604,6 +1604,8 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):

 @config_layer('fc')
 class FCLayer(LayerBase):
+    layer_type = 'fc'
+
    def __init__(self,
                 name,
                 size,
@@ -1611,14 +1613,27 @@ class FCLayer(LayerBase):
                 bias=True,
                 error_clipping_threshold=None,
                 **xargs):
-        super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        use_mkldnn_wgt = bool(
+            int(g_command_config_args.get("use_mkldnn_wgt", 0)))
+        if use_mkldnn:
+            self.layer_type = 'mkldnn_fc'
+            config_assert(
+                len(inputs) == 1,
+                "MkldnnFCLayer support one and only one input!")
+        super(FCLayer, self).__init__(
+            name, self.layer_type, size, inputs=inputs, **xargs)
        for input_index in xrange(len(self.inputs)):
            input_layer = self.get_input_layer(input_index)
            psize = self.config.size * input_layer.size
            dims = [input_layer.size, self.config.size]
            format = self.inputs[input_index].format
            sparse = format == "csr" or format == "csc"
-
+            if use_mkldnn:
+                config_assert(not sparse,
+                              "MkldnnFCLayer do not support sparse format yet")
+                if use_mkldnn_wgt:
+                    dims = [self.config.size, input_layer.size]
            if sparse:
                psize = self.inputs[input_index].nnz
            else:
@@ -1631,6 +1646,11 @@ class FCLayer(LayerBase):
            self.config.error_clipping_threshold = error_clipping_threshold


+@config_layer('mkldnn_fc')
+class MkldnnFcLayer(FCLayer):
+    layer_type = 'mkldnn_fc'
+
+
 @config_layer('selective_fc')
 class SelectiveFCLayer(LayerBase):
    def __init__(self,
@@ -3248,6 +3268,16 @@ class CTCLayer(LayerBase):
        config_assert(len(self.inputs) == 2, 'CTCLayer must have 2 inputs')


+@config_layer('kmax_seq_score')
+class KmaxSeqScoreLayer(LayerBase):
+    def __init__(self, name, inputs, beam_size, **xargs):
+        super(KmaxSeqScoreLayer, self).__init__(
+            name, 'kmax_seq_score', 0, inputs=inputs, **xargs)
+        config_assert(
+            len(self.inputs) == 1, 'KmaxSeqScoreLayer has only one input.')
+        self.config.beam_size = beam_size
+
+
 @config_layer('warp_ctc')
 class WarpCTCLayer(LayerBase):
    def __init__(self,

--- a/python/paddle/trainer_config_helpers/layers.py
+++ b/python/paddle/trainer_config_helpers/layers.py
@@ -132,6 +132,7 @@ __all__ = [
    'sub_nested_seq_layer',
    'clip_layer',
    'slice_projection',
+    'kmax_sequence_score_layer',
 ]


@@ -228,6 +229,8 @@ class LayerType(object):
    SUB_NESTED_SEQ = 'sub_nested_seq'
    CLIP_LAYER = 'clip'

+    KMAX_SEQ_SCORE = 'kmax_seq_score'
+
    @staticmethod
    def is_layer_type(type_name):
        """
@@ -6158,7 +6161,8 @@ def clip_layer(input, min, max, name=None):
    :type min: double
    :param max: The upper threshold for clipping.
    :type max: double
-    :return: LayerOutput
+    :return: LayerOutput object.
+    :rtype: LayerOutput
    """
    Layer(
        name=name,
@@ -6168,3 +6172,41 @@ def clip_layer(input, min, max, name=None):
        max=max)
    return LayerOutput(
        name, LayerType.CLIP_LAYER, parents=[input], size=input.size)
+
+
+@wrap_name_default()
+@layer_support()
+def kmax_sequence_score_layer(input, name=None, beam_size=1):
+    """
+    This layer accepts one input which are scores over a sequence or a nested
+    sequence, and returns indices of beam_size sequences with highest scores.
+
+    .. code-block:: python
+
+        kmax_indices = kmax_sequence_score_layer(input=input_layer, beam_size)
+
+
+    :param name: The Layer Name.
+    :type name: basestring
+    :param input: The input layer. It stores scores over a sequence or a nested
+        sequence and its size must be 1.
+    :type input: LayerOutput.
+    :param beam_size: squence indices with top beam_size scores are returned.
+    :type beam_size: double
+    :return: LayerOutput object.
+    :rtype: LayerOutput
+    """
+    assert isinstance(input, LayerOutput), ("kmax_sequence_score_layer "
+                                            "accepts only one input.")
+    assert input.size == 1, (
+        "input of kmax_sequence_score_layer is a score"
+        "over a sequence or a nested sequence, so its width must be 1.")
+
+    Layer(
+        name=name,
+        type=LayerType.KMAX_SEQ_SCORE,
+        inputs=[input.name],
+        beam_size=beam_size)
+
+    return LayerOutput(
+        name, LayerType.KMAX_SEQ_SCORE, parents=[input], size=input.size)
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
 #################### test_config_parser #########################
 add_test(NAME layers_test
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)

 add_test(NAME test_reset_hook
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)

 add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
 add_test(NAME test_layerHelpers
  COMMAND
-  ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
+  ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
  ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
 )
--- a/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
+++ b/python/paddle/trainer_config_helpers/tests/configs/file_list.sh
@@ -8,6 +8,6 @@ test_spp_layer test_bilinear_interp test_maxout test_bi_grumemory math_ops
 test_seq_concat_reshape test_pad test_smooth_l1 test_multiplex_layer
 test_prelu_layer test_row_conv test_detection_output_layer test_multibox_loss_layer
 test_recursive_topology test_gated_unit_layer test_clip_layer test_row_l2_norm_layer
-test_seq_select_layers)
+test_kmax_seq_socre_layer test_seq_select_layers)

 export whole_configs=(test_split_datasource)
--- a/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
+++ b/python/paddle/trainer_config_helpers/tests/configs/protostr/test_kmax_seq_socre_layer.protostr
+type: "nn"
+layers {
+  name: "input"
+  type: "data"
+  size: 300
+  active_type: ""
+}
+layers {
+  name: "data"
+  type: "data"
+  size: 128
+  active_type: ""
+}
+layers {
+  name: "__fc_layer_0__"
+  type: "fc"
+  size: 1
+  active_type: "exponential"
+  inputs {
+    input_layer_name: "data"
+    input_parameter_name: "___fc_layer_0__.w0"
+  }
+  bias_parameter_name: "___fc_layer_0__.wbias"
+}
+layers {
+  name: "__kmax_sequence_score_layer_0__"
+  type: "kmax_seq_score"
+  active_type: ""
+  inputs {
+    input_layer_name: "__fc_layer_0__"
+  }
+  beam_size: 5
+}
+parameters {
+  name: "___fc_layer_0__.w0"
+  size: 128
+  initial_mean: 0.0
+  initial_std: 0.0883883476483
+  dims: 128
+  dims: 1
+  initial_strategy: 0
+  initial_smart: true
+}
+parameters {
+  name: "___fc_layer_0__.wbias"
+  size: 1
+  initial_mean: 0.0
+  initial_std: 0.0
+  dims: 1
+  dims: 1
+  initial_strategy: 0
+  initial_smart: false
+}
+input_layer_names: "data"
+output_layer_names: "__kmax_sequence_score_layer_0__"
+sub_models {
+  name: "root"
+  layer_names: "input"
+  layer_names: "data"
+  layer_names: "__fc_layer_0__"
+  layer_names: "__kmax_sequence_score_layer_0__"
+  input_layer_names: "data"
+  output_layer_names: "__kmax_sequence_score_layer_0__"
+  is_recurrent_layer_group: false
+}
+
--- a/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
+++ b/python/paddle/trainer_config_helpers/tests/configs/test_kmax_seq_socre_layer.py
+#!/usr/bin/env python
+#coding=utf-8
+from paddle.trainer_config_helpers import *
+
+data = data_layer(name='input', size=300)
+
+data = data_layer(name="data", size=128)
+scores = fc_layer(input=data, size=1, act=ExpActivation())
+kmax_seq_id = kmax_sequence_score_layer(input=scores, beam_size=5)
+
+outputs(kmax_seq_id)
--- a/python/paddle/v2/framework/.gitignore
+++ b/python/paddle/v2/framework/.gitignore
+proto
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
 py_test(test_net SRCS test_net.py)

-py_test(test_fc_op SRCS test_fc_op.py)
 py_test(test_scope SRCS test_scope.py)

 py_test(test_tensor SRCS test_tensor.py)
@@ -13,6 +12,7 @@ py_test(test_protobuf SRCS test_protobuf.py)
 py_test(test_add_two_op SRCS test_add_two_op.py)
 py_test(test_sigmoid_op SRCS test_sigmoid_op.py)
 py_test(test_softmax_op SRCS test_softmax_op.py)
+py_test(test_cross_entropy_op SRCS test_cross_entropy_op.py)
 py_test(test_fill_zeros_like_op SRCS test_fill_zeros_like_op.py)

 py_test(gradient_checker SRCS gradient_checker.py)
@@ -20,4 +20,7 @@ py_test(gradient_checker SRCS gradient_checker.py)
 py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)

 py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
+
 py_test(test_operator SRCS test_operator.py)
+# py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
+py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
--- a/python/paddle/v2/framework/tests/gradient_checker.py
+++ b/python/paddle/v2/framework/tests/gradient_checker.py
+import unittest
+
+import numpy
 import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator
-import numpy
-import unittest

 __all__ = ['get_numeric_gradient']


+def create_op(op_type):
+    kwargs = dict()
+    for in_name in Operator.get_op_input_names(op_type):
+        kwargs[in_name] = in_name
+    for out_name in Operator.get_op_output_names(op_type):
+        kwargs[out_name] = out_name
+
+    return Operator(op_type, **kwargs)
+
+
+def grad_var_name(var_name):
+    return var_name + "@GRAD"
+
+
 def get_numeric_gradient(op,
                         input_values,
                         output_name,
                         input_to_check,
-                         delta=1e-2,
+                         delta=0.005,
                         local_scope=None):
    """
    Get Numeric Gradient for an operator's input.
@@ -58,24 +73,151 @@ def get_numeric_gradient(op,
    def product(dim):
        return reduce(lambda a, b: a * b, dim, 1)

+    # get the input tensor that we want to get it's numeric gradient.
    tensor_to_check = local_scope.find_var(input_to_check).get_tensor()
    tensor_size = product(tensor_to_check.get_dims())
+    # prepare a numpy array to store the gradient.
    gradient_flat = numpy.zeros(shape=(tensor_size, ), dtype='float32')
+
+    # we only compute gradient of one element each time.
+    # we use a for loop to compute the gradient of every element.
    for i in xrange(tensor_size):
+        # get one input element throw it's index i.
        origin = tensor_to_check.get_float_element(i)
+
+        # add delta to it, run op and then get the sum of the result tensor.
        x_pos = origin + delta
        tensor_to_check.set_float_element(i, x_pos)
        y_pos = get_output()

+        # plus delta to this element, run op and get the sum of the result tensor.
        x_neg = origin - delta
        tensor_to_check.set_float_element(i, x_neg)
        y_neg = get_output()

-        tensor_to_check.set_float_element(i, origin)  # restore old value
+        # restore old value
+        tensor_to_check.set_float_element(i, origin)
+
+        # compute the gradient of this element and store it into a numpy array.
        gradient_flat[i] = (y_pos - y_neg) / delta / 2
+
+    # reshape the gradient result to the shape of the source tensor.
    return gradient_flat.reshape(tensor_to_check.get_dims())


+class GradientChecker(unittest.TestCase):
+    def assert_is_close(self, numeric_grads, scope, max_relative_error,
+                        msg_prefix):
+        for name in numeric_grads:
+            b = numpy.array(scope.find_var(grad_var_name(name)).get_tensor())
+            a = numeric_grads[name]
+
+            abs_a = numpy.abs(a)
+            # if abs_a is nearly zero, then use abs error for a, not relative
+            # error.
+            abs_a[abs_a < 1e-3] = 1
+
+            diff_mat = numpy.abs(a - b) / abs_a
+            max_diff = numpy.max(diff_mat)
+
+            def err_msg():
+                offset = numpy.argmax(diff_mat > max_relative_error)
+                return "%s Variable %s max gradient diff %f over limit %f, the first " \
+                       "error element is %d" % (
+                       msg_prefix, name, max_diff, max_relative_error, offset)
+
+            self.assertLessEqual(max_diff, max_relative_error, err_msg())
+
+    def check_grad(self,
+                   forward_op,
+                   input_vars,
+                   inputs_to_check,
+                   output_name,
+                   no_grad_set=None,
+                   only_cpu=False,
+                   max_relative_error=0.005):
+        """
+        :param forward_op: used to create backward_op
+        :param input_vars: numpy value of input variable. The following
+            computation will use these variables.
+        :param inputs_to_check: inputs var names that should check gradient.
+        :param output_name: output name that used to
+        :param max_relative_error: The relative tolerance parameter.
+        :param no_grad_set: used when create backward ops
+        :param only_cpu: only compute and check gradient on cpu kernel.
+        :return:
+        """
+        if no_grad_set is None:
+            no_grad_set = set()
+
+        tmp_outs = forward_op.temp_outputs()
+        no_tmp_out = filter(lambda name: name not in tmp_outs,
+                            forward_op.outputs())
+        if len(no_tmp_out) != 1:
+            raise ValueError("non temp out_names should be 1")
+
+        in_names = forward_op.inputs()
+        for no_grad in no_grad_set:
+            if no_grad not in in_names:
+                raise ValueError("no_grad should be in in_names")
+
+        backward_op = core.Operator.backward(forward_op, no_grad_set)
+
+        places = [core.CPUPlace()]
+        if not only_cpu and core.is_compile_gpu() and backward_op.support_gpu():
+            places.append(core.GPUPlace(0))
+
+        numeric_grad = dict()
+        # get numeric gradient
+        for check_name in inputs_to_check:
+            numeric_grad[check_name] = \
+                get_numeric_gradient(forward_op, input_vars, output_name,
+                                     check_name)
+
+        # get operator gradient according to different device
+        for place in places:
+            scope = core.Scope()
+            ctx = core.DeviceContext.create(place)
+
+            # create input var and set value
+            for name, value in input_vars.iteritems():
+                if name not in in_names:
+                    raise ValueError(name + " not in op.inputs_")
+                var = scope.new_var(name).get_tensor()
+                var.set_dims(value.shape)
+                var.set(value, place)
+
+            # create output var
+            for out_name in forward_op.outputs():
+                scope.new_var(out_name).get_tensor()
+
+            # infer the shape of output var and compute/set value of output var
+            forward_op.infer_shape(scope)
+            forward_op.run(scope, ctx)
+
+            # create output grad var
+            # set shape as the output var
+            # set value of this grad to ones
+            for name in forward_op.outputs():
+                out_tensor = scope.find_var(name).get_tensor()
+                grad_tensor = scope.new_var(grad_var_name(name)).get_tensor()
+                grad_tensor.set_dims(out_tensor.shape())
+                data = 1.0 * numpy.ones(out_tensor.shape())
+                grad_tensor.set(data, place)
+
+            # create input grad var
+            for name in backward_op.outputs():
+                scope.new_var(name).get_tensor()
+
+            # infer the shape of input gradient var and compute/set it's value
+            # with backward op
+            backward_op.infer_shape(scope)
+            backward_op.run(scope, ctx)
+
+            self.assert_is_close(numeric_grad, scope, max_relative_error,
+                                 "Gradient Check On %s" % str(place))
+
+
 if __name__ == '__main__':

    class GetNumericGradientTest(unittest.TestCase):
@@ -87,4 +229,28 @@ if __name__ == '__main__':
            arr = get_numeric_gradient(add_op, {'X': x, "Y": y}, 'Z', 'X')
            self.assertAlmostEqual(arr.mean(), 1.0, delta=1e-2)

+        def test_softmax_op(self):
+            def stable_softmax(x):
+                """Compute the softmax of vector x in a numerically stable way."""
+                shiftx = x - numpy.max(x)
+                exps = numpy.exp(shiftx)
+                return exps / numpy.sum(exps)
+
+            def label_softmax_grad(Y, dY):
+                dX = Y * 0.0
+                for i in range(Y.shape[0]):
+                    d = numpy.dot(Y[i, :], dY[i, :])
+                    dX[i, :] = Y[i, :] * (dY[i, :] - d)
+                return dX
+
+            softmax_op = Operator("softmax", X="X", Y="Y")
+
+            X = numpy.random.random((2, 2)).astype("float32")
+            Y = numpy.apply_along_axis(stable_softmax, 1, X)
+            dY = numpy.ones(Y.shape)
+            dX = label_softmax_grad(Y, dY)
+
+            arr = get_numeric_gradient(softmax_op, {"X": X}, 'Y', 'X')
+            numpy.testing.assert_almost_equal(arr, dX, decimal=1e-2)
+
    unittest.main()
--- a/python/paddle/v2/framework/tests/op_test_util.py
+++ b/python/paddle/v2/framework/tests/op_test_util.py
-import paddle.v2.framework.core as core
-import unittest
 import numpy
+import paddle.v2.framework.core as core
 from paddle.v2.framework.op import Operator


@@ -24,7 +23,7 @@ class OpTestMeta(type):
            scope = core.Scope()
            kwargs = dict()
            places = [core.CPUPlace()]
-            if core.is_compile_gpu() and core.Operator.support_gpu(self.type):
+            if core.is_compile_gpu():
                places.append(core.GPUPlace(0))

            for place in places:
@@ -53,6 +52,8 @@ class OpTestMeta(type):
                        kwargs[attr_name] = self.attrs[attr_name]

                op = Operator(self.type, **kwargs)
+                if isinstance(place, core.GPUPlace) and not op.support_gpu():
+                    return

                op.infer_shape(scope)


--- a/python/paddle/v2/framework/tests/test_cross_entropy_op.py
+++ b/python/paddle/v2/framework/tests/test_cross_entropy_op.py
 import unittest
 import numpy
 from op_test_util import OpTestMeta
+from gradient_checker import GradientChecker, create_op


-class TestSGD(unittest.TestCase):
+class TestCrossEntropy(unittest.TestCase):
    __metaclass__ = OpTestMeta

    def setUp(self):
@@ -20,7 +21,18 @@ class TestSGD(unittest.TestCase):
        self.outputs = {'Y': numpy.array(Y).astype("float32")}


-# TODO(superjom) add gradient check
+class CrossEntropyGradOpTest(GradientChecker):
+    def test_softmax_grad(self):
+        op = create_op("onehot_cross_entropy")
+        batch_size = 100
+        class_num = 10
+        inputs = {
+            "X": numpy.random.uniform(
+                0.1, 1.0, [batch_size, class_num]).astype("float32"),
+            "label": (class_num / 2) * numpy.ones(batch_size).astype("int32")
+        }
+        self.check_grad(op, inputs, set("X"), "Y")
+

 if __name__ == "__main__":
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_fc_op.py
+++ b/python/paddle/v2/framework/tests/test_fc_op.py
@@ -40,7 +40,5 @@ class TestFc(unittest.TestCase):
        self.assertTrue(np.allclose(py_data, op_data))


-
-
 if __name__ == '__main__':
    unittest.main()
--- a/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+++ b/python/paddle/v2/framework/tests/test_gaussian_random_op.py
+import unittest
+import paddle.v2.framework.core as core
+from paddle.v2.framework.op import Operator
+import numpy
+
+
+class GaussianRandomTest(unittest.TestCase):
+    def test_cpu(self):
+        self.gaussian_random_test(place=core.CPUPlace())
+
+    def test_gpu(self):
+        if core.is_compile_gpu():
+            self.gaussian_random_test(place=core.GPUPlace(0))
+
+    def gaussian_random_test(self, place):
+        scope = core.Scope()
+        scope.new_var("Out").get_tensor()
+
+        op = Operator(
+            "gaussian_random",
+            Out="Out",
+            dims=[1000, 784],
+            mean=.0,
+            std=1.,
+            seed=10)
+
+        op.infer_shape(scope)
+        context = core.DeviceContext.create(place)
+        op.run(scope, context)
+        tensor = numpy.array(scope.find_var("Out").get_tensor())
+        self.assertAlmostEqual(numpy.mean(tensor), .0, delta=0.1)
+        self.assertAlmostEqual(numpy.std(tensor), 1., delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
@@ -3,6 +3,15 @@ from paddle.v2.framework.op import Operator
 import unittest


+def fc(X, W, Y):
+    ret_v = core.Net.create()
+
+    ret_v.add_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
+    ret_v.add_op(Operator("sigmoid", X="pre_activation", Y=Y))
+    ret_v.complete_add_op(True)
+    return ret_v
+
+
 class TestNet(unittest.TestCase):
    def test_net_all(self):
        net = core.Net.create()
@@ -10,18 +19,18 @@ class TestNet(unittest.TestCase):
        net.add_op(op1)

        net2 = core.Net.create()
-        net2.add_op(Operator("fc", X="X", W="w", Y="fc.out"))
+        net2.add_op(fc(X="X", W="w", Y="fc.out"))
        net2.complete_add_op(True)
        net.add_op(net2)
        net.complete_add_op(True)

        expected = '''
-Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out).
+Op(plain_net), inputs:(W, X, Y), outputs:(Out, fc.out, pre_activation).
    Op(add_two), inputs:(X, Y), outputs:(Out).
-    Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out).
-        Op(fc), inputs:(X, w, @EMPTY@), outputs:(fc.out, @TEMP@fc@0).
-            Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0).
-            Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out).
+    Op(plain_net), inputs:(W, X), outputs:(fc.out, pre_activation).
+        Op(plain_net), inputs:(W, X), outputs:(fc.out, pre_activation).
+            Op(mul), inputs:(X, W), outputs:(pre_activation).
+            Op(sigmoid), inputs:(pre_activation), outputs:(fc.out).
 '''
        self.assertEqual(expected, "\n" + str(net))


--- a/python/paddle/v2/framework/tests/test_softmax_op.py
+++ b/python/paddle/v2/framework/tests/test_softmax_op.py
 import unittest

 import numpy as np
-import paddle.v2.framework.core as core
-from paddle.v2.framework.op import Operator

+from gradient_checker import GradientChecker, create_op
 from op_test_util import OpTestMeta


@@ -25,62 +24,11 @@ class TestSoftmaxOp(unittest.TestCase):
        }


-class TestSoftmaxGradOp(unittest.TestCase):
-    def test_softmax_grad(self):
-        op = Operator('softmax', X="X", Y="Y")
-        backward_op = core.Operator.backward(op, set())
-        self.assertEqual(backward_op.type(), "softmax_grad")
-        expected = '''Op(softmax_grad), inputs:(X, Y, Y@GRAD), outputs:(X@GRAD).'''
-        self.assertEqual(expected, str(backward_op))
-
-        batch_size = 3
-        class_num = 5
-        # Initialize X and add 1e-2 for numerical stability
-        Y = np.random.rand(batch_size, class_num).astype(np.float32)
-        Y = Y + 1e-2
-        dY = np.random.rand(batch_size, class_num).astype(np.float32)
-
-        # Reference implementation of cross entropy with soft labels
-        def label_softmax_grad(Y, dY):
-            dX = Y * 0.0
-            for i in range(batch_size):
-                d = np.dot(Y[i, :], dY[i, :])
-                dX[i, :] = Y[i, :] * (dY[i, :] - d)
-            return dX
-
-        expected = label_softmax_grad(Y, dY)
-
-        scope = core.Scope()
-        places = []
-        places.append(core.CPUPlace())
-        if core.is_compile_gpu():
-            places.append(core.GPUPlace(0))
-
-        for place in places:
-            y = scope.new_var("Y")
-            y_tensor = y.get_tensor()
-            y_tensor.set_dims([batch_size, class_num])
-            y_tensor.alloc_float(place)
-            y_tensor.set(Y, place)
-
-            dy = scope.new_var("Y@GRAD")
-            dy_tensor = dy.get_tensor()
-            dy_tensor.set_dims([batch_size, class_num])
-            dy_tensor.alloc_float(place)
-            dy_tensor.set(dY, place)
-
-            x = scope.new_var("X")
-            dx = scope.new_var("X@GRAD")
-
-            tensor = scope.find_var("X@GRAD").get_tensor()
-            backward_op.infer_shape(scope)
-            self.assertEqual([batch_size, class_num], tensor.shape())
-
-            ctx = core.DeviceContext.create(place)
-            backward_op.run(scope, ctx)
-            actual = np.array(tensor)
-
-            np.testing.assert_almost_equal(actual, expected, decimal=3)
+class SoftmaxGradOpTest(GradientChecker):
+    def test_softmax(self):
+        op = create_op("softmax")
+        inputs = {"X": np.random.uniform(0.1, 1, [10, 10]).astype("float32")}
+        self.check_grad(op, inputs, set("X"), "Y")


 if __name__ == '__main__':

--- a/python/paddle/v2/framework/tests/test_uniform_random_op.py
+++ b/python/paddle/v2/framework/tests/test_uniform_random_op.py
+import unittest
+from paddle.v2.framework.op import Operator
+import paddle.v2.framework.core as core
+import numpy
+
+
+class UniformRandomTest(unittest.TestCase):
+    def test_uniform_random_cpu(self):
+        self.uniform_random_test(place=core.CPUPlace())
+
+    def test_uniform_random_gpu(self):
+        if core.is_compile_gpu():
+            self.uniform_random_test(place=core.GPUPlace(0))
+
+    def uniform_random_test(self, place):
+        scope = core.Scope()
+        scope.new_var("X").get_tensor()
+
+        op = Operator(
+            "uniform_random",
+            Out="X",
+            dims=[1000, 784],
+            min=-5.0,
+            max=10.0,
+            seed=10)
+
+        op.infer_shape(scope)
+        ctx = core.DeviceContext.create(place)
+        op.run(scope, ctx)
+        tensor = numpy.array(scope.find_var("X").get_tensor())
+        self.assertAlmostEqual(tensor.mean(), 2.5, delta=0.1)
+
+
+if __name__ == '__main__':
+    unittest.main()
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
-import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
-import paddle.trainer_config_helpers.optimizers as v1_optimizers
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 Optimizers(update equation) for SGD method.

-TODO(zhihong) : create new optimizer with proto config, add new optimizer here
-
 TODO(yuyang18): Complete comments.
 """

+import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
+import paddle.trainer_config_helpers.optimizers as v1_optimizers
+from paddle.proto.OptimizerConfig_pb2 import OptimizerConfig
+
 __all__ = [
    'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta',
    'RMSProp', 'ModelAverage', 'L2Regularization'
@@ -70,7 +83,8 @@ class Optimizer(object):
                        gradient_machine.prefetch(in_args)
                        parameter_updater.getParametersRemote()

-        :param pserver_spec: pserver location, eg: localhost:3000
+        :param pserver_spec: pserver location, eg: localhost:3000, if use etcd,
+        pserver_spec should be the etcd endpoints, eg: http://localhost:2379
        :return: parameter_updater
        """
        if is_local:

--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
 import paddle.trainer.config_parser as cp
@@ -113,16 +127,7 @@ class Parameters(object):
        """
        return iter(self.__param_conf__)

-    def __getitem__(self, key):
-        """
-        Get parameter by parameter name. It uses Python dict syntax.
-
-        :note: It will always copy the parameter from C++ side.
-        :param key: Parameter name
-        :type key: basestring
-        :return: parameter value
-        :rtype: np.ndarray
-        """
+    def __getter_inner(self, key, param_type):
        import py_paddle.swig_paddle as api
        shape = self.get_shape(key)

@@ -138,7 +143,7 @@ class Parameters(object):
                    each_gradient_machine, key)
                # for simplify implementation now, we always copy from C++
                assert isinstance(param, api.Parameter)
-                val = param.getBuf(api.PARAMETER_VALUE)
+                val = param.getBuf(param_type)
                assert isinstance(val, api.Vector)
                val = val.copyToNumpyArray()
                return val
@@ -146,6 +151,19 @@ class Parameters(object):

            raise RuntimeError("Unexpected branch")

+    def __getitem__(self, key):
+        """
+        Get parameter by parameter name. It uses Python dict syntax.
+
+        :note: It will always copy the parameter from C++ side.
+        :param key: Parameter name
+        :type key: basestring
+        :return: parameter value
+        :rtype: np.ndarray
+        """
+        import py_paddle.swig_paddle as api
+        return self.__getter_inner(key, api.PARAMETER_VALUE)
+
    def get_shape(self, key):
        """
        get shape of the parameter.
@@ -202,6 +220,19 @@ class Parameters(object):
        """
        return self.__getitem__(key=parameter_name)

+    def get_grad(self, key):
+        """
+        Get grandient by parameter name.
+
+        :note: It will always copy the parameter from C++ side.
+        :param key: parameter name
+        :type key: basestring
+        :return: The grandient matrix.
+        :rtype: np.ndarray
+        """
+        import py_paddle.swig_paddle as api
+        return self.__getter_inner(key, api.PARAMETER_GRADIENT)
+
    def set(self, parameter_name, value):
        """
        Set parameter by parameter name & matrix.
@@ -250,7 +281,13 @@ class Parameters(object):
        size = reduce(lambda a, b: a * b, param.shape)
        f.write(struct.pack("IIQ", 0, 4, size))
        param = param.astype(np.float32)
-        f.write(param.tostring())
+        s = param.tostring()
+        wrote_size = 0
+        buf = buffer(s, wrote_size, 65535)
+        while buf:  # f.write crashes with big data blog.
+            f.write(buf)
+            wrote_size += 65535
+            buf = buffer(s, wrote_size, 65535)

    def deserialize(self, name, f):
        """

--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -161,14 +161,14 @@ class SGD(object):
                    self.__parameter_updater__.update(each_param)
                cost_sum = out_args.sum()
                cost = cost_sum / len(data_batch)
-                self.__parameter_updater__.finishBatch(cost)
-                batch_evaluator.finish()
                event_handler(
                    v2_event.EndIteration(
                        pass_id=pass_id,
                        batch_id=batch_id,
                        cost=cost,
                        evaluator=batch_evaluator))
+                self.__parameter_updater__.finishBatch(cost)
+                batch_evaluator.finish()

            self.__parameter_updater__.finishPass()
            pass_evaluator.finish()

--- a/python/requirements.txt
+++ b/python/requirements.txt
+requests==2.9.2
+numpy>=1.12
+protobuf==3.1
+recordio
+matplotlib
+rarfile
+scipy>=0.19.0
+Pillow
+nltk>=3.2.2
--- a/python/setup.py.in
+++ b/python/setup.py.in
 from setuptools import setup, Distribution
-
 class BinaryDistribution(Distribution):
    def has_ext_modules(foo):
        return True
@@ -18,15 +17,8 @@ packages=['paddle',
          'paddle.v2.framework.proto',
          'py_paddle']

-setup_requires=["requests",
-                "numpy>=1.12",
-                "protobuf==3.1",
-                "recordio",
-                "matplotlib",
-                "rarfile",
-                "scipy>=0.19.0",
-                "Pillow",
-                "nltk>=3.2.2"]
+with open('@PADDLE_SOURCE_DIR@/python/requirements.txt') as f:
+    setup_requires = f.read().splitlines()

 if '${CMAKE_SYSTEM_PROCESSOR}' not in ['arm', 'armv7-a', 'aarch64']:
    setup_requires+=["opencv-python"]
@@ -45,14 +37,14 @@ setup(name='paddlepaddle',
          '': '${CMAKE_CURRENT_SOURCE_DIR}',
          # The paddle.v2.framework.proto will be generated while compiling.
          # So that package points to other directory.
-          'paddle.v2.framework.proto': '${PROJ_BINARY_ROOT}/paddle/framework',
-          'py_paddle': '${PROJ_ROOT}/paddle/py_paddle'
+          'paddle.v2.framework.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
+          'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
      },
-      scripts=['${PROJ_BINARY_ROOT}/paddle/scripts/paddle'],
+      scripts=['${PADDLE_BINARY_DIR}/paddle/scripts/paddle'],
      distclass=BinaryDistribution,
      data_files=[('/usr/local/opt/paddle/bin',
-                       ['${PROJ_BINARY_ROOT}/paddle/scripts/paddle_usage',
-                        '${PROJ_BINARY_ROOT}/paddle/trainer/paddle_trainer',
-                        '${PROJ_BINARY_ROOT}/paddle/trainer/paddle_merge_model',
-                        '${PROJ_BINARY_ROOT}/paddle/pserver/paddle_pserver_main'])]
+                       ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle_usage',
+                        '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
+                        '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
+                        '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main'])]
 )