diff --git a/CMakeLists.txt b/CMakeLists.txt
index b174831109372cb014741d63032fa6a470e74042..c75b83e50cf9cef8290c37f88b38cdc3d77df39c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -14,8 +14,8 @@
 
 cmake_minimum_required(VERSION 3.0)
 set(CMAKE_MODULE_PATH ${CMAKE_MODULE_PATH} "${CMAKE_CURRENT_SOURCE_DIR}/cmake")
-set(PROJ_ROOT ${CMAKE_CURRENT_SOURCE_DIR})
-set(PROJ_BINARY_ROOT ${CMAKE_CURRENT_BINARY_DIR})
+set(PADDLE_SOURCE_DIR ${CMAKE_CURRENT_SOURCE_DIR})
+set(PADDLE_BINARY_DIR ${CMAKE_CURRENT_BINARY_DIR})
 
 include(system)
 
@@ -36,8 +36,8 @@ include(simd)
 ################################ Configurations #######################################
 option(WITH_GPU         "Compile PaddlePaddle with NVIDIA GPU"          ${CUDA_FOUND})
 option(WITH_AVX         "Compile PaddlePaddle with AVX intrinsics"      ${AVX_FOUND})
-option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    ${AVX_FOUND})
-option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      ${AVX_FOUND})
+option(WITH_MKLDNN      "Compile PaddlePaddle with mkl-dnn support."    OFF)
+option(WITH_MKLML       "Compile PaddlePaddle with mklml package."      OFF)
 option(WITH_DSO         "Compile PaddlePaddle with dynamic linked CUDA" ON)
 option(WITH_TESTING     "Compile PaddlePaddle with unit testing"        ON)
 option(WITH_SWIG_PY     "Compile PaddlePaddle with inference api"       ON)
@@ -121,8 +121,8 @@ include(version)            # set PADDLE_VERSION
 include(coveralls)          # set code coverage
 
 
-include_directories("${PROJ_ROOT}")
-include_directories("${PROJ_ROOT}/paddle/cuda/include")
+include_directories("${PADDLE_SOURCE_DIR}")
+include_directories("${PADDLE_SOURCE_DIR}/paddle/cuda/include")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/proto")
 include_directories("${CMAKE_CURRENT_BINARY_DIR}/go/pserver/client/c")
 include_directories(${Boost_INCLUDE_DIRS})
@@ -144,7 +144,7 @@ if(WITH_GPU)
 endif(WITH_GPU)
 
 if(WITH_MKLDNN)
-    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIBRARY} ${MKLDNN_IOMP_LIB})
+    list(APPEND EXTERNAL_LIBS ${MKLDNN_LIB} ${MKLDNN_IOMP_LIB})
 endif()
 
 if(USE_NNPACK)
@@ -164,10 +164,12 @@ if(WITH_GOLANG)
     add_subdirectory(go)
 endif(WITH_GOLANG)
 
+set(PADDLE_PYTHON_BUILD_DIR "${CMAKE_CURRENT_BINARY_DIR}/python/build")
 add_subdirectory(paddle)
 if(WITH_PYTHON)
   add_subdirectory(python)
 endif()
+
 if(WITH_DOC)
     add_subdirectory(doc)
 endif()
diff --git a/Dockerfile b/Dockerfile
index 8ac123bf9c0f24b47b741611f3b80213c61b82e9..41b6729124228cec16be35d9b26da8042824b0b0 100644
--- a/Dockerfile
+++ b/Dockerfile
@@ -64,13 +64,28 @@ RUN pip install --upgrade pip && \
     pip install -U sphinx-rtd-theme==0.1.9 recommonmark && \
     pip install pre-commit 'requests==2.9.2' 'ipython==5.3.0' && \
     pip install 'ipykernel==4.6.0' 'jupyter==1.0.0' && \
-    pip install rarfile
+    pip install opencv-python rarfile 'scipy>=0.19.0' 'nltk>=3.2.2'
 
 # To fix https://github.com/PaddlePaddle/Paddle/issues/1954, we use
 # the solution in https://urllib3.readthedocs.io/en/latest/user-guide.html#ssl-py2
 RUN apt-get install -y libssl-dev libffi-dev
 RUN pip install certifi urllib3[secure]
 
+# TODO(qijun) The template library Eigen doesn't work well with the GCC 5
+# that ships with the default Docker image, so we switch to GCC 4.8
+# by default. The Eigen incompatibility will be investigated later.
+
+RUN ln -sf gcc-4.8 /usr/bin/gcc && \
+    ln -sf gcc-ar-4.8 /usr/bin/gcc-ar && \
+    ln -sf gcc-nm-4.8 /usr/bin/gcc-nm && \
+    ln -sf gcc-ranlib-4.8 /usr/bin/gcc-ranlib && \
+    ln -sf gcc-4.8 /usr/bin/x86_64-linux-gnu-gcc && \
+    ln -sf gcc-ar-4.8 /usr/bin/x86_64-linux-gnu-gcc-ar && \
+    ln -sf gcc-nm-4.8 /usr/bin/x86_64-linux-gnu-gcc-nm && \
+    ln -sf gcc-ranlib-4.8 /usr/bin/x86_64-linux-gnu-gcc-ranlib && \
+    ln -sf g++-4.8 /usr/bin/g++ && \
+    ln -sf g++-4.8 /usr/bin/x86_64-linux-gnu-g++ 
+
 # Install woboq_codebrowser to /woboq
 RUN git clone https://github.com/woboq/woboq_codebrowser /woboq && \
     (cd /woboq \
diff --git a/cmake/configure.cmake b/cmake/configure.cmake
index 2ac098954647d37e26ac2499e0675dae39910edc..209f9078a637ac581d90212a48216eb388c477ed 100644
--- a/cmake/configure.cmake
+++ b/cmake/configure.cmake
@@ -129,7 +129,7 @@ if(WITH_GOLANG)
     add_custom_command(OUTPUT ${CMAKE_BINARY_DIR}/glide
       COMMAND env GOPATH=${GOPATH} ${GLIDE} install
       COMMAND touch ${CMAKE_BINARY_DIR}/glide
-      DEPENDS ${PROJ_ROOT}/go/glide.lock
+      DEPENDS ${PADDLE_SOURCE_DIR}/go/glide.lock
       WORKING_DIRECTORY "${PADDLE_IN_GOPATH}/go"
       )
 
diff --git a/cmake/cpplint.cmake b/cmake/cpplint.cmake
index 5184f0815faac005b3dff1015395235f4e19d65b..8d5d533126c9b7fa84c725d614cf3486126d0284 100644
--- a/cmake/cpplint.cmake
+++ b/cmake/cpplint.cmake
@@ -52,7 +52,7 @@ macro(add_style_check_target TARGET_NAME)
 
         if(SOURCES_LIST)
             add_custom_command(TARGET ${TARGET_NAME} POST_BUILD
-                COMMAND "${PYTHON_EXECUTABLE}" "${PROJ_ROOT}/paddle/scripts/cpplint.py"
+                COMMAND "${PYTHON_EXECUTABLE}" "${PADDLE_SOURCE_DIR}/paddle/scripts/cpplint.py"
                         "--filter=${STYLE_FILTER}"
                         ${SOURCES_LIST}
                 COMMENT "cpplint: Checking source code style"
diff --git a/cmake/flags.cmake b/cmake/flags.cmake
index e26d8d9df386e65137aa83cc60a43bfeabf7a4a6..b27eb71550b68b5c27e47bf067ae0df329bbd628 100644
--- a/cmake/flags.cmake
+++ b/cmake/flags.cmake
@@ -9,10 +9,12 @@ function(CheckCompilerCXX11Flag)
         if(${CMAKE_CXX_COMPILER_VERSION} VERSION_LESS 4.8)
             message(FATAL_ERROR "Unsupported GCC version. GCC >= 4.8 required.")
         endif()
-        # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
-        # Use Debug mode instead for now.
-        if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9) 
-            set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
+        if(NOT ANDROID)
+            # TODO(qijun) gcc 4.9 or later versions raise SEGV due to the optimization problem.
+            # Use Debug mode instead for now.
+            if(CMAKE_CXX_COMPILER_VERSION VERSION_GREATER 4.9 OR CMAKE_CXX_COMPILER_VERSION VERSION_EQUAL 4.9)
+                set(CMAKE_BUILD_TYPE "Debug" CACHE STRING "" FORCE)
+            endif()
         endif()
     elseif(CMAKE_CXX_COMPILER_ID STREQUAL "AppleClang" OR CMAKE_CXX_COMPILER_ID STREQUAL "Clang")
         # cmake >= 3.0 compiler id "AppleClang" on Mac OS X, otherwise "Clang"
diff --git a/cmake/generic.cmake b/cmake/generic.cmake
index 957c20bcf603f2f264b4658f63ac0eec438f12b1..d2aab938d4636b1583062e27b73cb30f5d56b7b0 100644
--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -411,7 +411,7 @@ function(py_test TARGET_NAME)
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(py_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})  
     add_test(NAME ${TARGET_NAME}
-             COMMAND env PYTHONPATH=${PADDLE_PYTHON_PACKAGE_DIR}
+             COMMAND env PYTHONPATH=${PADDLE_PYTHON_BUILD_DIR}/lib-python
              python2 ${py_test_SRCS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR})
   endif()
diff --git a/cmake/package.cmake b/cmake/package.cmake
index ff49a2d08e8f6004320acfce266339aa301eb9c4..79e02147f3f7cc19c1bf45d8a1d208a9a32416ff 100644
--- a/cmake/package.cmake
+++ b/cmake/package.cmake
@@ -12,7 +12,7 @@ set(CPACK_PACKAGE_DESCRIPTION "")
 set(CPACK_DEBIAN_PACKAGE_DEPENDS "libpython2.7-dev, libstdc++6, python-pip, curl, libgfortran3, python-pip-whl")
 set(CPACK_DEBIAN_PACKAGE_SECTION Devel)
 set(CPACK_DEBIAN_PACKAGE_VERSION ${PADDLE_VERSION})
-set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PROJ_ROOT}/paddle/scripts/deb/postinst")
+set(CPACK_DEBIAN_PACKAGE_CONTROL_EXTRA "${PADDLE_SOURCE_DIR}/paddle/scripts/deb/postinst")
 #set(CPACK_GENERATOR "DEB")
 # Start cpack
 include (CMakePackageConfigHelpers)
diff --git a/cmake/util.cmake b/cmake/util.cmake
index 4a27623b7ffc0b389680baee52db440c78442f46..0da4969d310368ab27b0ed65237813c07d6e59f0 100644
--- a/cmake/util.cmake
+++ b/cmake/util.cmake
@@ -141,8 +141,8 @@ endmacro()
 function(create_resources res_file output_file)
   add_custom_command(
     OUTPUT ${output_file}
-    COMMAND python ARGS ${PROJ_ROOT}/cmake/make_resource.py ${res_file} ${output_file}
-    DEPENDS ${res_file} ${PROJ_ROOT}/cmake/make_resource.py)
+    COMMAND python ARGS ${PADDLE_SOURCE_DIR}/cmake/make_resource.py ${res_file} ${output_file}
+    DEPENDS ${res_file} ${PADDLE_SOURCE_DIR}/cmake/make_resource.py)
 endfunction()
 
 
diff --git a/cmake/version.cmake b/cmake/version.cmake
index ac1583a24c828629c46cb9cf4e965f8da2273732..cde650128a068faf32f4abfff5cdfdeb656d8577 100644
--- a/cmake/version.cmake
+++ b/cmake/version.cmake
@@ -4,7 +4,7 @@ set(tmp_version "HEAD")
 while ("${PADDLE_VERSION}" STREQUAL "")
   execute_process(
     COMMAND ${GIT_EXECUTABLE} describe --tags --abbrev=0 ${tmp_version}
-    WORKING_DIRECTORY ${PROJ_ROOT}
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}
     OUTPUT_VARIABLE GIT_TAG_NAME
     RESULT_VARIABLE GIT_RESULT
     ERROR_QUIET OUTPUT_STRIP_TRAILING_WHITESPACE)
diff --git a/doc/templates/conf.py.cn.in b/doc/templates/conf.py.cn.in
index 673948dfe7928240817b552141ec9bc2f8a672b7..41b35b5b233abd737db07aaeb6c6dd4bf6d42b08 100644
--- a/doc/templates/conf.py.cn.in
+++ b/doc/templates/conf.py.cn.in
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PROJ_ROOT@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle
@@ -24,7 +24,7 @@ AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
+templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
 
@@ -120,7 +120,7 @@ html_theme = 'sphinx_rtd_theme'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['@PROJ_ROOT@/doc_theme/static']
+html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']
 
 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'
diff --git a/doc/templates/conf.py.en.in b/doc/templates/conf.py.en.in
index b6b50b7dcd5647b50a13703160489323ed90a1b4..5822c2481dd61da2084b0de76f6f65aa4e32e033 100644
--- a/doc/templates/conf.py.en.in
+++ b/doc/templates/conf.py.en.in
@@ -13,7 +13,7 @@
 # serve to show the default.
 import sys
 import os, subprocess
-sys.path.insert(0, os.path.abspath('@PROJ_ROOT@/python'))
+sys.path.insert(0, os.path.abspath('@PADDLE_SOURCE_DIR@/python'))
 import shlex
 from recommonmark import parser, transform
 import paddle
@@ -25,7 +25,7 @@ AutoStructify = transform.AutoStructify
 # If extensions (or modules to document with autodoc) are in another directory,
 # add these directories to sys.path here. If the directory is relative to the
 # documentation root, use os.path.abspath to make it absolute, like shown here.
-templates_path = ["@PROJ_ROOT@/doc_theme/templates"]
+templates_path = ["@PADDLE_SOURCE_DIR@/doc_theme/templates"]
 
 # -- General configuration ------------------------------------------------
 
@@ -120,7 +120,7 @@ html_theme = 'sphinx_rtd_theme'
 # Add any paths that contain custom static files (such as style sheets) here,
 # relative to this directory. They are copied after the builtin static files,
 # so a file named "default.css" will overwrite the builtin "default.css".
-html_static_path = ['@PROJ_ROOT@/doc_theme/static']
+html_static_path = ['@PADDLE_SOURCE_DIR@/doc_theme/static']
 
 # Output file base name for HTML help builder.
 htmlhelp_basename = project + 'doc'
diff --git a/go/pserver/client/c/test/test_train.py b/go/pserver/client/c/test/test_train.py
index 572a61e4ccaa9ef3d03a60d916e80eab907c6d88..8d9c6b9b20f515ed0865df8cf46b6dfc2d8ffa34 100644
--- a/go/pserver/client/c/test/test_train.py
+++ b/go/pserver/client/c/test/test_train.py
@@ -17,12 +17,10 @@ def main():
     # network config
     x = paddle.layer.data(name='x', type=paddle.data_type.dense_vector(13))
     y_predict = paddle.layer.fc(input=x,
-                                param_attr=paddle.attr.Param(
-                                    name='w', learning_rate=1e-3),
+                                param_attr=paddle.attr.Param(name='w'),
                                 size=1,
                                 act=paddle.activation.Linear(),
-                                bias_attr=paddle.attr.Param(
-                                    name='b', learning_rate=1e-3))
+                                bias_attr=paddle.attr.Param(name='b'))
     y = paddle.layer.data(name='y', type=paddle.data_type.dense_vector(1))
     cost = paddle.layer.mse_cost(input=y_predict, label=y)
 
diff --git a/paddle/api/CMakeLists.txt b/paddle/api/CMakeLists.txt
index 7a1e8b8b26ac6330c3799b7dfeb4447e171fe0f1..d7b3d2bdec1687425df804c0d56d568241f9e8b0 100644
--- a/paddle/api/CMakeLists.txt
+++ b/paddle/api/CMakeLists.txt
@@ -19,9 +19,9 @@ add_library(paddle_api STATIC ${API_SOURCES})
 add_dependencies(paddle_api paddle_proto paddle_trainer_lib)
 
 INCLUDE(${SWIG_USE_FILE})
-INCLUDE_DIRECTORIES(${PROJ_ROOT}/paddle)
+INCLUDE_DIRECTORIES(${PADDLE_SOURCE_DIR}/paddle)
 
-FILE(GLOB PY_PADDLE_PYTHON_FILES ${PROJ_ROOT}/paddle/py_paddle/*.py)
+FILE(GLOB PY_PADDLE_PYTHON_FILES ${PADDLE_SOURCE_DIR}/paddle/py_paddle/*.py)
 
 SET_SOURCE_FILES_PROPERTIES(Paddle.i PROPERTIES CPLUSPLUS ON)
 
@@ -79,16 +79,16 @@ SWIG_LINK_LIBRARIES(swig_paddle
     ${START_END}
 )
 
-add_custom_command(OUTPUT ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PROJ_ROOT}/paddle/py_paddle
-    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PROJ_ROOT}/paddle/py_paddle
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/swig_paddle.py ${PADDLE_SOURCE_DIR}/paddle/py_paddle
+    COMMAND cp ${CMAKE_CURRENT_BINARY_DIR}/_swig_paddle.so ${PADDLE_SOURCE_DIR}/paddle/py_paddle
     COMMAND ${CMAKE_COMMAND} -E touch .timestamp
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
     DEPENDS _swig_paddle
 )
 
 # TODO(yuyang18) : make wheel name calculated by cmake
-add_custom_target(python_api_wheel ALL DEPENDS ${PROJ_ROOT}/paddle/py_paddle/_swig_paddle.so)
+add_custom_target(python_api_wheel ALL DEPENDS ${PADDLE_SOURCE_DIR}/paddle/py_paddle/_swig_paddle.so)
 
 if(WITH_TESTING)
     IF(NOT PY_PIP_FOUND)
diff --git a/paddle/api/ParameterUpdater.cpp b/paddle/api/ParameterUpdater.cpp
index 5934cb898b5f6adc74c237b1733a7459d8437a28..8cd73b348c507386cd88e907f7b431ef25e793aa 100644
--- a/paddle/api/ParameterUpdater.cpp
+++ b/paddle/api/ParameterUpdater.cpp
@@ -41,7 +41,7 @@ ParameterUpdater *ParameterUpdater::createNewRemoteUpdater(
       config->m->getConfig(), pserverSpec, useEtcd));
   return updater;
 #else
-  throw UnsupportError();
+  throw UnsupportError("not compiled with WITH_GOLANG");
 #endif
 }
 
diff --git a/paddle/capi/Arguments.cpp b/paddle/capi/Arguments.cpp
index 8b81ec69e60399af86f055d2258276ac06e0b13a..1ec403077e7ea0bc8299e6266167b50ed81c3b08 100644
--- a/paddle/capi/Arguments.cpp
+++ b/paddle/capi/Arguments.cpp
@@ -90,6 +90,18 @@ paddle_error paddle_arguments_set_ids(paddle_arguments args,
   return kPD_NO_ERROR;
 }
 
+paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
+                                              uint64_t ID,
+                                              uint64_t frameHeight,
+                                              uint64_t frameWidth) {
+  if (args == nullptr) return kPD_NULLPTR;
+  auto a = castArg(args);
+  if (ID >= a->args.size()) return kPD_OUT_OF_RANGE;
+  a->args[ID].setFrameHeight(frameHeight);
+  a->args[ID].setFrameWidth(frameWidth);
+  return kPD_NO_ERROR;
+}
+
 paddle_error paddle_arguments_set_sequence_start_pos(paddle_arguments args,
                                                      uint64_t ID,
                                                      uint32_t nestedLevel,
diff --git a/paddle/capi/arguments.h b/paddle/capi/arguments.h
index d71ea26a5d1aff130d974541532fda3b09bf6fe5..7c32524a00b11573a037968cf10d6488ea6c5644 100644
--- a/paddle/capi/arguments.h
+++ b/paddle/capi/arguments.h
@@ -111,6 +111,20 @@ PD_API paddle_error paddle_arguments_set_ids(paddle_arguments args,
                                              uint64_t ID,
                                              paddle_ivector ids);
 
+/**
+ * @brief paddle_arguments_set_frame_shape Set the frame shape (height and
+ *        width) of one argument in array, whose index is `ID`.
+ * @param [in] args arguments array
+ * @param [in] ID array index
+ * @param [in] frameHeight maximum height of input images
+ * @param [in] frameWidth maximum width of input images
+ * @return paddle_error
+ */
+PD_API paddle_error paddle_arguments_set_frame_shape(paddle_arguments args,
+                                                     uint64_t ID,
+                                                     uint64_t frameHeight,
+                                                     uint64_t frameWidth);
+
 /**
  * @brief PDArgsSetSequenceStartPos Set sequence start position vector of one
  *        argument in array, which index is `ID`.
diff --git a/paddle/capi/examples/model_inference/common/common.h b/paddle/capi/examples/model_inference/common/common.h
index a78522e4a7c3cb34b341b7f4c89b53d32b72f114..e32f2f9836f63ba10ef5be447a4c41514e079219 100644
--- a/paddle/capi/examples/model_inference/common/common.h
+++ b/paddle/capi/examples/model_inference/common/common.h
@@ -3,18 +3,21 @@
 #include <stdio.h>
 #include <stdlib.h>
 
-#define CHECK(stmt)                                                \
-  do {                                                             \
-    paddle_error __err__ = stmt;                                   \
-    if (__err__ != kPD_NO_ERROR) {                                 \
-      fprintf(stderr, "Invoke paddle error %d \n" #stmt, __err__); \
-      exit(__err__);                                               \
-    }                                                              \
+#define CHECK(stmt)                                                      \
+  do {                                                                   \
+    paddle_error __err__ = stmt;                                         \
+    if (__err__ != kPD_NO_ERROR) {                                       \
+      fprintf(stderr, "Invoke paddle error %d in " #stmt "\n", __err__); \
+      exit(__err__);                                                     \
+    }                                                                    \
   } while (0)
 
 void* read_config(const char* filename, long* size) {
   FILE* file = fopen(filename, "r");
-  if (file == NULL) return NULL;
+  if (file == NULL) {
+    fprintf(stderr, "Open %s error\n", filename);
+    return NULL;
+  }
   fseek(file, 0L, SEEK_END);
   *size = ftell(file);
   fseek(file, 0L, SEEK_SET);
diff --git a/paddle/capi/gradient_machine.cpp b/paddle/capi/gradient_machine.cpp
index 00f76e0152366834eafc22df710cf3d6c7b8471f..b3287552db87d25edbf6e7f3d5e68121df49e9d6 100644
--- a/paddle/capi/gradient_machine.cpp
+++ b/paddle/capi/gradient_machine.cpp
@@ -54,6 +54,31 @@ paddle_error paddle_gradient_machine_create_for_inference(
   return kPD_NO_ERROR;
 }
 
+paddle_error paddle_gradient_machine_create_for_inference_with_parameters(
+    paddle_gradient_machine* machine, void* mergedModel, uint64_t size) {
+  if (mergedModel == nullptr) return kPD_NULLPTR;
+  std::istringstream is(std::string(static_cast<char*>(mergedModel), size));
+  int64_t modelConfigSize = 0;
+  is.read((char*)(&modelConfigSize), sizeof(modelConfigSize));
+  std::string modelConfigProtobuf;
+  modelConfigProtobuf.resize(modelConfigSize);
+  is.read(&modelConfigProtobuf[0], modelConfigSize);
+  paddle::TrainerConfig config;
+  if (!config.ParseFromString(modelConfigProtobuf) || !config.IsInitialized()) {
+    return kPD_PROTOBUF_ERROR;
+  }
+  auto ptr = new paddle::capi::CGradientMachine();
+  ptr->machine.reset(paddle::GradientMachine::create(
+      config.model_config(), CREATE_MODE_TESTING, {paddle::PARAMETER_VALUE}));
+  std::vector<paddle::ParameterPtr>& parameters = ptr->machine->getParameters();
+  for (auto& para : parameters) {
+    para->load(is);
+  }
+
+  *machine = ptr;
+  return kPD_NO_ERROR;
+}
+
 paddle_error paddle_gradient_machine_destroy(paddle_gradient_machine machine) {
   delete cast(machine);
   return kPD_NO_ERROR;
diff --git a/paddle/capi/gradient_machine.h b/paddle/capi/gradient_machine.h
index d7e2dd9bf8037ed474971624d4518160604abe4d..c613ade5b24efbbf52f21c7ee86dd3189981c5ef 100644
--- a/paddle/capi/gradient_machine.h
+++ b/paddle/capi/gradient_machine.h
@@ -36,6 +36,18 @@ typedef void* paddle_gradient_machine;
 PD_API paddle_error paddle_gradient_machine_create_for_inference(
     paddle_gradient_machine* machine, void* modelConfigProtobuf, int size);
 
+/**
+ * @brief Create a gradient machine used for model inference from a merged
+ *        model (config plus parameters) generated by `paddle merge_model`.
+ * @param [out] machine the gradient machine used for model inference.
+ * @param [in] mergedModel buffer holding the merged model content.
+ * @param [in] size length of the mergedModel buffer, in bytes.
+ * @return paddle_error
+ */
+PD_API paddle_error
+paddle_gradient_machine_create_for_inference_with_parameters(
+    paddle_gradient_machine* machine, void* mergedModel, uint64_t size);
+
 /**
  * @brief Load parameter from disk.
  * @param machine Gradient Machine.
diff --git a/paddle/capi/tests/CMakeLists.txt b/paddle/capi/tests/CMakeLists.txt
index d73f6b7733950bd472a46afb21694aac943fc909..8208808b94f54f2ddaf4d426a65b8db562b36aca 100644
--- a/paddle/capi/tests/CMakeLists.txt
+++ b/paddle/capi/tests/CMakeLists.txt
@@ -10,5 +10,5 @@ target_include_directories(capi_test_gradientMachine PUBLIC
   ${PADDLE_CAPI_INC_PATH})
 target_link_libraries(capi_test_gradientMachine paddle_capi)
 add_test(NAME capi_test_gradientMachine
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
-  WORKING_DIRECTORY ${PROJ_ROOT}/paddle/capi/tests)
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/capi_test_gradientMachine
+  WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/capi/tests)
diff --git a/paddle/framework/CMakeLists.txt b/paddle/framework/CMakeLists.txt
index f6df89369c52797f7269c41f635756582fadbc47..9e98afb3119856776ef80e502379460b8c019d40 100644
--- a/paddle/framework/CMakeLists.txt
+++ b/paddle/framework/CMakeLists.txt
@@ -35,6 +35,11 @@ py_proto_compile(framework_py_proto SRCS attribute.proto op_proto.proto op_desc.
 # Generate an empty __init__.py to make framework_py_proto as a valid python module.
 add_custom_target(framework_py_proto_init ALL COMMAND ${CMAKE_COMMAND} -E touch __init__.py)
 add_dependencies(framework_py_proto framework_py_proto_init)
+add_custom_command(TARGET framework_py_proto POST_BUILD
+    COMMAND ${CMAKE_COMMAND} -E make_directory ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto
+    COMMAND cp *.py ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/proto/
+    COMMENT "Copy generated python proto into directory paddle/v2/framework/proto."
+    WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 
 cc_library(backward SRCS backward.cc DEPS net_op)
 cc_test(backward_test SRCS backward_test.cc DEPS backward)
@@ -43,9 +48,12 @@ if(WITH_PYTHON)
 cc_library(paddle_pybind SHARED
     SRCS pybind.cc
     DEPS pybind python backward
-    fc_op
     sgd_op
     add_op
+    mul_op
+    rowwise_add_op
+    sigmoid_op
+    softmax_op
     mean_op
     cross_entropy_op
     recurrent_op
diff --git a/paddle/framework/attribute.h b/paddle/framework/attribute.h
index 3a5820e9c60539e3c771df5da4e82f6c1cae688f..49a62bedb6aadab5ff05d8aa7dda42fe983314a0 100644
--- a/paddle/framework/attribute.h
+++ b/paddle/framework/attribute.h
@@ -14,7 +14,6 @@ limitations under the License. */
 
 #pragma once
 
-#include <boost/variant.hpp>
 #include <functional>
 #include <string>
 #include <unordered_map>
@@ -24,6 +23,7 @@ limitations under the License. */
 #include "paddle/framework/attribute.pb.h"
 #include "paddle/framework/op_desc.pb.h"
 #include "paddle/platform/enforce.h"
+#include "paddle/platform/variant.h"
 
 namespace paddle {
 namespace framework {
diff --git a/paddle/framework/ddim.h b/paddle/framework/ddim.h
index 5aa5af0c19be5a209c760282cb1a090fc57a53ad..b2d9fec047109fc53d281e2fbe9d2594a65a9201 100644
--- a/paddle/framework/ddim.h
+++ b/paddle/framework/ddim.h
@@ -14,12 +14,12 @@ limitations under the License. */
 
 #pragma once
 
-#include <boost/variant.hpp>
 #include <initializer_list>
 #include <stdexcept>
 #include <vector>
 #include "paddle/framework/dim.h"
 #include "paddle/platform/enforce.h"
+#include "paddle/platform/variant.h"
 #include "unsupported/Eigen/CXX11/Tensor"
 
 namespace paddle {
diff --git a/paddle/framework/operator.h b/paddle/framework/operator.h
index 8949baf60e80d9802693cb4b28c99bb3c258c79c..50fc6d10134e26043c73e32e90b97cd7d40f01d2 100644
--- a/paddle/framework/operator.h
+++ b/paddle/framework/operator.h
@@ -15,7 +15,6 @@ limitations under the License. */
 #pragma once
 
 #include <algorithm>
-#include <boost/variant.hpp>
 #include <string>
 #include <unordered_map>
 #include <vector>
@@ -27,6 +26,7 @@ limitations under the License. */
 #include "paddle/framework/tensor.h"
 #include "paddle/platform/device_context.h"
 #include "paddle/platform/place.h"
+#include "paddle/platform/variant.h"
 #include "paddle/utils/Error.h"
 
 namespace paddle {
diff --git a/paddle/framework/pybind.cc b/paddle/framework/pybind.cc
index c18d38d2f9a146e40a6b3de8c46b453d79c7c11c..75cd5bcb38e1d864358314c1c15b6fb59e9c3752 100644
--- a/paddle/framework/pybind.cc
+++ b/paddle/framework/pybind.cc
@@ -31,7 +31,6 @@ namespace py = pybind11;
 
 USE_OP(add_two);
 USE_CPU_OP(onehot_cross_entropy);
-USE_OP_ITSELF(fc);
 USE_NO_GRAD_OP(sgd);
 USE_OP(mul);
 USE_OP(mean);
diff --git a/paddle/framework/tensor.h b/paddle/framework/tensor.h
index b57958591fb752132407c35958db0781d0e023f0..cd1b4de426a49fa66dbbf8cf7d09990ac8d21227 100644
--- a/paddle/framework/tensor.h
+++ b/paddle/framework/tensor.h
@@ -79,11 +79,11 @@ class Tensor {
   inline const DDim& dims() const;
 
   /*! Resize the dimensions of the memory block. */
-  inline void Resize(const DDim& dims);
+  inline Tensor& Resize(const DDim& dims);
 
   /*! The internal of two tensors share the same memory block. */
   template <typename T>
-  inline void ShareDataWith(const Tensor& src);
+  inline Tensor& ShareDataWith(const Tensor& src);
 
   /**
    * @brief   Copy the content of external tensor to a new place.
diff --git a/paddle/framework/tensor_impl.h b/paddle/framework/tensor_impl.h
index 8d9bec6dc9c3f0af822a0d8cd8588dc932970652..7d7263b899afb7a2128548f264065a8013b6f0c9 100644
--- a/paddle/framework/tensor_impl.h
+++ b/paddle/framework/tensor_impl.h
@@ -23,9 +23,11 @@ template <typename T>
 inline void Tensor::check_memory_size() const {
   PADDLE_ENFORCE_NOT_NULL(
       holder_, "Tenosr holds no memory. Call Tensor::mutable_data first.");
-  PADDLE_ENFORCE_GE(holder_->size(), product(dims_) * sizeof(T) + offset_,
-                    "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
-                    "first to re-allocate memory.");
+  PADDLE_ENFORCE_GE(
+      holder_->size(), product(dims_) * sizeof(T) + offset_,
+      "Tensor's dims_ is out of bound. Call Tensor::mutable_data "
+      "first to re-allocate memory.\n"
+      "or maybe the required data-type mismatches the data already stored.");
 }
 
 template <typename T>
@@ -78,9 +80,10 @@ inline T* Tensor::mutable_data(platform::Place place) {
 }
 
 template <typename T>
-inline void Tensor::ShareDataWith(const Tensor& src) {
+inline Tensor& Tensor::ShareDataWith(const Tensor& src) {
   src.check_memory_size<T>();
   *this = src;
+  return *this;
 }
 
 template <typename T>
@@ -136,7 +139,10 @@ inline Tensor Tensor::Slice(const int& begin_idx, const int& end_idx) const {
   return dst;
 }
 
-inline void Tensor::Resize(const DDim& dims) { dims_ = dims; }
+inline Tensor& Tensor::Resize(const DDim& dims) {
+  dims_ = dims;
+  return *this;
+}
 
 inline const DDim& Tensor::dims() const { return dims_; }
 
diff --git a/paddle/function/CMakeLists.txt b/paddle/function/CMakeLists.txt
index 93304f73037690b5cf3ac8189aabc28f51316a77..7dfb6f61c50959f7269725a00dbc4f9c27474bdf 100644
--- a/paddle/function/CMakeLists.txt
+++ b/paddle/function/CMakeLists.txt
@@ -38,10 +38,11 @@ if(WITH_GPU)
     add_simple_unittest(RowConvOpTest)
     add_simple_unittest(BlockExpandOpTest)
     add_simple_unittest(CropOpTest)
+    add_simple_unittest(DepthwiseConvOpTest)
 endif()
 
-add_simple_unittest(ConvOpTest)
 add_simple_unittest(Im2ColTest)
+add_simple_unittest(GemmConvOpTest)
 endif()
 
 add_style_check_target(paddle_function ${h_files})
diff --git a/paddle/function/ConvOpTest.cpp b/paddle/function/ConvOpTest.cpp
deleted file mode 100644
index 7f32c734791853a8cd0287a80a7955dbd1bd7571..0000000000000000000000000000000000000000
--- a/paddle/function/ConvOpTest.cpp
+++ /dev/null
@@ -1,306 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
-
-#include <gtest/gtest.h>
-#include <memory>
-#include "Function.h"
-#include "FunctionTest.h"
-
-namespace paddle {
-
-enum TestType {
-  kForwardTest = 0,
-  kBackwardInputTest = 1,
-  kBackwardFilterTest = 2,
-};
-
-template <DeviceType DType1, DeviceType DType2>
-class ConvolutionTest {
-public:
-  ConvolutionTest(const std::string& conv1,
-                  const std::string& conv2,
-                  TestType type,
-                  bool useGroups = true,
-                  std::string algo = "auto") {
-    for (size_t batchSize : {1, 32}) {
-      for (size_t inputSize : {7, 14, 54}) {
-        for (size_t filterSize : {1, 3, 5}) {
-          for (size_t inputChannels : {3, 64}) {
-            for (size_t outputChannels : {3, 64}) {
-              if (inputChannels > outputChannels) break;
-              size_t groups;
-              if (!useGroups) {
-                groups = 1;
-              } else {
-                if (outputChannels % inputChannels != 0) continue;
-                groups = inputChannels;
-              }
-
-              for (size_t stride : {1, 2}) {
-                for (size_t padding : {0, 1}) {
-                  if (padding >= filterSize) break;
-                  size_t outputSize =
-                      (inputSize - filterSize + 2 * padding + stride) / stride;
-                  VLOG(3) << " batchSize=" << batchSize
-                          << " inputChannels=" << inputChannels
-                          << " inputHeight=" << inputSize
-                          << " inputWidth=" << inputSize
-                          << " outputChannels=" << outputChannels
-                          << " filterHeight=" << filterSize
-                          << " filterWidth=" << filterSize
-                          << " outputHeight=" << outputSize
-                          << " outputWidth=" << outputSize
-                          << " stride=" << stride << " padding=" << padding;
-
-                  std::vector<size_t> paddings = {padding, padding};
-                  std::vector<size_t> strides = {stride, stride};
-                  Compare2Function<DType1, DType2> test(
-                      conv1,
-                      conv2,
-                      FuncConfig()
-                          .set("paddings", paddings)
-                          .set("strides", strides)
-                          .set("groups", groups)
-                          .set("algo", algo));
-
-                  TensorShape input{
-                      batchSize, inputChannels, inputSize, inputSize};
-
-                  TensorShape filter;
-                  if (groups > 1)
-                    filter = TensorShape({groups,
-                                          outputChannels / groups,
-                                          inputChannels / groups,
-                                          filterSize,
-                                          filterSize});
-                  else
-                    filter = TensorShape({outputChannels,
-                                          inputChannels,
-                                          filterSize,
-                                          filterSize});
-                  TensorShape output{
-                      batchSize, outputChannels, outputSize, outputSize};
-
-                  if (type == kForwardTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.run();
-                  } else if (type == kBackwardInputTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
-                    test.run();
-                  } else if (type == kBackwardFilterTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
-                                    ADD_TO);
-                    test.run();
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-// Mainly used to test cases where the height and width (input, filter)
-// are not equal.
-template <DeviceType DType1, DeviceType DType2>
-class ConvolutionTest2 {
-public:
-  ConvolutionTest2(const std::string& conv1,
-                   const std::string& conv2,
-                   TestType type,
-                   bool useGroups = true,
-                   std::string algo = "auto") {
-    for (size_t batchSize : {16}) {
-      for (size_t inputHeight : {7, 31}) {
-        for (size_t inputWidth : {10, 54}) {
-          for (size_t filterHeight : {1, 5}) {
-            for (size_t filterWidth : {3, 7}) {
-              for (size_t inputChannels : {7}) {
-                for (size_t outputChannels : {7}) {
-                  size_t groups;
-                  if (!useGroups) {
-                    groups = 1;
-                  } else {
-                    if (outputChannels % inputChannels != 0) continue;
-                    groups = inputChannels;
-                  }
-
-                  size_t stride = 1;
-                  size_t padding = 0;
-                  size_t outputHeight =
-                      (inputHeight - filterHeight + 2 * padding + stride) /
-                      stride;
-                  size_t outputWidth =
-                      (inputWidth - filterWidth + 2 * padding + stride) /
-                      stride;
-                  VLOG(3) << " batchSize=" << batchSize
-                          << " inputChannels=" << inputChannels
-                          << " inputHeight=" << inputHeight
-                          << " inputWidth=" << inputWidth
-                          << " outputChannels=" << outputChannels
-                          << " filterHeight=" << filterHeight
-                          << " filterWidth=" << filterWidth
-                          << " outputHeight=" << outputHeight
-                          << " outputWidth=" << outputWidth
-                          << " stride=" << stride << " padding=" << padding;
-
-                  std::vector<size_t> paddings = {padding, padding};
-                  std::vector<size_t> strides = {stride, stride};
-                  Compare2Function<DType1, DType2> test(
-                      conv1,
-                      conv2,
-                      FuncConfig()
-                          .set("paddings", paddings)
-                          .set("strides", strides)
-                          .set("groups", groups)
-                          .set("algo", algo));
-
-                  TensorShape input{
-                      batchSize, inputChannels, inputHeight, inputWidth};
-
-                  TensorShape filter;
-                  if (groups > 1)
-                    filter = TensorShape({groups,
-                                          outputChannels / groups,
-                                          inputChannels / groups,
-                                          filterHeight,
-                                          filterWidth});
-                  else
-                    filter = TensorShape({outputChannels,
-                                          inputChannels,
-                                          filterHeight,
-                                          filterWidth});
-                  TensorShape output{
-                      batchSize, outputChannels, outputHeight, outputWidth};
-
-                  if (type == kForwardTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.run();
-                  } else if (type == kBackwardInputTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
-                    test.run();
-                  } else if (type == kBackwardFilterTest) {
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
-                    test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
-                    test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter),
-                                    ADD_TO);
-                    test.run();
-                  }
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
-
-// ======Start Convolution TEST======
-
-TEST(Forward, GEMM) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test2(
-      "NaiveConv-CPU", "GemmConv-CPU", kForwardTest, false);
-}
-
-#ifndef PADDLE_ONLY_CPU
-TEST(Forward, GEMM2) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConv-CPU", "GemmConv-GPU", kForwardTest, false);
-}
-
-TEST(BackwardInput, GEMM) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradInput-CPU",
-      "GemmConvGradInput-GPU",
-      kBackwardInputTest,
-      false);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradInput-CPU",
-      "GemmConvGradInput-GPU",
-      kBackwardInputTest,
-      false);
-}
-
-TEST(BackwardFilter, GEMM) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradFilter-CPU",
-      "GemmConvGradFilter-GPU",
-      kBackwardFilterTest,
-      false);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradFilter-CPU",
-      "GemmConvGradFilter-GPU",
-      kBackwardFilterTest,
-      false);
-}
-#endif
-// ======End Convolution TEST======
-
-// ======Start DepthwiseConvolution TEST======
-
-// TODO(zhaolong) The depthwise convolution cpu test will be added when the cpu
-// version of depthwiseConv is implemented.
-
-#ifndef PADDLE_ONLY_CPU
-
-TEST(DepthwiseConvForward, GEMM2) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConv-CPU", "DepthwiseConv-GPU", kForwardTest);
-}
-
-TEST(DepthwiseConvBackwardInput, GEMM) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradInput-CPU",
-      "DepthwiseConvGradInput-GPU",
-      kBackwardInputTest);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradInput-CPU",
-      "DepthwiseConvGradInput-GPU",
-      kBackwardInputTest);
-}
-
-TEST(DepthwiseConvBackwardFilter, GEMM) {
-  ConvolutionTest<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test(
-      "GemmConvGradFilter-CPU",
-      "DepthwiseConvGradFilter-GPU",
-      kBackwardFilterTest);
-  ConvolutionTest2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU> test2(
-      "GemmConvGradFilter-CPU",
-      "DepthwiseConvGradFilter-GPU",
-      kBackwardFilterTest);
-}
-
-#endif
-// ======End DepthwiseConvolution TEST======
-
-}  // namespace paddle
diff --git a/paddle/function/ConvOpTest.h b/paddle/function/ConvOpTest.h
new file mode 100644
index 0000000000000000000000000000000000000000..cb02a96d0dbef6f64fd9e7576179572e68bf5513
--- /dev/null
+++ b/paddle/function/ConvOpTest.h
@@ -0,0 +1,258 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "FunctionTest.h"
+
+namespace paddle {
+
+template <DeviceType DType1, DeviceType DType2>
+void forward(Compare2Function<DType1, DType2>& test,
+             const TensorShape& input,
+             const TensorShape& filter,
+             const TensorShape& output) {
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, output));
+  test.run();
+}
+
+template <DeviceType DType1, DeviceType DType2>
+void backward_input(Compare2Function<DType1, DType2>& test,
+                    const TensorShape& input,
+                    const TensorShape& filter,
+                    const TensorShape& output) {
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, filter));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, input), ADD_TO);
+  test.run();
+}
+
+template <DeviceType DType1, DeviceType DType2>
+void backward_filter(Compare2Function<DType1, DType2>& test,
+                     const TensorShape& input,
+                     const TensorShape& filter,
+                     const TensorShape& output) {
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, output));
+  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, input));
+  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, filter), ADD_TO);
+  test.run();
+}
+
+template <DeviceType DType1, DeviceType DType2>
+using Function = void (*)(Compare2Function<DType1, DType2>& test,
+                          const TensorShape& input,
+                          const TensorShape& filter,
+                          const TensorShape& output);
+
+/**
+ * \brief A basic convolution function test interface.
+ *
+ * \param conv1         type name of convolution function 1.
+ * \param conv2         type name of convolution function 2.
+ * \param function      test function; one of forward, backward_input,
+ *                      or backward_filter.
+ * Example:
+ * 1. Compare GemmConv's CPU and GPU implementations:
+ *   Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+ *      "GemmConv-CPU", "GemmConv-GPU", forward);
+ */
+template <DeviceType DType1, DeviceType DType2>
+void Convolution(const std::string& conv1,
+                 const std::string& conv2,
+                 Function<DType1, DType2> function) {
+  for (size_t batchSize : {1, 5}) {
+    for (size_t inputSize : {7, 14, 31}) {
+      for (size_t filterSize : {1, 3, 5}) {
+        for (size_t inputChannels : {3, 16}) {
+          for (size_t outputChannels : {3, 16}) {
+            if (outputChannels < inputChannels) continue;
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                if (padding >= filterSize) break;
+
+                // NNPACK only supports stride = 1 if batchSize > 1
+                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
+                    batchSize > 1 && stride > 1)
+                  break;
+
+                size_t outputSize =
+                    (inputSize - filterSize + 2 * padding + stride) / stride;
+                VLOG(3) << " batchSize=" << batchSize
+                        << " inputChannels=" << inputChannels
+                        << " inputHeight=" << inputSize
+                        << " inputWidth=" << inputSize
+                        << " outputChannels=" << outputChannels
+                        << " filterHeight=" << filterSize
+                        << " filterWidth=" << filterSize
+                        << " outputHeight=" << outputSize
+                        << " outputWidth=" << outputSize << " stride=" << stride
+                        << " padding=" << padding;
+
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> strides = {stride, stride};
+                Compare2Function<DType1, DType2> test(
+                    conv1,
+                    conv2,
+                    FuncConfig()
+                        .set("paddings", paddings)
+                        .set("strides", strides)
+                        .set("groups", (size_t)1)
+                        .set("algo", (std::string) "auto"));
+
+                TensorShape input{
+                    batchSize, inputChannels, inputSize, inputSize};
+                TensorShape filter{
+                    outputChannels, inputChannels, filterSize, filterSize};
+                TensorShape output{
+                    batchSize, outputChannels, outputSize, outputSize};
+
+                function(test, input, filter, output);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief A convolution function test interface for the case where
+ *        the image height is not equal to the image width.
+ */
+template <DeviceType DType1, DeviceType DType2>
+void Convolution2(const std::string& conv1,
+                  const std::string& conv2,
+                  Function<DType1, DType2> function) {
+  for (size_t batchSize : {4}) {
+    for (size_t inputHeight : {7, 31}) {
+      for (size_t inputWidth : {10, 54}) {
+        for (size_t filterHeight : {1, 5}) {
+          for (size_t filterWidth : {3, 7}) {
+            for (size_t inputChannels : {7}) {
+              for (size_t outputChannels : {7}) {
+                size_t stride = 1;
+                size_t padding = 0;
+                size_t outputHeight =
+                    (inputHeight - filterHeight + 2 * padding + stride) /
+                    stride;
+                size_t outputWidth =
+                    (inputWidth - filterWidth + 2 * padding + stride) / stride;
+                VLOG(3) << " batchSize=" << batchSize
+                        << " inputChannels=" << inputChannels
+                        << " inputHeight=" << inputHeight
+                        << " inputWidth=" << inputWidth
+                        << " outputChannels=" << outputChannels
+                        << " filterHeight=" << filterHeight
+                        << " filterWidth=" << filterWidth
+                        << " outputHeight=" << outputHeight
+                        << " outputWidth=" << outputWidth
+                        << " stride=" << stride << " padding=" << padding;
+
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> strides = {stride, stride};
+                Compare2Function<DType1, DType2> test(
+                    conv1,
+                    conv2,
+                    FuncConfig()
+                        .set("paddings", paddings)
+                        .set("strides", strides)
+                        .set("groups", (size_t)1)
+                        .set("algo", (std::string) "auto"));
+
+                TensorShape input{
+                    batchSize, inputChannels, inputHeight, inputWidth};
+                TensorShape filter{
+                    outputChannels, inputChannels, filterHeight, filterWidth};
+                TensorShape output{
+                    batchSize, outputChannels, outputHeight, outputWidth};
+
+                function(test, input, filter, output);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+/**
+ * \brief A convolution function test interface for depthwise convolution.
+ */
+template <DeviceType DType1, DeviceType DType2>
+void DepthwiseConvolution(const std::string& conv1,
+                          const std::string& conv2,
+                          Function<DType1, DType2> function) {
+  for (size_t batchSize : {1, 32}) {
+    for (size_t inputSize : {7, 14, 54}) {
+      for (size_t filterSize : {3, 4}) {
+        for (size_t inputChannels : {32}) {
+          for (size_t outputChannels : {32, 64}) {
+            for (size_t stride : {1, 2}) {
+              for (size_t padding : {0, 1}) {
+                // NNPACK only supports stride = 1 if batchSize > 1,
+                // and there is a bug when batchSize > 1 and groups != 1.
+                if ((conv1 == "NNPACKConv-CPU" || conv2 == "NNPACKConv-CPU") &&
+                    batchSize > 1)
+                  break;
+
+                size_t outputSize =
+                    (inputSize - filterSize + 2 * padding + stride) / stride;
+                VLOG(3) << " batchSize=" << batchSize
+                        << " inputChannels=" << inputChannels
+                        << " inputHeight=" << inputSize
+                        << " inputWidth=" << inputSize
+                        << " outputChannels=" << outputChannels
+                        << " filterHeight=" << filterSize
+                        << " filterWidth=" << filterSize
+                        << " outputHeight=" << outputSize
+                        << " outputWidth=" << outputSize << " stride=" << stride
+                        << " padding=" << padding;
+
+                std::vector<size_t> paddings = {padding, padding};
+                std::vector<size_t> strides = {stride, stride};
+                size_t groups = inputChannels;
+                Compare2Function<DType1, DType2> test(
+                    conv1,
+                    conv2,
+                    FuncConfig()
+                        .set("paddings", paddings)
+                        .set("strides", strides)
+                        .set("groups", groups)
+                        .set("algo", (std::string) "auto"));
+
+                TensorShape input{
+                    batchSize, inputChannels, inputSize, inputSize};
+                TensorShape filter{groups,
+                                   outputChannels / groups,
+                                   inputChannels / groups,
+                                   filterSize,
+                                   filterSize};
+                TensorShape output{
+                    batchSize, outputChannels, outputSize, outputSize};
+
+                function(test, input, filter, output);
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+
+}  // namespace paddle
diff --git a/paddle/function/DepthwiseConvOpTest.cpp b/paddle/function/DepthwiseConvOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..f44ae0c342e9536366e2b537694cee81fcb1a6ed
--- /dev/null
+++ b/paddle/function/DepthwiseConvOpTest.cpp
@@ -0,0 +1,37 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ConvOpTest.h"
+
+namespace paddle {
+
+#ifndef PADDLE_ONLY_CPU
+TEST(DepthwiseConv, Forward) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConv-CPU", "DepthwiseConv-GPU", forward);
+}
+
+TEST(DepthwiseConv, BackwardInput) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradInput-CPU", "DepthwiseConvGradInput-GPU", backward_input);
+}
+
+TEST(DepthwiseConv, BackwardFilter) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradFilter-CPU", "DepthwiseConvGradFilter-GPU", backward_filter);
+}
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/GemmConvOpTest.cpp b/paddle/function/GemmConvOpTest.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..5283d79a5a53d979ae4e134f7e46b7ee106e9c44
--- /dev/null
+++ b/paddle/function/GemmConvOpTest.cpp
@@ -0,0 +1,50 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include "ConvOpTest.h"
+
+namespace paddle {
+
+TEST(GemmConv, NaiveConv) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "NaiveConv-CPU", "GemmConv-CPU", forward);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "NaiveConv-CPU", "GemmConv-CPU", forward);
+}
+
+#ifndef PADDLE_ONLY_CPU
+TEST(GemmConv, Forward) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConv-CPU", "GemmConv-GPU", forward);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConv-CPU", "GemmConv-GPU", forward);
+}
+
+TEST(GemmConv, BackwardInput) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradInput-CPU", "GemmConvGradInput-GPU", backward_input);
+}
+
+TEST(GemmConv, BackwardFilter) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter);
+  Convolution2<DEVICE_TYPE_CPU, DEVICE_TYPE_GPU>(
+      "GemmConvGradFilter-CPU", "GemmConvGradFilter-GPU", backward_filter);
+}
+#endif
+
+}  // namespace paddle
diff --git a/paddle/function/nnpack/NNPACKConvOp.cpp b/paddle/function/nnpack/NNPACKConvOp.cpp
index 00d048eb216baf37c875c870a31cfd55a97f2974..6ccc487cf1c26b181b025cc62c93807c8a2848ef 100644
--- a/paddle/function/nnpack/NNPACKConvOp.cpp
+++ b/paddle/function/nnpack/NNPACKConvOp.cpp
@@ -196,30 +196,30 @@ public:
         CHECK_EQ(status, nnp_status_success);
       }
     } else {
-      for (size_t g = 0; g < groups_; g++) {
-        // only supports stride = 1
-        CHECK_EQ(strideH(), 1);
-        CHECK_EQ(strideW(), 1);
-        nnp_status status =
-            nnp_convolution_output(algorithm_,
-                                   batchSize,
-                                   inputChannels / groups_,
-                                   outputChannels / groups_,
-                                   inputSize,
-                                   padding,
-                                   kernelSize,
-                                   inputData + inputOffset * g,
-                                   filterData + filterOffset * g,
-                                   nullptr, /* bias */
-                                   outputData + outputOffset * g,
-                                   bufferPtr,
-                                   sizePtr,
-                                   nnp_activation_identity,
-                                   nullptr,
-                                   threadpool_, /* threadpool */
-                                   nullptr);
-        CHECK_EQ(status, nnp_status_success);
-      }
+      // only supports stride = 1
+      CHECK_EQ(strideH(), 1);
+      CHECK_EQ(strideW(), 1);
+
+      // TODO(hedaoyuan): There is a bug when batchSize > 1 and groups_ > 1.
+      CHECK_EQ(groups_, static_cast<size_t>(1));
+      nnp_status status = nnp_convolution_output(algorithm_,
+                                                 batchSize,
+                                                 inputChannels,
+                                                 outputChannels,
+                                                 inputSize,
+                                                 padding,
+                                                 kernelSize,
+                                                 inputData,
+                                                 filterData,
+                                                 nullptr, /* bias */
+                                                 outputData,
+                                                 bufferPtr,
+                                                 sizePtr,
+                                                 nnp_activation_identity,
+                                                 nullptr,
+                                                 threadpool_, /* threadpool */
+                                                 nullptr);
+      CHECK_EQ(status, nnp_status_success);
     }
   }
 
diff --git a/paddle/function/nnpack/NNPACKConvOpTest.cpp b/paddle/function/nnpack/NNPACKConvOpTest.cpp
index 48180112111c67f36ddd425008187201655089c9..4dd3982487f3567f461ddaea8c5dc719fff04736 100644
--- a/paddle/function/nnpack/NNPACKConvOpTest.cpp
+++ b/paddle/function/nnpack/NNPACKConvOpTest.cpp
@@ -13,87 +13,18 @@ See the License for the specific language governing permissions and
 limitations under the License. */
 
 #include <gtest/gtest.h>
-#include "paddle/function/Function.h"
-#include "paddle/function/FunctionTest.h"
-
-DEFINE_string(algo,
-              "auto",
-              "The algorithm (auto, ft8x8, ft16x16, wt8x8, "
-              "implicit-gemm, or direct) for computing convolution of NNPACK.");
+#include "paddle/function/ConvOpTest.h"
 
 namespace paddle {
 
-#define IS_NNPACK_SUPPORT(algo, filterSize, stride)        \
-  if (algo == "direct" && filterSize != 1) continue;       \
-  if (algo == "direct" && batchSize != 1) continue;        \
-  if (algo == "wt8x8" && filterSize != 3) continue;        \
-  if (algo == "implicit-gemm" && batchSize != 1) continue; \
-  if (algo != "auto" && algo != "implicit-gemm" && stride > 1) continue;
-
-class ConvolutionTest {
-public:
-  ConvolutionTest(const std::string& conv1,
-                  const std::string& conv2,
-                  std::string algo = "auto") {
-    for (size_t batchSize : {1, 32}) {
-      for (size_t inputSize : {7, 14, 54}) {
-        for (size_t filterSize : {1, 3, 5}) {
-          for (size_t inputChannels : {3, 64}) {
-            for (size_t outputChannels : {3, 64, 128}) {
-              if (inputChannels < outputChannels) break;
-              for (size_t stride : {1, 2}) {
-                // if batchSize > 1 NNPACKConv only supports stride = 1
-                if (batchSize > 1 && stride > 1) break;
-                for (size_t padding : {0, 1}) {
-                  if (padding >= filterSize) break;
-                  size_t outputSize =
-                      (inputSize - filterSize + 2 * padding + stride) / stride;
-                  IS_NNPACK_SUPPORT(algo, filterSize, stride);
-                  LOG(INFO) << " batchSize=" << batchSize
-                            << " inputChannels=" << inputChannels
-                            << " inputHeight=" << inputSize
-                            << " inputWidth=" << inputSize
-                            << " outputChannels=" << outputChannels
-                            << " filterHeight=" << filterSize
-                            << " filterWidth=" << filterSize
-                            << " outputHeight=" << outputSize
-                            << " outputWidth=" << outputSize
-                            << " stride=" << stride << " padding=" << padding;
-
-                  std::vector<size_t> paddings = {padding, padding};
-                  std::vector<size_t> strides = {stride, stride};
-                  Compare2Function<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU> test(
-                      conv1,
-                      conv2,
-                      FuncConfig()
-                          .set("paddings", paddings)
-                          .set("strides", strides)
-                          .set("groups", (size_t)1)
-                          .set("algo", algo));
-
-                  TensorShape shape0{
-                      batchSize, inputChannels, inputSize, inputSize};
-                  TensorShape shape1{
-                      outputChannels, inputChannels, filterSize, filterSize};
-                  TensorShape shape2{
-                      batchSize, outputChannels, outputSize, outputSize};
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape0));
-                  test.addInputs(BufferArg(VALUE_TYPE_FLOAT, shape1));
-                  test.addOutputs(BufferArg(VALUE_TYPE_FLOAT, shape2));
-                  test.run();
-                }
-              }
-            }
-          }
-        }
-      }
-    }
-  }
-};
+TEST(NNPACK, Forward) {
+  Convolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NNPACKConv-CPU", forward);
+}
 
-TEST(Convolution, NNPACK) {
-  // NNPACK only supports stride = 1
-  ConvolutionTest test("GemmConv-CPU", "NNPACKConv-CPU", FLAGS_algo);
+TEST(NNPACK, Depthwise) {
+  DepthwiseConvolution<DEVICE_TYPE_CPU, DEVICE_TYPE_CPU>(
+      "GemmConv-CPU", "NNPACKConv-CPU", forward);
 }
 
 }  // namespace paddle
diff --git a/paddle/gserver/CMakeLists.txt b/paddle/gserver/CMakeLists.txt
index 0012636b8f618a1b45cfc801c04781e67694956f..62cff9361ccba3ae3b9359ddb932f5b26146eb97 100644
--- a/paddle/gserver/CMakeLists.txt
+++ b/paddle/gserver/CMakeLists.txt
@@ -23,6 +23,17 @@ endmacro()
 
 filter_test(GSERVER_HEADER)
 filter_test(GSERVER_SOURCES)
+
+if(NOT WITH_MKLDNN)  # MKL-DNN disabled: drop MKLDNN*.h/.cpp from the file lists
+    file(GLOB_RECURSE DNN_HEADER RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.h")
+    file(GLOB_RECURSE DNN_SOURCES RELATIVE "${CMAKE_CURRENT_SOURCE_DIR}" "MKLDNN*.cpp")
+    list(REMOVE_ITEM GSERVER_HEADER ${DNN_HEADER})
+    list(REMOVE_ITEM GSERVER_SOURCES ${DNN_SOURCES})
+    message(STATUS "Skip compiling with MKLDNNLayers and MKLDNNActivations")
+else()
+    message(STATUS "Compile with MKLDNNLayers and MKLDNNActivations")
+endif()
+
 if(NOT WITH_GPU)
     list(REMOVE_ITEM GSERVER_HEADER
         layers/CudnnConvBaseLayer.h
diff --git a/paddle/gserver/activations/ActivationFunction.cpp b/paddle/gserver/activations/ActivationFunction.cpp
index 5de2170877ed6f6c70c5617918ad2c4e3b3ed2ee..78e958e06fac84fa956abc9faea60157bf6132eb 100644
--- a/paddle/gserver/activations/ActivationFunction.cpp
+++ b/paddle/gserver/activations/ActivationFunction.cpp
@@ -112,7 +112,6 @@ BEGIN_DEFINE_ACTIVATION(softmax)
 private:
 MatrixPtr sftMaxSum_;
 MatrixPtr sftMaxDot_;
-MatrixPtr one_;
 
 public:
 Error __must_check forward(Argument& act) {
@@ -138,14 +137,6 @@ Error __must_check backward(Argument& act) {
                            1,
                            /* trans */ false,
                            useGpu(act.deviceId));
-    if (!one_ || one_->getWidth() != outputG->getWidth()) {
-      Matrix::resizeOrCreate(one_,
-                             1,
-                             outputG->getWidth(),
-                             /* trans */ false,
-                             useGpu(act.deviceId));
-      one_->one();
-    }
 
     sftMaxDot_->dotMul(*outputG, *outputV);
     sftMaxSum_->colMerge(*sftMaxDot_);
diff --git a/paddle/gserver/layers/MKLDNNBase.h b/paddle/gserver/layers/MKLDNNBase.h
new file mode 100644
index 0000000000000000000000000000000000000000..4c0234e7b3a91053596c32cea581fa5d1e26b9d5
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNBase.h
@@ -0,0 +1,97 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+typedef enum {
+  MKLDNN_BASE = 1,   // basic info of MKLDNN
+  MKLDNN_TESTS = 1,  // gtest info of MKLDNN (same level as MKLDNN_BASE)
+  MKLDNN_SIZES = 2,  // size info of MKLDNN
+  MKLDNN_FMTS = 3,   // format info of MKLDNN
+  MKLDNN_ALL = 4,    // show all info of MKLDNN
+} MKLDNN_LOG_LEVEL;
+
+/**
+ * @brief MKLDNN CPU engine.
+ * Singleton holding one mkldnn CPU engine; obtain it through Instance().
+ */
+class CPUEngine {
+public:
+  static CPUEngine& Instance() {
+    // Function-local static: initialization is thread-safe since C++11.
+    static CPUEngine myInstance;
+    return myInstance;
+  }
+
+  // Disallow copy or move
+  CPUEngine(const CPUEngine&) = delete;             // Copy constructor
+  CPUEngine(CPUEngine&&) = delete;                  // Move constructor
+  CPUEngine& operator=(const CPUEngine&) = delete;  // Copy assignment
+  CPUEngine& operator=(CPUEngine&&) = delete;       // Move assignment
+
+  mkldnn::engine& getEngine() { return cpuEngine_; }
+
+protected:
+  CPUEngine() : cpuEngine_(mkldnn::engine::cpu, 0) {}
+  //    CPUEngine() : cpuEngine_(mkldnn::engine::cpu_lazy, 0) {}
+  ~CPUEngine() {}
+
+private:
+  mkldnn::engine cpuEngine_;
+};
+
+/**
+ * @brief MKLDNN Stream.
+ * Owns an eager mkldnn::stream; a fresh one is created for every submit().
+ */
+class MKLDNNStream {
+public:
+  MKLDNNStream() : ready_(false) { resetState(); }
+
+  virtual ~MKLDNNStream() {}
+
+  /**
+   * @brief Submit stream
+   * @param prims The primitives vector
+   * @param block Whether to wait for the stream to complete
+   */
+  void submit(std::vector<mkldnn::primitive>& prims, bool block = true) {
+    resetState();
+    stream_->submit(prims).wait(block);
+    ready_ = false;
+  }
+
+  /**
+   * @brief Reset the mkldnn stream (no-op while ready_ is true)
+   */
+  void resetState() {
+    if (ready_) {
+      return;
+    }
+    // TODO(TJ): change me once mkldnn has a method to reset this state
+    // stream_.reset(new mkldnn::stream(mkldnn::stream::kind::lazy));
+    stream_.reset(new mkldnn::stream(mkldnn::stream::kind::eager));
+    ready_ = true;
+  }
+
+private:
+  bool ready_;
+  std::shared_ptr<mkldnn::stream> stream_;
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.cpp b/paddle/gserver/layers/MKLDNNFcLayer.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..30f567eaf8248a8fba1b461a2bdbf2aab13f9e08
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNFcLayer.cpp
@@ -0,0 +1,282 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNFcLayer.h"
+#include "paddle/utils/Logging.h"
+#include "paddle/utils/Stat.h"
+
+using namespace mkldnn;  // NOLINT
+typedef memory::format format;
+typedef inner_product_forward fc_fwd;
+typedef inner_product_backward_weights fc_bwdWgt;
+typedef inner_product_backward_data fc_bwdData;
+
+namespace paddle {
+
+REGISTER_LAYER(mkldnn_fc, MKLDNNFcLayer);
+
+bool MKLDNNFcLayer::init(const LayerMap& layerMap,
+                         const ParameterMap& parameterMap) {
+  if (!MKLDNNLayer::init(layerMap, parameterMap)) {
+    return false;
+  }
+  // unsigned literal avoids a signed/unsigned comparison inside CHECK_EQ
+  CHECK_EQ(inputLayers_.size(), 1UL) << "Only support one input layer yet";
+  CHECK_EQ(inputLayers_.size(), parameters_.size());
+  CHECK(!parameters_[0]->isSparse()) << "Do not support sparse yet";
+
+  // output size, can not be changed
+  oc_ = getSize();
+  oh_ = 1;
+  ow_ = 1;
+
+  // input size can not change in FC
+  iLayerSize_ = inputLayers_[0]->getSize();
+  CHECK_EQ(parameters_[0]->getSize(), iLayerSize_ * oc_);
+
+  // create weight
+  weight_ =
+      std::unique_ptr<Weight>(new Weight(oc_, iLayerSize_, parameters_[0], 0));
+
+  // create biases
+  if (biasParameter_.get() != NULL) {
+    biases_ = std::unique_ptr<Weight>(new Weight(1, oc_, biasParameter_));
+  }
+  return true;
+}
+
+void MKLDNNFcLayer::convertWeightsFromPaddle() {
+  if (FLAGS_use_mkldnn_wgt) {
+    return;
+  }
+
+  if (hasInitedWgt_) {
+    return;
+  }
+
+  // Wrap the weight_ buffer as a matrix with the initial paddle shape
+  MatrixPtr paddleWgt = Matrix::create(
+      weight_->getW()->getData(), iLayerSize_, oc_, false, false);
+
+  // TODO(TJ): remove this print once weights no longer need to be diffed
+  std::ostringstream ostr;
+  paddleWgt->print(ostr);
+  VLOG(MKLDNN_ALL) << "Initial Weight from paddle: " << std::endl << ostr.str();
+
+  // The mkldnn weight is the transpose of the initial paddle matrix
+  MatrixPtr paddleWgtT;
+  paddleWgt->transpose(paddleWgtT, true);
+  weight_->getW()->copyFrom(*paddleWgtT);
+  hasInitedWgt_ = true;
+}
+
+void MKLDNNFcLayer::convertWeightsToPaddle() {
+  MatrixPtr dnnWgt = weight_->getW();
+  MatrixPtr paddleWgt;
+  dnnWgt->transpose(paddleWgt, true);
+
+  // copy the transposed (paddle) weight back, overwriting weight_ in place
+  MatrixPtr dnnWgtT = Matrix::create(
+      dnnWgt->getData(), dnnWgt->getWidth(), dnnWgt->getHeight(), false, false);
+  dnnWgtT->copyFrom(*paddleWgt);
+}
+
+void MKLDNNFcLayer::reshape() {
+  const Argument& input = getInput(0);
+  int batchSize = input.getBatchSize();
+  if (bs_ == batchSize) {
+    return;
+  }
+  bs_ = batchSize;
+  ih_ = input.getFrameHeight();
+  iw_ = input.getFrameWidth();
+  if (ih_ == 0) {
+    ih_ = 1;
+  }
+  if (iw_ == 0) {
+    iw_ = 1;
+  }
+  hasSpatial_ = true;
+  if (ih_ == 1 && iw_ == 1) {
+    hasSpatial_ = false;
+  }
+  CHECK_EQ(iLayerSize_, inputLayers_[0]->getSize());
+  ic_ = iLayerSize_ / (ih_ * iw_);
+  CHECK_EQ(size_t(ic_ * ih_ * iw_), iLayerSize_) << "not divisible";
+  CHECK_EQ(size_t(oc_), getSize());
+  printSizeInfo();
+
+  // reset output
+  output_.setFrameHeight(oh_);
+  output_.setFrameWidth(ow_);
+  resetOutput(bs_, oc_);
+
+  // reset mkldnn forward
+  resetFwd();
+  needResetBwd_ = true;
+
+  convertWeightsFromPaddle();
+}
+
+void MKLDNNFcLayer::resetFwd() {
+  bool hasBias = biases_ && biases_->getW();
+  real* iData = getInputValue(0)->getData();
+  real* oData = getOutputValue()->getData();
+  real* wData = weight_->getW()->getData();
+  real* bData = hasBias ? biases_->getW()->getData() : NULL;
+
+  // TODO(TJ): below create should be covered in MkldnnMatrix
+  // create memory desc
+  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
+                                 : createMD({bs_, ic_}, format::nc);
+  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
+                                 : createMD({oc_, ic_}, format::oi);
+  memory::desc bMD = bData != NULL ? createMD({oc_}, format::x)
+                                   : createMD({}, format::format_undef);
+  memory::desc oMD = createMD({bs_, oc_}, format::nc);
+
+  // create memory primitive desc and memory self
+  inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
+  wgtVal_.reset(new memory(memory::primitive_desc(wMD, engine_), wData));
+  outVal_.reset(new memory(memory::primitive_desc(oMD, engine_), oData));
+
+  prop_kind pk = prop_kind::forward;
+  fc_fwd::desc fwdDesc = bData != NULL ? fc_fwd::desc(pk, iMD, wMD, bMD, oMD)
+                                       : fc_fwd::desc(pk, iMD, wMD, oMD);
+  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
+
+  if (bData != NULL) {
+    biasVal_.reset(new memory(memory::primitive_desc(bMD, engine_), bData));
+    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *biasVal_, *outVal_));
+  } else {
+    fwd_.reset(new fc_fwd(fwdPD, *inVal_, *wgtVal_, *outVal_));
+  }
+  pipelineFwd_.clear();
+  pipelineFwd_.push_back(*fwd_);
+}
+
+void MKLDNNFcLayer::resetBwd() {
+  if (!needResetBwd_) {
+    return;
+  }
+  needResetBwd_ = false;
+
+  bool hasBias = biases_ && biases_->getWGrad();
+  real* iData = getInputValue(0)->getData();
+  real* iDiff = getInputGrad(0) != nullptr ? getInputGrad(0)->getData() : NULL;
+  real* oDiff = getOutputGrad()->getData();
+  real* wDiff = weight_->getWGrad()->getData();
+  real* bDiff = hasBias ? biases_->getWGrad()->getData() : NULL;
+
+  /// backward weight
+  // create memory desc for backward memory
+  memory::desc iMD = hasSpatial_ ? createMD({bs_, ic_, ih_, iw_}, format::nchw)
+                                 : createMD({bs_, ic_}, format::nc);
+  memory::desc wMD = hasSpatial_ ? createMD({oc_, ic_, ih_, iw_}, format::oihw)
+                                 : createMD({oc_, ic_}, format::oi);
+  memory::desc oMD = createMD({bs_, oc_}, format::nc);
+  memory::desc bMD = bDiff != NULL ? createMD({oc_}, format::x)
+                                   : createMD({}, format::format_undef);
+
+  if (inVal_) {
+    // update data
+    inVal_->set_data_handle(iData);
+  } else {
+    inVal_.reset(new memory(memory::primitive_desc(iMD, engine_), iData));
+  }
+
+  // create memory primitive desc and memory self
+  wgtGrad_.reset(new memory(memory::primitive_desc(wMD, engine_), wDiff));
+  outGrad_.reset(new memory(memory::primitive_desc(oMD, engine_), oDiff));
+
+  fc_fwd::desc fwdDesc = fc_fwd::desc(prop_kind::forward, iMD, wMD, oMD);
+  fc_fwd::primitive_desc fwdPD = fc_fwd::primitive_desc(fwdDesc, engine_);
+  fc_bwdWgt::desc bwdWgtDesc = bDiff != NULL
+                                   ? fc_bwdWgt::desc(iMD, wMD, bMD, oMD)
+                                   : fc_bwdWgt::desc(iMD, wMD, oMD);
+  fc_bwdWgt::primitive_desc bwdWgtPD =
+      fc_bwdWgt::primitive_desc(bwdWgtDesc, engine_, fwdPD);
+
+  if (bDiff != NULL) {
+    biasGrad_.reset(new memory(memory::primitive_desc(bMD, engine_), bDiff));
+    bwdWgt_.reset(
+        new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_, *biasGrad_));
+  } else {
+    bwdWgt_.reset(new fc_bwdWgt(bwdWgtPD, *inVal_, *outGrad_, *wgtGrad_));
+  }
+  pipelineBwd_.clear();
+  pipelineBwd_.push_back(*bwdWgt_);
+
+  /// backward data
+  if (iDiff == NULL) {
+    return;
+  }
+  fc_bwdData::desc bwdDataDesc = fc_bwdData::desc(iMD, wMD, oMD);
+  fc_bwdData::primitive_desc bwdDataPD =
+      fc_bwdData::primitive_desc(bwdDataDesc, engine_, fwdPD);
+  inGrad_.reset(new memory(memory::primitive_desc(iMD, engine_), iDiff));
+  CHECK(wgtVal_) << "Should have weight memory";
+  bwdData_.reset(new fc_bwdData(bwdDataPD, *outGrad_, *wgtVal_, *inGrad_));
+  pipelineBwd_.push_back(*bwdData_);
+}
+
+void MKLDNNFcLayer::forward(PassType passType) {
+  Layer::forward(passType);
+  reshape();
+
+  {
+    REGISTER_TIMER_INFO("mkldnn_FwdTimer", getName().c_str());
+
+    // update input data
+    // since it might be changed if this is after data layer
+    real* iData = getInputValue(0)->getData();
+    inVal_->set_data_handle(iData);
+
+    // just submit forward pipeline
+    stream_->submit(pipelineFwd_);
+  }
+
+  /* activation */ {
+    REGISTER_TIMER_INFO("FwActTimer", getName().c_str());
+    forwardActivation();
+  }
+}
+
+void MKLDNNFcLayer::backward(const UpdateCallback& callback) {
+  /* Do derivation */ {
+    REGISTER_TIMER_INFO("BpActTimer", getName().c_str());
+    backwardActivation();
+  }
+
+  {
+    REGISTER_TIMER_INFO("mkldnn_bwdTimer", getName().c_str());
+    resetBwd();
+
+    // update diff
+    real* oDiff = getOutputGrad()->getData();
+    outGrad_->set_data_handle(oDiff);
+
+    // just submit backward pipeline
+    stream_->submit(pipelineBwd_);
+  }
+
+  {
+    REGISTER_TIMER_INFO("WeightUpdate", getName().c_str());
+    weight_->getParameterPtr()->incUpdate(callback);
+    if (biases_ && biases_->getWGrad()) {
+      biases_->getParameterPtr()->incUpdate(callback);
+    }
+  }
+}
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNFcLayer.h b/paddle/gserver/layers/MKLDNNFcLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..7954852a23f81d36d5fb0ae6a19768f419886fb1
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNFcLayer.h
@@ -0,0 +1,80 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include "MKLDNNLayer.h"
+#include "mkldnn.hpp"
+
+namespace paddle {
+
+/**
+ * @brief A subclass of MKLDNNLayer fc layer.
+ *
+ * The config file api is mkldnn_fc
+ */
+class MKLDNNFcLayer : public MKLDNNLayer {
+protected:
+  // input layer size, can not be changed after init
+  size_t iLayerSize_;  // == ic * ih * iw
+
+  // whether weight_ has already been converted from the paddle format
+  bool hasInitedWgt_;
+
+  // whether the input has image size info (false only if ih == 1 && iw == 1)
+  bool hasSpatial_;
+
+  // fc weight and bias
+  std::unique_ptr<Weight> weight_;
+  std::unique_ptr<Weight> biases_;
+
+public:
+  explicit MKLDNNFcLayer(const LayerConfig& config)
+      : MKLDNNLayer(config), hasInitedWgt_(false), hasSpatial_(true) {}
+
+  ~MKLDNNFcLayer() {}
+
+  bool init(const LayerMap& layerMap,
+            const ParameterMap& parameterMap) override;
+
+  void convertWeightsFromPaddle() override;
+
+  void convertWeightsToPaddle() override;
+
+  void forward(PassType passType) override;
+
+  void backward(const UpdateCallback& callback) override;
+
+protected:
+  /**
+   * reshape the input image sizes
+   * and reset output buffer size
+   * and reset mkldnn forward
+   */
+  void reshape();
+
+  /**
+   * reset the forward primitive and memory
+   * only would be called when the batch size changes
+   */
+  void resetFwd();
+
+  /**
+   * reset the backward primitive and memory for mkldnn fc
+   * only would be called when needed
+   */
+  void resetBwd();
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/layers/MKLDNNLayer.h b/paddle/gserver/layers/MKLDNNLayer.h
new file mode 100644
index 0000000000000000000000000000000000000000..63e29f447eede5ff9df8715bc9140b64ab7f7d17
--- /dev/null
+++ b/paddle/gserver/layers/MKLDNNLayer.h
@@ -0,0 +1,132 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <vector>
+#include "Layer.h"
+#include "MKLDNNBase.h"
+#include "mkldnn.hpp"
+
+DECLARE_bool(use_mkldnn);
+DECLARE_bool(use_mkldnn_wgt);
+
+namespace paddle {
+
+class MKLDNNLayer;
+typedef std::shared_ptr<MKLDNNLayer> MKLDNNLayerPtr;
+
+/**
+ * @brief Base class of MKLDNNlayer.
+ *
+ */
+class MKLDNNLayer : public Layer {
+protected:
+  // batch size
+  int bs_;
+  // input image channel, height and width
+  int ic_, ih_, iw_;
+  // output image channel, height and width
+  int oc_, oh_, ow_;
+
+  // backward also needs a reset after the forward handle is reset
+  bool needResetBwd_;
+
+  // mkldnn engine, stream and primitives
+  mkldnn::engine engine_;
+  std::shared_ptr<MKLDNNStream> stream_;
+  std::shared_ptr<mkldnn::primitive> fwd_;
+  std::shared_ptr<mkldnn::primitive> bwdWgt_;
+  std::shared_ptr<mkldnn::primitive> bwdData_;
+  std::vector<mkldnn::primitive> pipelineFwd_;
+  std::vector<mkldnn::primitive> pipelineBwd_;
+
+  // TODO(TJ): change below memory as MKLDNNMatrixPtr type
+  std::shared_ptr<mkldnn::memory> inVal_;
+  std::shared_ptr<mkldnn::memory> inGrad_;
+  std::shared_ptr<mkldnn::memory> outVal_;
+  std::shared_ptr<mkldnn::memory> outGrad_;
+  std::shared_ptr<mkldnn::memory> wgtVal_;
+  std::shared_ptr<mkldnn::memory> wgtGrad_;
+  std::shared_ptr<mkldnn::memory> biasVal_;
+  std::shared_ptr<mkldnn::memory> biasGrad_;
+
+public:
+  explicit MKLDNNLayer(const LayerConfig& config)
+      : Layer(config),
+        bs_(0),
+        ic_(0),
+        ih_(0),
+        iw_(0),
+        oc_(0),
+        oh_(0),
+        ow_(0),
+        needResetBwd_(true),
+        engine_(mkldnn::engine::cpu, 0),
+        stream_(nullptr),
+        fwd_(nullptr),
+        bwdWgt_(nullptr),
+        bwdData_(nullptr) {}
+
+  ~MKLDNNLayer() {}
+
+  virtual bool init(const LayerMap& layerMap,
+                    const ParameterMap& parameterMap) {
+    if (!Layer::init(layerMap, parameterMap)) {
+      return false;
+    }
+    // MKLDNN layers are only usable when the use_mkldnn flag is set.
+    CHECK(FLAGS_use_mkldnn) << "MkldnnLayers only support use_mkldnn. "
+                            << "Please set WITH_MKLDNN=ON "
+                            << "and set use_mkldnn=True";
+    stream_.reset(new MKLDNNStream());
+    engine_ = CPUEngine::Instance().getEngine();
+
+    // TODO(TJ): deviceId
+    return true;
+  }
+
+  /**
+   * convert weight from paddle format to mkldnn format
+   * weight_ will be overridden
+   */
+  virtual void convertWeightsFromPaddle() {}
+
+  /**
+   * convert mkldnn weight to paddle format
+   * weight_ will be overridden
+   */
+  virtual void convertWeightsToPaddle() {}
+
+  /**
+   * print info about sizes
+   */
+  virtual void printSizeInfo() {
+    VLOG(MKLDNN_SIZES) << getName() << ": bs: " << bs_ << ", ic: " << ic_
+                       << ", ih: " << ih_ << ", iw: " << iw_ << ", oc: " << oc_
+                       << ", oh: " << oh_ << ", ow: " << ow_;
+  }
+
+  // TODO(TJ): move to MkldnnMatrix
+  // create memory desc
+  inline mkldnn::memory::desc createMD(
+      mkldnn::memory::dims dims,
+      mkldnn::memory::format fmt,
+      mkldnn::memory::data_type type = mkldnn::memory::data_type::f32) {
+    // TODO(TJ): isFmtSupported(fmt)
+    return mkldnn::memory::desc(dims, type, fmt);
+  }
+};
+
+}  // namespace paddle
diff --git a/paddle/gserver/tests/CMakeLists.txt b/paddle/gserver/tests/CMakeLists.txt
index 209d0ab9c8d7e8463c8636b1412622a94f359fb1..c2a2993620492a9ec5dae932ff1292ced2c00064 100644
--- a/paddle/gserver/tests/CMakeLists.txt
+++ b/paddle/gserver/tests/CMakeLists.txt
@@ -9,7 +9,7 @@ add_unittest_without_exec(test_ProtoDataProvider
 # mkdir will get error.
 add_test(NAME test_ProtoDataProvider
     COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoDataProvider
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 
 ################# test_LayerGrad #######################
 add_unittest_without_exec(test_LayerGrad
@@ -18,6 +18,15 @@ add_unittest_without_exec(test_LayerGrad
 add_test(NAME test_LayerGrad
     COMMAND test_LayerGrad)
 
+########## test_Mkldnn layers and activations ##########
+if(WITH_MKLDNN)
+    add_unittest_without_exec(test_MKLDNN
+        test_MKLDNN.cpp
+        MKLDNNTester.cpp
+        LayerGradUtil.cpp)
+    add_test(NAME test_MKLDNN COMMAND test_MKLDNN)
+endif()
+
 ################ test_CRFLayerGrad ####################
 add_unittest_without_exec(test_CRFLayerGrad
     test_CRFLayerGrad.cpp
@@ -92,8 +101,8 @@ if(WITH_PYTHON)
         test_PyDataProvider.cpp)
 
     add_test(NAME test_PyDataProvider
-        COMMAND .set_python_path.sh -d ./gserver/tests:${PROJ_ROOT}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+        COMMAND .set_python_path.sh -d ./gserver/tests:${PADDLE_SOURCE_DIR}/python/ ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 
 ############### test_RecurrentLayer #######################
@@ -106,7 +115,7 @@ if(NOT WITH_DOUBLE)
 
     add_test(NAME test_WarpCTCLayer
         COMMAND ${CMAKE_CURRENT_BINARY_DIR}/test_WarpCTCLayer --warpctc_dir=${WARPCTC_LIB_DIR}
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 
 ############### test_RecurrentGradientMachine ###############
@@ -116,20 +125,20 @@ add_unittest_without_exec(test_RecurrentGradientMachine
     test_RecurrentGradientMachine.cpp)
 add_test(NAME test_RecurrentGradientMachine
     COMMAND .set_python_path.sh -d
-            ${PROJ_ROOT}/python:${PROJ_ROOT}/paddle/gserver/tests
+            ${PADDLE_SOURCE_DIR}/python:${PADDLE_SOURCE_DIR}/paddle/gserver/tests
             ${CMAKE_CURRENT_BINARY_DIR}/test_RecurrentGradientMachine
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 
 add_unittest_without_exec(test_NetworkCompare
     test_NetworkCompare.cpp)
 if(WITH_GPU)
     add_test(NAME test_NetworkCompare
-        COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=true
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 else()
     add_test(NAME test_NetworkCompare
-        COMMAND .set_python_path.sh -d ${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle)
+        COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_NetworkCompare --use_gpu=false
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle)
 endif()
 
 
@@ -137,6 +146,6 @@ add_unittest_without_exec(test_PyDataProvider2
         test_PyDataProvider2.cpp)
 
 add_test(NAME test_PyDataProvider2
-   COMMAND .set_python_path.sh -d ${PROJ_ROOT}/paddle/gserver/tests:${PROJ_ROOT}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
-        WORKING_DIRECTORY ${PROJ_ROOT}/paddle
+   COMMAND .set_python_path.sh -d ${PADDLE_SOURCE_DIR}/paddle/gserver/tests:${PADDLE_SOURCE_DIR}/python ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProvider2
+        WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle
 )
diff --git a/paddle/gserver/tests/MKLDNNTester.cpp b/paddle/gserver/tests/MKLDNNTester.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..99c8c4948c9b05ad15d1217ebb70026bbd48453f
--- /dev/null
+++ b/paddle/gserver/tests/MKLDNNTester.cpp
@@ -0,0 +1,369 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "MKLDNNTester.h"
+#include "paddle/gserver/layers/MKLDNNBase.h"
+#include "paddle/gserver/layers/MKLDNNLayer.h"
+
+namespace paddle {
+
+// (re)build the data layers and test layers of both dnn and reference
+void MKLDNNTester::reset(const TestConfig& dnn,
+                         const TestConfig& ref,
+                         size_t batchSize) {
+  const bool trans = false;
+  const bool useGpu = false;
+
+  // drop whatever a previous reset() left behind
+  configs_.clear();
+  layerNames_.clear();
+  dataLayers_.clear();
+  datas_.clear();
+  layerMaps_.clear();
+  parameters_.clear();
+  testLayers_.clear();
+
+  // one slot per network, indexed by DNN and REF
+  configs_.resize(NUM);
+  layerNames_.resize(NUM);
+  dataLayers_.resize(NUM);
+  datas_.resize(NUM);
+  layerMaps_.resize(NUM);
+  parameters_.resize(NUM);
+  testLayers_.resize(NUM);
+
+  // reset configs and layer names
+  configs_[DNN] = dnn;
+  configs_[REF] = ref;
+  layerNames_[DNN] = "mkldnn";     // the first is mkldnn layer
+  layerNames_[REF] = "reference";  // second is reference layer
+
+  // set up each network via initDataLayer() and then initTestLayer()
+  for (size_t i = 0; i < NUM; ++i) {
+    configs_[i].layerConfig.set_name(layerNames_[i]);
+    initDataLayer(configs_[i],
+                  &(dataLayers_[i]),
+                  &(datas_[i]),
+                  &(layerMaps_[i]),
+                  layerNames_[i],
+                  batchSize,
+                  trans,
+                  useGpu);
+    initTestLayer(
+        configs_[i], &(layerMaps_[i]), &(parameters_[i]), &(testLayers_[i]));
+  }
+  dnnLayer_ = testLayers_[DNN];
+  refLayer_ = testLayers_[REF];
+  EXPECT_EQ(dataLayers_[DNN].size(), dataLayers_[REF].size());
+  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+
+  setInputImgSize();
+}
+
+void MKLDNNTester::setInputImgSize() {  // stamp ih_ x iw_ on every input
+  for (const auto& inputs : dataLayers_) {
+    for (const auto& input : inputs) {
+      // TODO(TJ): fix me when concat and elewise ready
+      input->getOutput().setFrameHeight(ih_);
+      input->getOutput().setFrameWidth(iw_);
+    }
+  }
+}
+
+// randomize the parameters of ref, then copy the same values to mkldnn
+void MKLDNNTester::randomWgtDatas() {
+  EXPECT_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  for (size_t i = 0; i < parameters_[REF].size(); ++i) {
+    const VectorPtr& dnnValue = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& refValue = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
+    parameters_[REF][i]->randomize();
+    dnnValue->copyFrom(*refValue);  // both sides now hold identical weights
+
+    VLOG(lvl_) << "Random weight data " << parameters_[DNN][i]->getName();
+    printVector(dnnValue);
+  }
+}
+
+// randomize the input (bottom) data of ref and copy the same to mkldnn
+void MKLDNNTester::randomBotDatas() {
+  CHECK_EQ(dataLayers_.size(), NUM);
+  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
+    dataLayers_[REF][i]->getOutputValue()->randomizeUniform();
+    dataLayers_[DNN][i]->getOutputValue()->copyFrom(
+        *(dataLayers_[REF][i]->getOutputValue()));
+    VLOG(lvl_) << "Input " << i << " data:";
+    printMatrix(dataLayers_[REF][i]->getOutputValue());
+  }
+}
+
+void MKLDNNTester::randomTopDiffs() {  // random output grad, shared by both
+  refLayer_->getOutputGrad()->randomizeUniform();
+  dnnLayer_->getOutputGrad()->copyFrom(*(refLayer_->getOutputGrad()));
+  VLOG(lvl_) << "Random dom Backward Input, TopDiff: ";
+  printMatrix(refLayer_->getOutputGrad());
+}
+
+void MKLDNNTester::checkForward() {  // dnn output must match ref output
+  printTopDatas();
+  double delta = compareMatrix(testLayers_[DNN]->getOutputValue(),
+                               testLayers_[REF]->getOutputValue());
+  VLOG(MKLDNN_ALL) << "Check Forward";
+  EXPECT_LE(fabs(delta), eps_);  // relative difference bounded by eps_
+}
+
+void MKLDNNTester::checkBackwardData() {  // compare grads of every input
+  // TODO(TJ): uncomment me when batch norm ready
+  // const bool isBN = dnnLayer_->getType() == "mkldnn_batch_norm";
+  for (size_t i = 0; i < dataLayers_[DNN].size(); ++i) {
+    const MatrixPtr& dnnDiff = dataLayers_[DNN][i]->getOutputGrad();
+    const MatrixPtr& refDiff = dataLayers_[REF][i]->getOutputGrad();
+    VLOG(lvl_) << "Mkldnn Backward Output BotDiff " << i;
+    printMatrix(dnnDiff);
+    VLOG(lvl_) << "Reference Backward Output BotDiff " << i;
+    printMatrix(refDiff);
+
+    double delta = compareMatrix(dnnDiff, refDiff);
+    EXPECT_LE(fabs(delta), eps_);  // relative difference bounded by eps_
+    // TODO(TJ): uncomment me when batch norm ready
+    // if (isBN) {
+    //  // the other two inputs in batch norm are for moving mean and var
+    //  break;
+    // }
+  }
+}
+
+void MKLDNNTester::checkBackwardWgts() {  // compares PARAMETER_VALUE buffers
+  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  vector<VectorPtr> dnnWgts;  // temporary copy of the mkldnn weights
+  saveWgt(parameters_[DNN], dnnWgts);
+
+  const MKLDNNLayerPtr dnnlayer =
+      std::dynamic_pointer_cast<MKLDNNLayer>(dnnLayer_);
+  CHECK(dnnlayer);  // the layer under test must really be an MKLDNNLayer
+  dnnlayer->convertWeightsToPaddle();
+  for (size_t i = 0; i < parameters_[DNN].size(); ++i) {
+    const VectorPtr& dnn = parameters_[DNN][i]->getBuf(PARAMETER_VALUE);
+    const VectorPtr& ref = parameters_[REF][i]->getBuf(PARAMETER_VALUE);
+    VLOG(lvl_) << "Mkldnn Output weight " << parameters_[DNN][i]->getName();
+    printVector(dnn);
+    VLOG(lvl_) << "Reference Output weight " << parameters_[REF][i]->getName();
+    printVector(ref);
+
+    double delta = compareVector(dnn, ref);
+    EXPECT_LE(fabs(delta), eps_);  // relative difference bounded by eps_
+  }
+
+  VLOG(MKLDNN_ALL) << "Restore dnn weights before compare";
+  restoreWgt(dnnWgts, parameters_[DNN]);  // undo convertWeightsToPaddle()
+}
+
+void MKLDNNTester::saveWgt(const vector<ParameterPtr>& from,
+                           vector<VectorPtr>& to) {
+  // copy every PARAMETER_VALUE buffer into a vector created with useGpu off
+  to.resize(from.size());
+  for (size_t idx = 0; idx < from.size(); ++idx) {
+    const VectorPtr& src = from[idx]->getBuf(PARAMETER_VALUE);
+    to[idx] = Vector::create(src->getSize(), /* useGpu= */ false);
+    to[idx]->copyFrom(*src);
+  }
+}
+
+void MKLDNNTester::restoreWgt(const vector<VectorPtr>& from,
+                              vector<ParameterPtr>& to) {
+  CHECK_EQ(from.size(), to.size());
+  for (size_t idx = 0; idx < from.size(); ++idx) {
+    // overwrite the parameter's value buffer with the saved copy
+    to[idx]->getBuf(PARAMETER_VALUE)->copyFrom(*from[idx]);
+  }
+}
+
+// zero the gradient buffer of every parameter that has one
+void MKLDNNTester::clearWgtDiffs() {
+  for (const auto& params : parameters_) {
+    for (const auto& param : params) {
+      const VectorPtr& diff = param->getBuf(PARAMETER_GRADIENT);
+      if (diff) {
+        diff->zeroMem();
+      }
+    }
+  }
+}
+
+void MKLDNNTester::clearBotDiffs() {
+  // both networks: dnn and ref
+  for (const auto& inputs : dataLayers_) {
+    // every input layer of that network
+    for (const auto& input : inputs) {
+      input->getOutputGrad()->zeroMem();
+    }
+  }
+}
+
+void MKLDNNTester::clearBotDiffs(int n) {
+  CHECK(n >= 0 && n < NUM) << "invalid network index: " << n;
+  // zero the output grad of every input layer of network n (DNN or REF)
+  for (const auto& input : dataLayers_[n]) {
+    input->getOutputGrad()->zeroMem();
+  }
+}
+
+void MKLDNNTester::clearTopDatas() {  // zero every test layer's output value
+  for (const auto& layer : testLayers_) {
+    layer->getOutputValue()->zeroMem();
+  }
+}
+
+void MKLDNNTester::printTopDatas() {  // log both layers' forward outputs
+  if (!log_) {
+    return;  // detailed logging was not requested
+  }
+
+  for (int n = 0; n < NUM; ++n) {
+    VLOG(lvl_) << testLayers_[n]->getType() << " forward output TopData: ";
+    printMatrix(testLayers_[n]->getOutputValue());
+  }
+}
+
+void MKLDNNTester::printMatrix(const MatrixPtr& m) {
+  // dumping is opt-in: stay silent unless log_ is set
+  if (log_) {
+    std::ostringstream buf;
+    m->print(buf);
+    // leading newline so the matrix starts on its own line
+    VLOG(lvl_) << std::endl << buf.str();
+  }
+}
+
+void MKLDNNTester::printVector(const VectorPtr& v) {
+  // dumping is opt-in: stay silent unless log_ is set
+  if (log_) {
+    std::ostringstream buf;
+    v->print(buf, v->getSize());
+    // leading newline so the vector starts on its own line
+    VLOG(lvl_) << std::endl << buf.str();
+  }
+}
+
+double MKLDNNTester::getDelta(const real* d1,
+                              const real* d2,
+                              size_t len,
+                              const float failRate,
+                              const float thres) {
+  double delta = 0, sum = 0;  // sum(|d1 - d2|) and sum(|d2|)
+  int failCnt = 0;  // elements whose relative error exceeds thres
+  const double eps = 1e-5;  // magnitudes <= eps are never counted as failed
+  double maxOut = 0;  // largest relative error among the failed elements
+  for (size_t i = 0; i < len; ++i) {
+    double ref = fabs(d2[i]);
+    double diff = fabs(d1[i] - d2[i]);
+    delta += diff;
+    sum += ref;
+    if (ref > eps && fabs(d1[i]) > eps && diff / ref > thres) {
+      maxOut = std::max(maxOut, diff / ref);
+      failCnt++;
+    }
+  }
+  EXPECT_TRUE(std::isnormal(sum));  // also rejects an all-zero reference
+  EXPECT_FALSE(std::isinf(sum));
+  EXPECT_FALSE(std::isnan(delta));
+  VLOG(MKLDNN_ALL) << "reference avg data: " << sum / len
+                   << ", delta: " << delta / sum << ", failCnt:" << failCnt;
+  return (failCnt / (float)len) > failRate ? maxOut : delta / sum;
+}
+
+double MKLDNNTester::compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2) {
+  CHECK_EQ(m1->getElementCnt(), m2->getElementCnt());  // same size required
+  return getDelta(m1->getData(), m2->getData(), m1->getElementCnt());
+}
+
+double MKLDNNTester::compareVector(const VectorPtr& v1, const VectorPtr& v2) {
+  CHECK_EQ(v1->getSize(), v2->getSize());  // same size required
+  return getDelta(v1->getData(), v2->getData(), v1->getSize());
+}
+
+void MKLDNNTester::runOnce() {  // one forward + backward pass, with checks
+  // test forward
+  randomBotDatas();
+  dnnLayer_->forward(PASS_TRAIN);
+  refLayer_->forward(PASS_TRAIN);
+  checkForward();
+
+  // test backward
+  randomTopDiffs();
+  dnnLayer_->backward(nullptr);
+  refLayer_->backward(nullptr);
+  checkBackwardData();
+  checkBackwardWgts();
+
+  // clear buffers
+  // the ref code adds to the diff, while the dnn code overwrites it,
+  // and clearTopDatas() and clearWgtDiffs() should be covered by test layers
+  clearBotDiffs(REF);
+}
+
+void MKLDNNTester::run(const TestConfig& dnn,
+                       const TestConfig& ref,
+                       size_t batchSize,
+                       size_t inputImgH,
+                       size_t inputImgW,
+                       size_t iter,
+                       float epsilon,
+                       bool log,
+                       int level) {
+  VLOG(MKLDNN_TESTS) << "Test MKLDNN functionality: " << dnn.layerConfig.type()
+                     << " vs " << ref.layerConfig.type();
+  ih_ = inputImgH;
+  iw_ = inputImgW;
+  iter_ = iter;  // replaces the value given to the constructor
+  eps_ = epsilon;  // replaces the value given to the constructor
+  log_ = log;
+  lvl_ = level;  // replaces any earlier setLogLevel()
+
+  // First pass: test with FLAGS_use_mkldnn_wgt = false
+  FLAGS_use_mkldnn_wgt = false;
+  // reset, then run iter_ checked iterations on fresh random weights
+  reset(dnn, ref, batchSize);
+  randomWgtDatas();
+  clearWgtDiffs();
+  clearBotDiffs();
+  for (size_t i = 0; i < iter_; ++i) {
+    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
+    runOnce();
+  }
+
+  // Second pass: test with FLAGS_use_mkldnn_wgt = true
+  FLAGS_use_mkldnn_wgt = true;  // NOTE: the flag is still true on return
+  // after the first pass the mkldnn weight has been stored in dnnlayer,
+  // so save the weights of both networks and restart again
+  vector<VectorPtr> dnnWgts, refWgts;
+  CHECK_EQ(parameters_[DNN].size(), parameters_[REF].size());
+  saveWgt(parameters_[DNN], dnnWgts);
+  saveWgt(parameters_[REF], refWgts);
+
+  // rebuild both networks with the flag set to true
+  reset(dnn, ref, batchSize);
+
+  // put the saved weights back into the rebuilt parameters
+  restoreWgt(dnnWgts, parameters_[DNN]);
+  restoreWgt(refWgts, parameters_[REF]);
+  clearWgtDiffs();
+  clearBotDiffs();
+
+  for (size_t i = 0; i < iter_; ++i) {
+    VLOG(MKLDNN_TESTS) << "Check Iteration " << i;
+    runOnce();
+  }
+}
+
+}  //  namespace paddle
diff --git a/paddle/gserver/tests/MKLDNNTester.h b/paddle/gserver/tests/MKLDNNTester.h
new file mode 100644
index 0000000000000000000000000000000000000000..522eeaf24b1949abac057a1e59e9977610be23c0
--- /dev/null
+++ b/paddle/gserver/tests/MKLDNNTester.h
@@ -0,0 +1,120 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#pragma once
+
+#include <string>
+#include <vector>
+#include "LayerGradUtil.h"
+#include "paddle/gserver/layers/MKLDNNBase.h"
+
+namespace paddle {
+
+/**
+ * @brief Test harness that runs an MKLDNN layer next to a reference Paddle
+ * layer on the same data and checks that their results agree.
+ */
+class MKLDNNTester {
+  enum {
+    DNN = 0,  // MKLDNN layer
+    REF = 1,  // Reference layer
+    NUM = 2,  // Number of networks
+  };
+
+protected:
+  std::vector<TestConfig> configs_;  // indexed by DNN / REF, like all below
+  vector<string> layerNames_;
+  vector<vector<DataLayerPtr>> dataLayers_;  // input layers of each network
+  vector<vector<Argument>> datas_;
+  vector<LayerMap> layerMaps_;
+  vector<vector<ParameterPtr>> parameters_;  // parameters of each network
+  vector<LayerPtr> testLayers_;  // the layer under test of each network
+  LayerPtr dnnLayer_, refLayer_;  // testLayers_[DNN] and testLayers_[REF]
+
+  /// number of iterations to run; every iteration must pass
+  size_t iter_;
+  /// whether to print out the details
+  bool log_;
+  /// vlog level used when printing matrix and vector details
+  int lvl_;
+  /// tolerance: the relative delta must not exceed this value
+  float eps_;
+  /// input image height and width, default 1
+  size_t ih_, iw_;
+
+public:
+  explicit MKLDNNTester(size_t iter = 3, float epsilon = 1e-4)
+      : iter_(iter), log_(false), lvl_(MKLDNN_ALL), eps_(epsilon),
+        ih_(1), iw_(1) {
+    // ih_/iw_ start at their documented default of 1 rather than being
+    // left indeterminate; run() overwrites all of these members per call
+  }
+
+  ~MKLDNNTester() {}
+
+public:
+  void run(const TestConfig& dnn,
+           const TestConfig& ref,
+           size_t batchSize,
+           size_t inputImgH = 1,
+           size_t inputImgW = 1,
+           size_t iter = 3,
+           float epsilon = 1e-4,
+           bool log = false,
+           int level = MKLDNN_ALL);
+  void setLogLevel(int lvl) { lvl_ = lvl; }  // overwritten by run()'s level
+
+private:
+  void reset(const TestConfig& dnn, const TestConfig& ref, size_t batchSize);
+  void setInputImgSize();
+  void runOnce();
+
+  void randomWgtDatas();
+  void randomBotDatas();
+  void randomTopDiffs();
+
+  void checkForward();
+  void checkBackwardData();
+  void checkBackwardWgts();
+
+  void clearWgtDiffs();
+  void clearBotDiffs();
+  void clearBotDiffs(int n);  // clear one network only: DNN or REF
+  void clearTopDatas();
+
+  void printTopDatas();
+  void printMatrix(const MatrixPtr& m);
+  void printVector(const VectorPtr& v);
+
+  void saveWgt(const vector<ParameterPtr>& from, vector<VectorPtr>& to);
+  void restoreWgt(const vector<VectorPtr>& from, vector<ParameterPtr>& to);
+
+  double compareMatrix(const MatrixPtr& m1, const MatrixPtr& m2);
+  double compareVector(const VectorPtr& v1, const VectorPtr& v2);
+
+  /**
+   * Get the relative difference between d1 (dnn) and d2 (ref).
+   * If more than failRate of the elements are wrong, i.e. have
+   * abs(d1-d2)/abs(d2) > thres, return the max of diff/ref over them;
+   * otherwise return sum(abs(d1-d2)) / sum(abs(d2)).
+   * The return value should be smaller than eps when passing.
+   */
+  double getDelta(const real* d1,
+                  const real* d2,
+                  size_t len,
+                  const float failRate = 1e-3,
+                  const float thres = 0.1);
+};
+
+}  //  namespace paddle
diff --git a/paddle/gserver/tests/test_KmaxSeqScore.cpp b/paddle/gserver/tests/test_KmaxSeqScore.cpp
index a51fe390c74d74cd5f3d07df62b715b239335548..308abe6816428bc0f98ec32e892622fa4a23b1ae 100644
--- a/paddle/gserver/tests/test_KmaxSeqScore.cpp
+++ b/paddle/gserver/tests/test_KmaxSeqScore.cpp
@@ -96,6 +96,11 @@ TEST(Layer, kmaxSeqScoreLayer) {
   MatrixPtr inValue =
       Matrix::create(subSeqStartPosition.back(), 1, false, false);
 
+  std::vector<bool> mode = {false};
+#ifndef PADDLE_ONLY_CPU
+  mode.push_back(true);
+#endif
+
   for (auto hasSubseq : {false, true}) {
     vector<vector<int>> groundTruth;
     inValue->randomizeUniform();
@@ -104,7 +109,7 @@ TEST(Layer, kmaxSeqScoreLayer) {
                          hasSubseq ? subSeqStartPosition : seqStartPosition,
                          beamSize);
 
-    for (auto useGpu : {false, true}) {
+    for (auto useGpu : mode) {
       TestConfig config;
       config.layerConfig.set_type("kmax_seq_score");
       config.layerConfig.set_beam_size(beamSize);
diff --git a/paddle/gserver/tests/test_MKLDNN.cpp b/paddle/gserver/tests/test_MKLDNN.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..e1d2270df24331914f3a51acc90a518084b3ce4e
--- /dev/null
+++ b/paddle/gserver/tests/test_MKLDNN.cpp
@@ -0,0 +1,76 @@
+/* Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserve.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include <gtest/gtest.h>
+#include <string>
+#include <vector>
+#include "MKLDNNTester.h"
+#include "ModelConfig.pb.h"
+
+using namespace paddle;  // NOLINT
+
+DECLARE_bool(thread_local_rand_use_global_seed);
+DECLARE_bool(use_gpu);
+DECLARE_bool(use_mkldnn);
+
+struct testFCDesc {  // one fully-connected test case
+  int bs;  // batch size passed to MKLDNNTester::run()
+  int ic;  // input channel count (input size is ic * ih * iw)
+  int oc;  // output size of the fc layer
+  int ih, iw;  // oh == ow == 1
+};
+
+void testFcLayer(const testFCDesc& pm) {  // mkldnn_fc vs fc on one case
+  const std::string compareTypes[] = {"mkldnn_fc", "fc"};
+  TestConfig cfg;
+  cfg.layerConfig.set_type(compareTypes[0]);
+  cfg.layerConfig.set_size(pm.oc);
+  cfg.inputDefs.push_back(
+      {INPUT_DATA,
+       "layer_0",
+       /* size of input layer= */ size_t(pm.ic * pm.ih * pm.iw),
+       /* size of weight= */ size_t(pm.oc * pm.ic * pm.ih * pm.iw)});
+  cfg.layerConfig.add_inputs();
+
+  MKLDNNTester tester;
+  for (auto biasSize : {pm.oc, 0}) {  // biasSize of oc, then of 0
+    cfg.biasSize = biasSize;
+    TestConfig ref = cfg;  // same config, only the layer type differs
+    ref.layerConfig.set_type(compareTypes[1]);
+    for (auto bs : {pm.bs, 1}) {  // the requested batch size, then 1
+      tester.run(cfg, ref, bs, pm.ih, pm.iw);
+    }
+  }
+}
+
+TEST(MKLDNNLayer, FcLayer) {  // 1x1 inputs first, then image-shaped ones
+  testFcLayer({/*bs*/ 2, /*ic*/ 2, /*oc*/ 3, /*ih*/ 1, /*iw*/ 1});
+  testFcLayer({/*bs*/ 3, /*ic*/ 7, /*oc*/ 19, /*ih*/ 1, /*iw*/ 1});
+  testFcLayer({/*bs*/ 8, /*ic*/ 16, /*oc*/ 32, /*ih*/ 13, /*iw*/ 13});
+  testFcLayer({/*bs*/ 4, /*ic*/ 12, /*oc*/ 18, /*ih*/ 13, /*iw*/ 11});
+  testFcLayer({/*bs*/ 2, /*ic*/ 64, /*oc*/ 32, /*ih*/ 16, /*iw*/ 16});
+  testFcLayer({/*bs*/ 15, /*ic*/ 3, /*oc*/ 6, /*ih*/ 16, /*iw*/ 16});
+}
+
+// TODO(TJ): add branch test
+
+int main(int argc, char** argv) {
+  testing::InitGoogleTest(&argc, argv);
+  FLAGS_use_gpu = false;  // set before initMain() is called
+  FLAGS_use_mkldnn = true;  // set before initMain() is called
+  initMain(argc, argv);
+  FLAGS_thread_local_rand_use_global_seed = true;
+  srand(1);  // fixed seed for the C library rand()
+  return RUN_ALL_TESTS();
+}
diff --git a/paddle/math/CMakeLists.txt b/paddle/math/CMakeLists.txt
index 9981de61606bda6baac103592125b929d4c12a3d..bf28092e82b778dc904c5a2e271f76261cf5f6b6 100644
--- a/paddle/math/CMakeLists.txt
+++ b/paddle/math/CMakeLists.txt
@@ -15,13 +15,13 @@
 file(GLOB MATH_HEADERS . *.h)
 file(GLOB MATH_SOURCES . *.cpp)
 set(MATH_SOURCES
-    "${PROJ_ROOT}/paddle/math/BaseMatrix.cu"
-    "${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu"
+    "${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu"
+    "${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu"
     ${MATH_SOURCES})
 if(NOT WITH_GPU)
     # then compile BaseMatrix.cu as c++ file
-    compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/BaseMatrix.cu")
-    compile_cu_as_cpp("${PROJ_ROOT}/paddle/math/TrainingAlgorithmOp.cu")
+    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/BaseMatrix.cu")
+    compile_cu_as_cpp("${PADDLE_SOURCE_DIR}/paddle/math/TrainingAlgorithmOp.cu")
     add_library(paddle_math STATIC
         ${MATH_SOURCES})
 else()
diff --git a/paddle/math/CpuSparseMatrix.h b/paddle/math/CpuSparseMatrix.h
index 860cad1047fc343b13efa901186ea218d0855151..36d57bbb65245de6b0de5909b55fbc4be3eccd78 100644
--- a/paddle/math/CpuSparseMatrix.h
+++ b/paddle/math/CpuSparseMatrix.h
@@ -302,6 +302,10 @@ public:
   bool isSparse() const { return true; }
 
 private:
+  using Matrix::mul;
   using Matrix::copyFrom;
+  using Matrix::rowMax;
+  using Matrix::print;
+  using Matrix::subMatrix;
 };
 }  // namespace paddle
diff --git a/paddle/math/SparseMatrix.h b/paddle/math/SparseMatrix.h
index f6cd5df338965b55ca17636de097d2401dc057f9..16300db081f89182faa82ea5798e8ec2f1cd93f9 100644
--- a/paddle/math/SparseMatrix.h
+++ b/paddle/math/SparseMatrix.h
@@ -231,6 +231,9 @@ public:
 private:
   using Matrix::mul;
   using Matrix::copyFrom;
+  using Matrix::rowMax;
+  using Matrix::print;
+  using Matrix::subMatrix;
 };
 
 }  // namespace paddle
diff --git a/paddle/operators/CMakeLists.txt b/paddle/operators/CMakeLists.txt
index b3399aaf0fb864857ecbf19a7ebeb498b29510f5..c181bd7b881c08dfd80d640b1ddce10b3c74d758 100644
--- a/paddle/operators/CMakeLists.txt
+++ b/paddle/operators/CMakeLists.txt
@@ -61,9 +61,6 @@ op_library(fill_zeros_like_op SRCS fill_zeros_like_op.cc fill_zeros_like_op.cu)
 
 op_library(sgd_op SRCS sgd_op.cc sgd_op.cu)
 
-op_library(fc_op
-    SRCS fc_op.cc
-    DEPS mul_op rowwise_add_op sigmoid_op softmax_op net_op)
 op_library(recurrent_op SRCS recurrent_op.cc rnn/recurrent_op_utils.cc
     DEPS op_desc tensor op_registry operator net_op)
 cc_test(recurrent_op_test SRCS recurrent_op_test.cc DEPS recurrent_op gtest mul_op add_op)
diff --git a/paddle/operators/fc_op.cc b/paddle/operators/fc_op.cc
deleted file mode 100644
index 01a1a81206f160386467b3c789a41206d89576b6..0000000000000000000000000000000000000000
--- a/paddle/operators/fc_op.cc
+++ /dev/null
@@ -1,76 +0,0 @@
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
-
-   Licensed under the Apache License, Version 2.0 (the "License");
-   you may not use this file except in compliance with the License.
-   You may obtain a copy of the License at
-
-   http://www.apache.org/licenses/LICENSE-2.0
-
-   Unless required by applicable law or agreed to in writing, software
-   distributed under the License is distributed on an "AS IS" BASIS,
-   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-   See the License for the specific language governing permissions and
-   limitations under the License. */
-
-#include "paddle/operators/net_op.h"
-
-#include "paddle/framework/eigen.h"
-#include "paddle/framework/op_registry.h"
-
-namespace paddle {
-namespace operators {
-
-using OpRegistry = framework::OpRegistry;
-
-class FullyConnectedOp : public NetOp {
- public:
-  void Init() override {
-    AddOp(OpRegistry::CreateOp("mul",
-                               {
-                                   Input("X"), Input("W"),
-                               },
-                               {Output("before_act")}, {}));
-    auto b = Input("b");
-    if (b != framework::kEmptyVarName) {
-      AddOp(OpRegistry::CreateOp("rowwise_add",
-                                 {Output("before_act"), Input("b")},
-                                 {Output("before_act")}, {}));
-    }
-
-    auto activation = GetAttr<std::string>("activation");
-    AddOp(OpRegistry::CreateOp(activation, {Output("before_act")},
-                               {Output("Y")}, {}));
-    CompleteAddOp(false);
-  }
-};
-
-class FullyConnectedOpMaker : public framework::OpProtoAndCheckerMaker {
- public:
-  FullyConnectedOpMaker(framework::OpProto *proto,
-                        framework::OpAttrChecker *op_checker)
-      : OpProtoAndCheckerMaker(proto, op_checker) {
-    AddInput("X", "the input of fc operator");
-    AddInput("W", "the weight of fc operator");
-    AddInput("b", "the bias of fc operator");
-
-    AddOutput("Y", "the output of fc operator");
-    AddOutput("before_act", "the before activation output of fc operator")
-        .SetTemporary();
-    AddAttr<std::string>("activation", "The activation key for fc layer")
-        .SetDefault("sigmoid")
-        .InEnum({"sigmoid", "softmax"});
-
-    //! TODO(yuyang18): Complete comment;
-    AddComment("FullyConnected Operator");
-  }
-};
-}  // namespace operators
-}  // namespace paddle
-
-USE_OP(mul);
-USE_OP(rowwise_add);
-USE_OP(sigmoid);
-USE_OP(softmax);
-
-namespace ops = paddle::operators;
-REGISTER_OP(fc, ops::FullyConnectedOp, ops::FullyConnectedOpMaker);
diff --git a/paddle/operators/gaussian_random_op.cu b/paddle/operators/gaussian_random_op.cu
index 54e4ae5d2b255f72582b9826685bfacf6c565fab..1340b1e1e9f19fd96ced9e57fab75fe9d33bc84e 100644
--- a/paddle/operators/gaussian_random_op.cu
+++ b/paddle/operators/gaussian_random_op.cu
@@ -33,15 +33,16 @@ class GaussianRandomKernel : public framework::OpKernel {
 
     int seed = context.op_.GetAttr<int>("seed");
     if (seed == 0) {
-      seed = std::random_device()();
+      std::random_device rd;
+      seed = rd();
     }
     curandGenerator_t g;
     PADDLE_ENFORCE(platform::dynload::curandCreateGenerator(
         &g, CURAND_RNG_PSEUDO_DEFAULT));
     PADDLE_ENFORCE(
         platform::dynload::curandSetPseudoRandomGeneratorSeed(g, seed));
-    curandGenerateNormal(g, data, framework::product(tensor->dims()), mean,
-                         std);
+    platform::dynload::curandGenerateNormal(
+        g, data, framework::product(tensor->dims()), mean, std);
   }
 };
 
@@ -49,4 +50,4 @@ class GaussianRandomKernel : public framework::OpKernel {
 }  // namespace paddle
 
 namespace ops = paddle::operators;
-REGISTER_OP_GPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
\ No newline at end of file
+REGISTER_OP_GPU_KERNEL(gaussian_random, ops::GaussianRandomKernel<float>);
diff --git a/paddle/operators/uniform_random_op.cu b/paddle/operators/uniform_random_op.cu
index f1a63e52ec0d3d46a505a89d7d7916bf93a58221..b35ebe7b630be72a5856ec1d3cc32bfaf097aa8a 100644
--- a/paddle/operators/uniform_random_op.cu
+++ b/paddle/operators/uniform_random_op.cu
@@ -51,7 +51,8 @@ class GPUUniformRandomKernel : public framework::OpKernel {
     unsigned int seed =
         static_cast<unsigned int>(context.op_.GetAttr<int>("seed"));
     if (seed == 0) {
-      seed = std::random_device()();
+      std::random_device rd;
+      seed = rd();
     }
     T min = static_cast<T>(context.op_.GetAttr<float>("min"));
     T max = static_cast<T>(context.op_.GetAttr<float>("max"));
diff --git a/paddle/platform/dynload/cublas.h b/paddle/platform/dynload/cublas.h
index c44b7240a885c2ef71e550df645dbaded69f9944..aad8097dbb33cbf6c0f2b4b3efb1376fbe96bc74 100644
--- a/paddle/platform/dynload/cublas.h
+++ b/paddle/platform/dynload/cublas.h
@@ -48,13 +48,13 @@ extern void *cublas_dso_handle;
   };                                                                \
   extern DynLoad__##__name __name
 #else
-#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name) \
-  struct DynLoad__##__name {                     \
-    inline template <typename... Args>           \
-    cublasStatus_t operator()(Args... args) {    \
-      return __name(args...);                    \
-    }                                            \
-  };                                             \
+#define DECLARE_DYNAMIC_LOAD_CUBLAS_WRAP(__name)     \
+  struct DynLoad__##__name {                         \
+    template <typename... Args>                      \
+    inline cublasStatus_t operator()(Args... args) { \
+      return __name(args...);                        \
+    }                                                \
+  };                                                 \
   extern DynLoad__##__name __name
 #endif
 
diff --git a/paddle/platform/dynload/curand.h b/paddle/platform/dynload/curand.h
index d8c46bc41e18d013a80cd0a9116a4b1a52bf5854..7bfe0778c78f6075ec8a284d478a1f9d5ee66ae9 100644
--- a/paddle/platform/dynload/curand.h
+++ b/paddle/platform/dynload/curand.h
@@ -55,6 +55,7 @@ extern void *curand_dso_handle;
   __macro(curandSetPseudoRandomGeneratorSeed); \
   __macro(curandGenerateUniform);              \
   __macro(curandGenerateUniformDouble);        \
+  __macro(curandGenerateNormal);               \
   __macro(curandDestroyGenerator);
 
 CURAND_RAND_ROUTINE_EACH(DECLARE_DYNAMIC_LOAD_CURAND_WRAP);
diff --git a/paddle/platform/place.h b/paddle/platform/place.h
index a82e8c942fa28297d91056a66b61f085f2bdb946..1117476bb37f1b0f3876c55e610803d5ee2558ce 100644
--- a/paddle/platform/place.h
+++ b/paddle/platform/place.h
@@ -14,8 +14,8 @@ limitations under the License. */
 
 #pragma once
 
-#include <boost/variant.hpp>
 #include <iostream>
+#include "paddle/platform/variant.h"
 
 namespace paddle {
 namespace platform {
diff --git a/paddle/platform/variant.h b/paddle/platform/variant.h
new file mode 100644
index 0000000000000000000000000000000000000000..c2257af1b5dd1a1e284979bf17e1a947072baa85
--- /dev/null
+++ b/paddle/platform/variant.h
@@ -0,0 +1,32 @@
+/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserve.
+
+   Licensed under the Apache License, Version 2.0 (the "License");
+   you may not use this file except in compliance with the License.
+   You may obtain a copy of the License at
+
+   http://www.apache.org/licenses/LICENSE-2.0
+
+   Unless required by applicable law or agreed to in writing, software
+   distributed under the License is distributed on an "AS IS" BASIS,
+   WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+   See the License for the specific language governing permissions and
+   limitations under the License. */
+
+#pragma once
+
+#include <boost/config.hpp>
+
+#ifndef PADDLE_ONLY_CPU
+
+// Boost's variadic templates are buggy under nvcc, so boost disables its
+// variadic template support when GPU code is compiled with nvcc.
+// Define BOOST_NO_CXX11_VARIADIC_TEMPLATES for gcc/clang as well, so that
+// all compilers generate the same function symbols.
+//
+// https://github.com/PaddlePaddle/Paddle/issues/3386
+#ifndef BOOST_NO_CXX11_VARIADIC_TEMPLATES
+#define BOOST_NO_CXX11_VARIADIC_TEMPLATES
+#endif
+#endif
+
+#include <boost/variant.hpp>
diff --git a/paddle/pserver/test/CMakeLists.txt b/paddle/pserver/test/CMakeLists.txt
index 6e8f9c37f64b70921e09241089a5a480fd8ca47f..b66a00ba0652dfe1afbb877eca06cacdfe2ca343 100644
--- a/paddle/pserver/test/CMakeLists.txt
+++ b/paddle/pserver/test/CMakeLists.txt
@@ -3,7 +3,7 @@ add_unittest_without_exec(socket_test
     SocketTest.cpp)
 
 add_test(NAME socket_test
-    COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
         ${CMAKE_CURRENT_BINARY_DIR}/socket_test --loop_time=10)
 
 ####################### test_ProtoServer ####################
@@ -12,7 +12,7 @@ add_unittest_without_exec(test_ProtoServer
 
 IF(NOT ON_TRAVIS)
     add_test(NAME test_ProtoServer
-        COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port
+        COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port
             ${CMAKE_CURRENT_BINARY_DIR}/test_ProtoServer)
 ENDIF(NOT ON_TRAVIS)
 
@@ -24,5 +24,5 @@ ENDIF(NOT ON_TRAVIS)
 add_unittest_without_exec(test_ParameterServer2
     test_ParameterServer2.cpp)
 add_test(NAME test_ParameterServer2
-    COMMAND ${PROJ_ROOT}/paddle/.set_port.sh -p port -n 4
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port -n 4
         ${CMAKE_CURRENT_BINARY_DIR}/test_ParameterServer2)
diff --git a/paddle/scripts/docker/build.sh b/paddle/scripts/docker/build.sh
index 44442be4729ff77e8d378c93acebe1486eb75397..2f0205b7702b6d73b5348430f39166ec78f6c143 100644
--- a/paddle/scripts/docker/build.sh
+++ b/paddle/scripts/docker/build.sh
@@ -31,7 +31,7 @@ Configuring cmake in /paddle/build ...
       -DWITH_DOC=OFF
       -DWITH_GPU=${WITH_GPU:-OFF}
       -DWITH_AVX=${WITH_AVX:-OFF}
-      -DWITH_GOLANG=${WITH_GOLANG:-OFF}
+      -DWITH_GOLANG=${WITH_GOLANG:-ON}
       -DWITH_SWIG_PY=ON
       -DWITH_C_API=${WITH_C_API:-OFF}
       -DWITH_PYTHON=${WITH_PYTHON:-ON}
@@ -51,7 +51,7 @@ cmake .. \
       -DWITH_DOC=OFF \
       -DWITH_GPU=${WITH_GPU:-OFF} \
       -DWITH_AVX=${WITH_AVX:-OFF} \
-      -DWITH_GOLANG=${WITH_GOLANG:-OFF} \
+      -DWITH_GOLANG=${WITH_GOLANG:-ON} \
       -DWITH_SWIG_PY=${WITH_SWIG_PY:-ON} \
       -DWITH_C_API=${WITH_C_API:-OFF} \
       -DWITH_PYTHON=${WITH_PYTHON:-ON} \
@@ -74,11 +74,11 @@ cat <<EOF
 Running unit tests ...
 ========================================
 EOF
+    ctest --output-on-failure
     # make install should also be test when unittest
     make install -j `nproc`
     pip install /usr/local/opt/paddle/share/wheels/*.whl
     paddle version
-    ctest --output-on-failure
 fi
 
 
@@ -130,7 +130,7 @@ fi
 
 # generate deb package for current build
 # FIXME(typhoonzero): should we remove paddle/scripts/deb ?
-if [[ ${WITH_DEB:-OFF} == "ON" ]]; then
+if [[ ${WITH_DEB:-ON} == "ON" ]]; then
     cat <<EOF
 ========================================
 Generating .deb package ...
diff --git a/paddle/trainer/NewRemoteParameterUpdater.cpp b/paddle/trainer/NewRemoteParameterUpdater.cpp
index af1dceed0284c70d68b61b9682b0cb23c28043d6..cccb7e7cddda18a88355b0217c421ef3e2a21d92 100644
--- a/paddle/trainer/NewRemoteParameterUpdater.cpp
+++ b/paddle/trainer/NewRemoteParameterUpdater.cpp
@@ -66,28 +66,92 @@ void NewRemoteParameterUpdater::init(
   // from parameter server
   if (paddle_begin_init_params(parameterClient_)) {
     LOG(INFO) << "paddle_begin_init_params start";
+    // NOTE: convert V1 OptimizationConfig proto to V2 OptimizerConfig.
+    // This makes golang pserver compatible with handy V1 demos.
+    // TODO: Refine or remove these ugly converting lines
+    OptimizerConfig optimizerConfigV2;
+    if (trainerConfig_.learning_method() == "momentum") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+    } else if (trainerConfig_.learning_method() == "adagrad") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adagrad);
+      optimizerConfigV2.mutable_adagrad()->set_epsilon(
+          trainerConfig_.ada_epsilon());
+    } else if (trainerConfig_.learning_method() == "adadelta") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adadelta);
+      optimizerConfigV2.mutable_adadelta()->set_epsilon(
+          trainerConfig_.ada_epsilon());
+      optimizerConfigV2.mutable_adadelta()->set_rho(trainerConfig_.ada_rou());
+    } else if (trainerConfig_.learning_method() == "adam") {
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::Adam);
+      optimizerConfigV2.mutable_adam()->set_beta_1(trainerConfig_.adam_beta1());
+      optimizerConfigV2.mutable_adam()->set_beta_2(trainerConfig_.adam_beta2());
+      optimizerConfigV2.mutable_adam()->set_epsilon(
+          trainerConfig_.adam_epsilon());
+    } else {  // unknown method: report it and fall back to SGD
+      LOG(ERROR) << "got unsupported v1 optimizer config: "
+                 << trainerConfig_.learning_method();
+      optimizerConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+    }
+
+    if (trainerConfig_.learning_rate_schedule() == "constant") {
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+      optimizerConfigV2.mutable_const_lr()->set_learning_rate(
+          trainerConfig_.learning_rate());
+    } else if (trainerConfig_.learning_rate_schedule() == "linear") {
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Linear);
+      optimizerConfigV2.mutable_linear_lr()->set_learning_rate(
+          trainerConfig_.learning_rate());
+      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_a(
+          trainerConfig_.learning_rate_decay_a());
+      optimizerConfigV2.mutable_linear_lr()->set_lr_decay_b(
+          trainerConfig_.learning_rate_decay_b());
+    } else {
+      LOG(ERROR) << "got unsupported v1 learning_rate_schedule config: "
+                 << trainerConfig_.learning_rate_schedule() << ", set to const";
+      optimizerConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
+    }
+
+    // overwrite optimizerConfigV2 for per-parameter(layer) configs
     for (int i = 0; i < parameterSize(); ++i) {
       auto paramConfig = parameters_[i]->getConfig();
-      LOG(INFO) << "old param config: " << paramConfig.DebugString();
-      // FIXME(typhoonzero): convert old paramConfig to optimizerConfig
-      OptimizerConfig optimizeConfigV2;
-      auto sgdConfigV2 = optimizeConfigV2.mutable_sgd();
-      sgdConfigV2->set_momentum(paramConfig.momentum());
-      sgdConfigV2->set_decay(paramConfig.decay_rate());
-      optimizeConfigV2.set_lr_policy(paddle::OptimizerConfig::Const);
-      auto constlr = optimizeConfigV2.mutable_const_lr();
+      if (paramConfig.has_momentum() &&
+          trainerConfig_.learning_method() == "momentum") {
+        optimizerConfigV2.mutable_sgd()->set_momentum(paramConfig.momentum());
+      }
       if (paramConfig.has_learning_rate()) {
-        constlr->set_learning_rate(paramConfig.learning_rate());
-      } else {
-        constlr->set_learning_rate(trainerConfig_.learning_rate());
+        switch (optimizerConfigV2.lr_policy()) {
+          case 0:
+            optimizerConfigV2.mutable_const_lr()->set_learning_rate(
+                paramConfig.learning_rate());
+            break;
+          case 1:
+            optimizerConfigV2.mutable_linear_lr()->set_learning_rate(
+                paramConfig.learning_rate());
+            break;
+        }
       }
-      if (trainerConfig_.algorithm() == "sgd") {
-        optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
-        // FIXME: config all algorithms
-      } else {
-        optimizeConfigV2.set_optimizer(paddle::OptimizerConfig::SGD);
+      if (paramConfig.has_decay_rate()) {
+        switch (optimizerConfigV2.optimizer()) {
+          case 1:  // SGD
+            optimizerConfigV2.mutable_sgd()->set_decay(
+                paramConfig.decay_rate());
+            break;
+          case 2:  // Adadelta
+            optimizerConfigV2.mutable_adadelta()->set_decay(
+                paramConfig.decay_rate());
+            break;
+          case 3:  // Adagrad
+            optimizerConfigV2.mutable_adagrad()->set_decay(
+                paramConfig.decay_rate());
+            break;
+          case 4:  // Adam
+            optimizerConfigV2.mutable_adam()->set_decay(
+                paramConfig.decay_rate());
+            break;
+        }
       }
-      std::string bytes = optimizeConfigV2.SerializeAsString();
+      // send param and config to pserver
+      std::string bytes = optimizerConfigV2.SerializeAsString();
       const char *array = bytes.data();
       int size = (int)bytes.size();
       paddle_init_param(
diff --git a/paddle/trainer/TrainerConfigHelper.cpp b/paddle/trainer/TrainerConfigHelper.cpp
index 133e2be104c6fbfddefd8698d2b6aa8315c56c70..eba40862b926cfe863c569e73a6a3ceabcf1f3b4 100644
--- a/paddle/trainer/TrainerConfigHelper.cpp
+++ b/paddle/trainer/TrainerConfigHelper.cpp
@@ -28,6 +28,8 @@ DECLARE_bool(with_cost);
 DECLARE_bool(with_gpu);
 DECLARE_bool(parallel_nn);
 DECLARE_string(config_args);
+DECLARE_bool(use_mkldnn);
+DECLARE_bool(use_mkldnn_wgt);
 
 const char *kConfigParserModuleName = "paddle.trainer.config_parser";
 const char *kConfigParserFuncName = "parse_config_and_serialize";
@@ -44,6 +46,8 @@ TrainerConfigHelper::TrainerConfigHelper(const std::string &configFilePath)
   configArgs << "trainer_id=" << FLAGS_trainer_id << ",local=" << FLAGS_local
              << ",with_cost=" << FLAGS_with_cost << ",use_gpu=" << FLAGS_use_gpu
              << ",parallel_nn=" << FLAGS_parallel_nn
+             << ",use_mkldnn=" << FLAGS_use_mkldnn
+             << ",use_mkldnn_wgt=" << FLAGS_use_mkldnn_wgt
              << ",cudnn_version=" << hl_get_cudnn_lib_version();
   if (!FLAGS_config_args.empty()) {
     configArgs << "," << FLAGS_config_args;
diff --git a/paddle/trainer/tests/CMakeLists.txt b/paddle/trainer/tests/CMakeLists.txt
index 08b2d8a38e2d20a357752269bd3ee3f515116abd..f01ad4142d4fe7c7f7d7aac60d967ea114b93e56 100644
--- a/paddle/trainer/tests/CMakeLists.txt
+++ b/paddle/trainer/tests/CMakeLists.txt
@@ -2,19 +2,19 @@
 add_unittest_without_exec(test_Compare
     test_Compare.cpp)
 add_test(NAME test_Compare
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python
         ${CMAKE_CURRENT_BINARY_DIR}/test_Compare
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
 ################# test_Trainer ###########################
 add_unittest_without_exec(test_Trainer
     test_Trainer.cpp)
 add_test(NAME test_Trainer
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/gen_proto_data.py &&
-        ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/gen_proto_data.py &&
+        ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
         ${CMAKE_CURRENT_BINARY_DIR}/test_Trainer
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
 ############### test_TrainerOnePass ##########################
 if(WITH_PYTHON)
@@ -23,60 +23,60 @@ if(WITH_PYTHON)
   add_unittest_without_exec(test_TrainerOnePass
       test_TrainerOnePass.cpp)
   add_test(NAME test_TrainerOnePass
-    COMMAND  ${PROJ_ROOT}/paddle/.set_python_path.sh -d
-          ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
-          ${PROJ_ROOT}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
-      WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    COMMAND  ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+          ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
+          ${PADDLE_SOURCE_DIR}/paddle/.set_port.sh -p port ${CMAKE_CURRENT_BINARY_DIR}/test_TrainerOnePass
+      WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
 ################ test_CompareTwoNets ######################
 add_unittest_without_exec(test_CompareTwoNets
     test_CompareTwoNets.cpp)
 add_test(NAME test_CompareTwoNets
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
         ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoNets
             --config_file_a=trainer/tests/sample_trainer_config_qb_rnn.conf --config_file_b=trainer/tests/sample_trainer_config_rnn.conf
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
 ############### test_CompareTwoOpts ###################
 add_unittest_without_exec(test_CompareTwoOpts
     test_CompareTwoOpts.cpp)
 add_test(NAME test_CompareTwoOpts
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
         ${CMAKE_CURRENT_BINARY_DIR}/test_CompareTwoOpts
             --config_file_a=trainer/tests/sample_trainer_config_opt_a.conf --config_file_b=trainer/tests/sample_trainer_config_opt_b.conf
             --num_passes=1 --need_high_accuracy=0
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
 ################# test_CompareSparse ##################
 add_unittest_without_exec(test_CompareSparse
     test_CompareSparse.cpp)
 if(NOT ON_TRAVIS)
   add_test(NAME test_CompareSparse
-    COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+    COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
               ./.set_port.sh -p port -n 6
                   ${CMAKE_CURRENT_BINARY_DIR}/test_CompareSparse
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 endif()
 ################# test_recurrent_machine_generation ###############
 add_unittest_without_exec(test_recurrent_machine_generation
     test_recurrent_machine_generation.cpp)
 add_test(NAME test_recurrent_machine_generation
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
         ${CMAKE_CURRENT_BINARY_DIR}/test_recurrent_machine_generation
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
 #################### test_PyDataProviderWrapper #########################
 add_unittest_without_exec(test_PyDataProviderWrapper
     test_PyDataProviderWrapper.cpp)
 
 add_test(NAME test_PyDataProviderWrapper
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d
-        ${PROJ_ROOT}/python/:${PROJ_ROOT}/paddle/trainer/tests
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d
+        ${PADDLE_SOURCE_DIR}/python/:${PADDLE_SOURCE_DIR}/paddle/trainer/tests
         ${CMAKE_CURRENT_BINARY_DIR}/test_PyDataProviderWrapper
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
 
 #################### test_config_parser #########################
 add_test(NAME test_config_parser
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/paddle/trainer/tests/config_parser_test.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/paddle/)
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/paddle/trainer/tests/config_parser_test.py
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/paddle/)
diff --git a/paddle/utils/Flags.cpp b/paddle/utils/Flags.cpp
index 320f671ed97dbadc4fa1b4b52d5611cf9239e7dd..600c83a8487191895de635dd8433f6c44e86ce77 100644
--- a/paddle/utils/Flags.cpp
+++ b/paddle/utils/Flags.cpp
@@ -20,6 +20,14 @@ DEFINE_bool(use_gpu, false, "Only support CPU training");
 DEFINE_bool(use_gpu, true, "Whether to use GPU for training");
 #endif
 
+#ifdef PADDLE_USE_MKLDNN
+// TODO(TJ): change to true when MKLDNN layers support multi-inputs
+DEFINE_bool(use_mkldnn, false, "Default still keep use CPU training");
+#else
+DEFINE_bool(use_mkldnn, false, "Only support CPU training");
+#endif
+
+DEFINE_bool(use_mkldnn_wgt, false, "Init weight from CPU weight");
 DEFINE_bool(parallel_nn,
             false,
             "Whether to use multi-threads to calculate one neural network."
diff --git a/paddle/utils/Flags.h b/paddle/utils/Flags.h
index dc4faef8331ed47b9ce3e952389b6469cd9fda2e..0aca4c0ee036ee8490c0ceca7279df876dc21947 100644
--- a/paddle/utils/Flags.h
+++ b/paddle/utils/Flags.h
@@ -40,3 +40,5 @@ DECLARE_bool(show_layer_stat);
 DECLARE_string(predict_file);
 DECLARE_bool(prev_batch_state);
 DECLARE_string(init_model_path);
+DECLARE_bool(use_mkldnn);
+DECLARE_bool(use_mkldnn_wgt);
diff --git a/paddle/utils/tests/CMakeLists.txt b/paddle/utils/tests/CMakeLists.txt
index aa923b355377752f9b297a125f5c43c364ba9b06..c770ce169878d9998e559b1d417fc1acc88cde97 100644
--- a/paddle/utils/tests/CMakeLists.txt
+++ b/paddle/utils/tests/CMakeLists.txt
@@ -13,6 +13,6 @@ add_executable(
 link_paddle_exe(test_CustomStackTracePrint)
 if(NOT APPLE)
     add_test(NAME test_CustomStackTracePrint
-        COMMAND ${PROJ_ROOT}/paddle/utils/tests/test_CustomStackTracePrint.sh
+        COMMAND ${PADDLE_SOURCE_DIR}/paddle/utils/tests/test_CustomStackTracePrint.sh
         WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
 endif()
diff --git a/proto/CMakeLists.txt b/proto/CMakeLists.txt
index e1cea8bd0de5394020a498725485cea025512e48..6212c2e60a8ed94ecc1d6e58535a2b3d365e3eb8 100644
--- a/proto/CMakeLists.txt
+++ b/proto/CMakeLists.txt
@@ -9,13 +9,13 @@ foreach(filename ${proto_filenames})
     get_filename_component(ABS_FIL ${filename} ABSOLUTE)
     get_filename_component(FIL_WE ${filename} NAME_WE)
     set(CUR_PROTO_GEN_PY
-            ${PROJ_ROOT}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
+            ${PADDLE_SOURCE_DIR}/paddle/python/paddle/proto/${FIL_WE}_pb2.py)
     set(PROTO_GEN_PY
             ${CUR_PROTO_GEN_PY}
             ${PROTO_GEN_PY})
     add_custom_command(OUTPUT ${CUR_PROTO_GEN_PY}
             COMMAND ${PROTOBUF_PROTOC_EXECUTABLE}
-            ARGS "--python_out=${PROJ_ROOT}/python/paddle/proto"
+            ARGS "--python_out=${PADDLE_SOURCE_DIR}/python/paddle/proto"
             "-I" ${CMAKE_CURRENT_SOURCE_DIR} ${ABS_FIL}
             DEPENDS ${ABS_FIL} protoc)
 endforeach()
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index b5030da8e75eb94e857ae4effc6adb6d19dc0e93..16c519d45aa62694201379b8da1ca54d8a07ee9a 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -1,5 +1,3 @@
-set(OUTPUT_DIR
-    "${CMAKE_CURRENT_BINARY_DIR}/build")
 
 file(GLOB TRAINER_PY_FILES . ./paddle/trainer/*.py)
 file(GLOB HELPERS_PY_FILES . ./paddle/trainer_config_helpers/*.py)
@@ -18,7 +16,7 @@ SET(COPY_PADDLE_MASTER "")
 if(WITH_GOLANG)
   SET(COPY_PADDLE_MASTER "copy_paddle_master")
   add_custom_command(TARGET ${COPY_PADDLE_MASTER}
-    COMMAND cp ${paddle_master_LIB_PATH} ${PROJ_ROOT}/python/paddle/v2/master/
+    COMMAND cp ${paddle_master_LIB_PATH} ${PADDLE_SOURCE_DIR}/python/paddle/v2/master/
     )
   add_dependencies(copy_paddle_master paddle_master)
 endif(WITH_GOLANG)
@@ -27,19 +25,21 @@ configure_file(${CMAKE_CURRENT_SOURCE_DIR}/setup.py.in
     ${CMAKE_CURRENT_BINARY_DIR}/setup.py)
 
 
-add_custom_command(OUTPUT ${PROJ_ROOT}/python/paddle/v2/framework/core.so
-        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PROJ_ROOT}/python/paddle/v2/framework/core.so
+add_custom_command(OUTPUT ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
+        COMMAND cmake -E copy $<TARGET_FILE:paddle_pybind> ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so
         DEPENDS paddle_pybind)
-add_custom_target(copy_paddle_pybind ALL DEPENDS ${PROJ_ROOT}/python/paddle/v2/framework/core.so)
+add_custom_target(copy_paddle_pybind ALL DEPENDS ${PADDLE_SOURCE_DIR}/python/paddle/v2/framework/core.so)
 
 
-add_custom_command(OUTPUT ${OUTPUT_DIR}/.timestamp
+add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
-    COMMAND ${CMAKE_COMMAND} -E touch ${OUTPUT_DIR}/.timestamp
+    COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
+    COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
+    COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
     DEPENDS gen_proto_py copy_paddle_pybind framework_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
 add_custom_target(paddle_python ALL DEPENDS
-    ${OUTPUT_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model python_api_wheel)
+    ${PADDLE_PYTHON_BUILD_DIR}/.timestamp paddle_pserver_main paddle_trainer paddle_merge_model python_api_wheel)
 
 set(PADDLE_PYTHON_PACKAGE_DIR ${CMAKE_CURRENT_BINARY_DIR}/dist/)
 
diff --git a/python/paddle/trainer/config_parser.py b/python/paddle/trainer/config_parser.py
index b7b696ef0c13e1bae2e910e08d1a1ea3e45cd5d5..da99e5bd53458aa0cb201a3525e28c66ab63c52d 100644
--- a/python/paddle/trainer/config_parser.py
+++ b/python/paddle/trainer/config_parser.py
@@ -1604,6 +1604,8 @@ class MultiClassCrossEntropySelfNormCostLayer(LayerBase):
 
 @config_layer('fc')
 class FCLayer(LayerBase):
+    layer_type = 'fc'
+
     def __init__(self,
                  name,
                  size,
@@ -1611,14 +1613,27 @@ class FCLayer(LayerBase):
                  bias=True,
                  error_clipping_threshold=None,
                  **xargs):
-        super(FCLayer, self).__init__(name, 'fc', size, inputs=inputs, **xargs)
+        use_mkldnn = bool(int(g_command_config_args.get("use_mkldnn", 0)))
+        use_mkldnn_wgt = bool(
+            int(g_command_config_args.get("use_mkldnn_wgt", 0)))
+        if use_mkldnn:
+            self.layer_type = 'mkldnn_fc'
+            config_assert(
+                len(inputs) == 1,
+                "MkldnnFCLayer support one and only one input!")
+        super(FCLayer, self).__init__(
+            name, self.layer_type, size, inputs=inputs, **xargs)
         for input_index in xrange(len(self.inputs)):
             input_layer = self.get_input_layer(input_index)
             psize = self.config.size * input_layer.size
             dims = [input_layer.size, self.config.size]
             format = self.inputs[input_index].format
             sparse = format == "csr" or format == "csc"
-
+            if use_mkldnn:
+                config_assert(not sparse,
+                              "MkldnnFCLayer do not support sparse format yet")
+                if use_mkldnn_wgt:
+                    dims = [self.config.size, input_layer.size]
             if sparse:
                 psize = self.inputs[input_index].nnz
             else:
@@ -1631,6 +1646,11 @@ class FCLayer(LayerBase):
             self.config.error_clipping_threshold = error_clipping_threshold
 
 
+@config_layer('mkldnn_fc')
+class MkldnnFcLayer(FCLayer):
+    layer_type = 'mkldnn_fc'
+
+
 @config_layer('selective_fc')
 class SelectiveFCLayer(LayerBase):
     def __init__(self,
diff --git a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
index 6c860fd49702ebc93612114011361efb885c62ec..580aef935b5cec385a88fb0b4f5b9a5ddeddb40c 100644
--- a/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
+++ b/python/paddle/trainer_config_helpers/tests/CMakeLists.txt
@@ -1,17 +1,17 @@
 #################### test_config_parser #########################
 add_test(NAME layers_test
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/layers_test.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/layers_test.py
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
 
 add_test(NAME test_reset_hook
-  COMMAND ${PROJ_ROOT}/paddle/.set_python_path.sh -d ${PROJ_ROOT}/python/
-        ${PYTHON_EXECUTABLE} ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
-    WORKING_DIRECTORY ${PROJ_ROOT}/python/paddle)
+  COMMAND ${PADDLE_SOURCE_DIR}/paddle/.set_python_path.sh -d ${PADDLE_SOURCE_DIR}/python/
+        ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/test_reset_hook.py
+    WORKING_DIRECTORY ${PADDLE_SOURCE_DIR}/python/paddle)
 
 add_paddle_exe(protobuf_equal ProtobufEqualMain.cpp)
 add_test(NAME test_layerHelpers
   COMMAND
-  ${PROJ_ROOT}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
+  ${PADDLE_SOURCE_DIR}/python/paddle/trainer_config_helpers/tests/configs/run_tests.sh ${PYTHON_EXECUTABLE}
   ${CMAKE_CURRENT_BINARY_DIR}/protobuf_equal
 )
diff --git a/python/paddle/v2/framework/.gitignore b/python/paddle/v2/framework/.gitignore
new file mode 100644
index 0000000000000000000000000000000000000000..2ff540d5764b76cf7bac64fc2bb9df6e9c1b398a
--- /dev/null
+++ b/python/paddle/v2/framework/.gitignore
@@ -0,0 +1 @@
+proto
diff --git a/python/paddle/v2/framework/tests/CMakeLists.txt b/python/paddle/v2/framework/tests/CMakeLists.txt
index f6850e06512d196d51e454bc22cfa3cda8bba84a..b76c05dc8142af40d9872b42cc51b3c317e095be 100644
--- a/python/paddle/v2/framework/tests/CMakeLists.txt
+++ b/python/paddle/v2/framework/tests/CMakeLists.txt
@@ -1,6 +1,5 @@
 py_test(test_net SRCS test_net.py)
 
-py_test(test_fc_op SRCS test_fc_op.py)
 py_test(test_scope SRCS test_scope.py)
 
 py_test(test_tensor SRCS test_tensor.py)
@@ -23,6 +22,5 @@ py_test(test_rowwise_add_op SRCS test_rowwise_add_op.py)
 py_test(test_default_scope_funcs SRCS test_default_scope_funcs.py)
 
 py_test(test_operator SRCS test_operator.py)
-
-py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
+# py_test(test_gaussian_random_op SRCS test_gaussian_random_op.py)
 py_test(test_uniform_random_op SRCS test_uniform_random_op.py)
diff --git a/python/paddle/v2/framework/tests/test_fc_op.py b/python/paddle/v2/framework/tests/test_fc_op.py
deleted file mode 100644
index e24435839d305bb1a4ab7daa3e9684a421468fd8..0000000000000000000000000000000000000000
--- a/python/paddle/v2/framework/tests/test_fc_op.py
+++ /dev/null
@@ -1,45 +0,0 @@
-import paddle.v2.framework.core as core
-import unittest
-import numpy
-from paddle.v2.framework.op import Operator
-
-
-class TestFc(unittest.TestCase):
-    def test_fc(self):
-        scope = core.Scope()
-        place = core.CPUPlace()
-        x = scope.new_var("X")
-
-        x_tensor = x.get_tensor()
-        x_tensor.set_dims([1000, 784])
-        x_tensor.alloc_float(place)
-
-        w = scope.new_var("W")
-        w_tensor = w.get_tensor()
-        w_tensor.set_dims([784, 100])
-        w_tensor.alloc_float(place)
-
-        w_tensor.set(numpy.random.random((784, 100)).astype("float32"), place)
-
-        # Set a real numpy array here.
-        # x_tensor.set(numpy.array([]))
-
-        op = Operator("fc", X="X", Y="Y", W="W")
-
-        for out in op.outputs():
-            if scope.find_var(out) is None:
-                scope.new_var(out).get_tensor()
-
-        tensor = scope.find_var("Y").get_tensor()
-        op.infer_shape(scope)
-        self.assertEqual([1000, 100], tensor.shape())
-
-        ctx = core.DeviceContext.create(place)
-
-        op.run(scope, ctx)
-
-        # After complete all ops, check Y is expect or not.
-
-
-if __name__ == '__main__':
-    unittest.main()
diff --git a/python/paddle/v2/framework/tests/test_net.py b/python/paddle/v2/framework/tests/test_net.py
index b30896553dea4a4929038d524b23c6090bbed380..cc7f09e7155f5b1afa47fc4133b71ae3676b7436 100644
--- a/python/paddle/v2/framework/tests/test_net.py
+++ b/python/paddle/v2/framework/tests/test_net.py
@@ -3,6 +3,15 @@ from paddle.v2.framework.op import Operator
 import unittest
 
 
+def fc(X, W, Y):
+    ret_v = core.Net.create()
+
+    ret_v.add_op(Operator("mul", X="X", Y="W", Out="pre_activation"))
+    ret_v.add_op(Operator("sigmoid", X="pre_activation", Y=Y))
+    ret_v.complete_add_op(True)
+    return ret_v
+
+
 class TestNet(unittest.TestCase):
     def test_net_all(self):
         net = core.Net.create()
@@ -10,18 +19,18 @@ class TestNet(unittest.TestCase):
         net.add_op(op1)
 
         net2 = core.Net.create()
-        net2.add_op(Operator("fc", X="X", W="w", Y="fc.out"))
+        net2.add_op(fc(X="X", W="w", Y="fc.out"))
         net2.complete_add_op(True)
         net.add_op(net2)
         net.complete_add_op(True)
 
         expected = '''
-Op(plain_net), inputs:(@EMPTY@, X, Y, w), outputs:(@TEMP@fc@0, Out, fc.out).
+Op(plain_net), inputs:(W, X, Y), outputs:(Out, fc.out, pre_activation).
     Op(add_two), inputs:(X, Y), outputs:(Out).
-    Op(plain_net), inputs:(@EMPTY@, X, w), outputs:(@TEMP@fc@0, fc.out).
-        Op(fc), inputs:(X, w, @EMPTY@), outputs:(fc.out, @TEMP@fc@0).
-            Op(mul), inputs:(X, w), outputs:(@TEMP@fc@0).
-            Op(sigmoid), inputs:(@TEMP@fc@0), outputs:(fc.out).
+    Op(plain_net), inputs:(W, X), outputs:(fc.out, pre_activation).
+        Op(plain_net), inputs:(W, X), outputs:(fc.out, pre_activation).
+            Op(mul), inputs:(X, W), outputs:(pre_activation).
+            Op(sigmoid), inputs:(pre_activation), outputs:(fc.out).
 '''
         self.assertEqual(expected, "\n" + str(net))
 
diff --git a/python/paddle/v2/optimizer.py b/python/paddle/v2/optimizer.py
index ba581980334fec6226a537af2cf53b3465d32c1e..29f0945eb4c88eab8fa9ee83f455190dfd473aa4 100644
--- a/python/paddle/v2/optimizer.py
+++ b/python/paddle/v2/optimizer.py
@@ -1,13 +1,26 @@
-import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
-import paddle.trainer_config_helpers.optimizers as v1_optimizers
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
 """
 Optimizers(update equation) for SGD method.
 
-TODO(zhihong) : create new optimizer with proto config, add new optimizer here
-
 TODO(yuyang18): Complete comments.
 """
 
+import paddle.trainer_config_helpers.config_parser_utils as config_parser_utils
+import paddle.trainer_config_helpers.optimizers as v1_optimizers
+from paddle.proto.OptimizerConfig_pb2 import OptimizerConfig
+
 __all__ = [
     'Momentum', 'Adam', 'Adamax', 'AdaGrad', 'DecayedAdaGrad', 'AdaDelta',
     'RMSProp', 'ModelAverage', 'L2Regularization'
@@ -70,7 +83,8 @@ class Optimizer(object):
                         gradient_machine.prefetch(in_args)
                         parameter_updater.getParametersRemote()
 
-        :param pserver_spec: pserver location, eg: localhost:3000
+        :param pserver_spec: pserver location, eg: localhost:3000, if use etcd,
+        pserver_spec should be the etcd endpoints, eg: http://localhost:2379
         :return: parameter_updater
         """
         if is_local:
diff --git a/python/paddle/v2/parameters.py b/python/paddle/v2/parameters.py
index a9cba8ca0b1efd4149463f6c7bf2dcdfbea350c9..b8af5abaeada49a3e8951c21c9065aaf4d1ab851 100644
--- a/python/paddle/v2/parameters.py
+++ b/python/paddle/v2/parameters.py
@@ -1,3 +1,17 @@
+# Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
 import numpy as np
 from paddle.proto.ParameterConfig_pb2 import ParameterConfig
 import paddle.trainer.config_parser as cp
@@ -113,16 +127,7 @@ class Parameters(object):
         """
         return iter(self.__param_conf__)
 
-    def __getitem__(self, key):
-        """
-        Get parameter by parameter name. It uses Python dict syntax.
-
-        :note: It will always copy the parameter from C++ side.
-        :param key: Parameter name
-        :type key: basestring
-        :return: parameter value
-        :rtype: np.ndarray
-        """
+    def __getter_inner(self, key, param_type):
         import py_paddle.swig_paddle as api
         shape = self.get_shape(key)
 
@@ -138,7 +143,7 @@ class Parameters(object):
                     each_gradient_machine, key)
                 # for simplify implementation now, we always copy from C++
                 assert isinstance(param, api.Parameter)
-                val = param.getBuf(api.PARAMETER_VALUE)
+                val = param.getBuf(param_type)
                 assert isinstance(val, api.Vector)
                 val = val.copyToNumpyArray()
                 return val
@@ -146,6 +151,19 @@ class Parameters(object):
 
             raise RuntimeError("Unexpected branch")
 
+    def __getitem__(self, key):
+        """
+        Get the parameter value by parameter name, using Python dict syntax.
+
+        :note: It will always copy the parameter value from C++ side.
+        :param key: Parameter name
+        :type key: basestring
+        :return: parameter value
+        :rtype: np.ndarray
+        """
+        import py_paddle.swig_paddle as api
+        return self.__getter_inner(key, api.PARAMETER_VALUE)
+
     def get_shape(self, key):
         """
         get shape of the parameter.
@@ -202,6 +220,19 @@ class Parameters(object):
         """
         return self.__getitem__(key=parameter_name)
 
+    def get_grad(self, key):
+        """
+        Get gradient by parameter name.
+
+        :note: It will always copy the gradient from C++ side.
+        :param key: parameter name
+        :type key: basestring
+        :return: The gradient, copied into a numpy array.
+        :rtype: np.ndarray
+        """
+        import py_paddle.swig_paddle as api
+        return self.__getter_inner(key, api.PARAMETER_GRADIENT)
+
     def set(self, parameter_name, value):
         """
         Set parameter by parameter name & matrix.
@@ -250,7 +281,13 @@ class Parameters(object):
         size = reduce(lambda a, b: a * b, param.shape)
         f.write(struct.pack("IIQ", 0, 4, size))
         param = param.astype(np.float32)
-        f.write(param.tostring())
+        s = param.tostring()
+        wrote_size = 0
+        buf = buffer(s, wrote_size, 65535)
+        while buf:  # f.write crashes with a big data blob, so write in chunks.
+            f.write(buf)
+            wrote_size += 65535
+            buf = buffer(s, wrote_size, 65535)
 
     def deserialize(self, name, f):
         """
diff --git a/python/paddle/v2/trainer.py b/python/paddle/v2/trainer.py
index 76bae0bb12b6c33f88530386f9cc19ae9b59f457..9c4dd5f25083d210bbd218a85d8dbb3cce2c3d0e 100644
--- a/python/paddle/v2/trainer.py
+++ b/python/paddle/v2/trainer.py
@@ -161,14 +161,14 @@ class SGD(object):
                     self.__parameter_updater__.update(each_param)
                 cost_sum = out_args.sum()
                 cost = cost_sum / len(data_batch)
-                self.__parameter_updater__.finishBatch(cost)
-                batch_evaluator.finish()
                 event_handler(
                     v2_event.EndIteration(
                         pass_id=pass_id,
                         batch_id=batch_id,
                         cost=cost,
                         evaluator=batch_evaluator))
+                self.__parameter_updater__.finishBatch(cost)
+                batch_evaluator.finish()
 
             self.__parameter_updater__.finishPass()
             pass_evaluator.finish()
diff --git a/python/setup.py.in b/python/setup.py.in
index 38f0a503bee3eb29ae3c893c96d6e333be54b96e..4110c983180937e86716324f8c92c37b7d2cc3ea 100644
--- a/python/setup.py.in
+++ b/python/setup.py.in
@@ -45,14 +45,14 @@ setup(name='paddlepaddle',
           '': '${CMAKE_CURRENT_SOURCE_DIR}',
           # The paddle.v2.framework.proto will be generated while compiling.
           # So that package points to other directory.
-          'paddle.v2.framework.proto': '${PROJ_BINARY_ROOT}/paddle/framework',
-          'py_paddle': '${PROJ_ROOT}/paddle/py_paddle'
+          'paddle.v2.framework.proto': '${PADDLE_BINARY_DIR}/paddle/framework',
+          'py_paddle': '${PADDLE_SOURCE_DIR}/paddle/py_paddle'
       },
-      scripts=['${PROJ_BINARY_ROOT}/paddle/scripts/paddle'],
+      scripts=['${PADDLE_BINARY_DIR}/paddle/scripts/paddle'],
       distclass=BinaryDistribution,
       data_files=[('/usr/local/opt/paddle/bin',
-                       ['${PROJ_BINARY_ROOT}/paddle/scripts/paddle_usage',
-                        '${PROJ_BINARY_ROOT}/paddle/trainer/paddle_trainer',
-                        '${PROJ_BINARY_ROOT}/paddle/trainer/paddle_merge_model',
-                        '${PROJ_BINARY_ROOT}/paddle/pserver/paddle_pserver_main'])]
+                       ['${PADDLE_BINARY_DIR}/paddle/scripts/paddle_usage',
+                        '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_trainer',
+                        '${PADDLE_BINARY_DIR}/paddle/trainer/paddle_merge_model',
+                        '${PADDLE_BINARY_DIR}/paddle/pserver/paddle_pserver_main'])]
 )