Commit 27e4ce72 authored by yuyang18

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into feature/support_op_role

@@ -100,6 +100,9 @@ endif()
 set(THIRD_PARTY_PATH "${CMAKE_BINARY_DIR}/third_party" CACHE STRING
   "A path setting third party libraries download & build directories.")
+set(FLUID_INSTALL_DIR "${CMAKE_BINARY_DIR}/fluid_install_dir" CACHE STRING
+  "A path setting fluid shared and static libraries")
 if (WITH_C_API AND WITH_PYTHON)
   message(WARNING "It is suggest not embedded a python interpreter in Paddle "
     "when using C-API. It will give an unpredictable behavior when using a "
@@ -117,13 +120,14 @@ else()
 endif()
 set(WITH_MKLML ${WITH_MKL})
-if (WITH_MKL AND AVX2_FOUND)
+if (NOT DEFINED WITH_MKLDNN)
+  if (WITH_MKL AND AVX2_FOUND)
   set(WITH_MKLDNN ON)
 else()
   message(STATUS "Do not have AVX2 intrinsics and disabled MKL-DNN")
   set(WITH_MKLDNN OFF)
+  endif()
 endif()
 ########################################################################################
 include(external/mklml)  # download mklml package
...
@@ -26,7 +26,7 @@ set(BOOST_VER "1.41.0")
 if((NOT DEFINED BOOST_TAR) OR (NOT DEFINED BOOST_URL))
   message(STATUS "use pre defined download url")
   set(BOOST_TAR "boost_1_41_0" CACHE STRING "" FORCE)
-  set(BOOST_URL "http://paddlepaddledeps.cdn.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
+  set(BOOST_URL "http://paddlepaddledeps.bj.bcebos.com/${BOOST_TAR}.tar.gz" CACHE STRING "" FORCE)
 endif()
 MESSAGE(STATUS "BOOST_TAR: ${BOOST_TAR}, BOOST_URL: ${BOOST_URL}")
 set(BOOST_SOURCES_DIR ${THIRD_PARTY_PATH}/boost)
...
@@ -30,7 +30,7 @@ SET(MKLML_PROJECT "extern_mklml")
 IF((NOT DEFINED MKLML_VER) OR (NOT DEFINED MKLML_URL))
   MESSAGE(STATUS "use pre defined download url")
   SET(MKLML_VER "mklml_lnx_2018.0.3.20180406" CACHE STRING "" FORCE)
-  SET(MKLML_URL "http://paddlepaddledeps.cdn.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
+  SET(MKLML_URL "http://paddlepaddledeps.bj.bcebos.com/${MKLML_VER}.tgz" CACHE STRING "" FORCE)
 ENDIF()
 MESSAGE(STATUS "MKLML_VER: ${MKLML_VER}, MKLML_URL: ${MKLML_URL}")
 SET(MKLML_SOURCE_DIR "${THIRD_PARTY_PATH}/mklml")
...
@@ -231,7 +231,7 @@ endfunction(cc_binary)
 function(cc_test TARGET_NAME)
   if(WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS ARGS)
     cmake_parse_arguments(cc_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -241,6 +241,9 @@ function(cc_test TARGET_NAME)
     add_test(NAME ${TARGET_NAME}
              COMMAND ${TARGET_NAME} ${cc_test_ARGS}
              WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if (${cc_test_SERIAL})
+      set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
   endif()
 endfunction(cc_test)
@@ -295,7 +298,7 @@ endfunction(nv_binary)
 function(nv_test TARGET_NAME)
   if (WITH_GPU AND WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs SRCS DEPS)
     cmake_parse_arguments(nv_test "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -303,6 +306,9 @@ function(nv_test TARGET_NAME)
     target_link_libraries(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_dependencies(${TARGET_NAME} ${nv_test_DEPS} paddle_gtest_main memory gtest gflags glog)
     add_test(${TARGET_NAME} ${TARGET_NAME})
+    if (nv_test_SERIAL)
+      set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
   endif()
 endfunction(nv_test)
...
@@ -52,32 +52,32 @@ function(copy TARGET)
 endfunction()
 # third party
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/eigen3")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/eigen3")
 copy(eigen3_lib
   SRCS ${EIGEN_INCLUDE_DIR}/Eigen/Core ${EIGEN_INCLUDE_DIR}/Eigen/src ${EIGEN_INCLUDE_DIR}/unsupported/Eigen
   DSTS ${dst_dir}/Eigen ${dst_dir}/Eigen ${dst_dir}/unsupported
 )
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/gflags")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/gflags")
 copy(gflags_lib
   SRCS ${GFLAGS_INCLUDE_DIR} ${GFLAGS_LIBRARIES}
   DSTS ${dst_dir} ${dst_dir}/lib
 )
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/glog")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/glog")
 copy(glog_lib
   SRCS ${GLOG_INCLUDE_DIR} ${GLOG_LIBRARIES}
   DSTS ${dst_dir} ${dst_dir}/lib
 )
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/boost/")
+set(dst_dir "${FLUID_INSTALL_DIR}/third_party/boost/")
 copy(boost_lib
   SRCS ${BOOST_INCLUDE_DIR}/boost
   DSTS ${dst_dir}
 )
 if(NOT PROTOBUF_FOUND)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/protobuf")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/protobuf")
   copy(protobuf_lib
     SRCS ${PROTOBUF_INCLUDE_DIR} ${PROTOBUF_LIBRARY}
     DSTS ${dst_dir} ${dst_dir}/lib
@@ -85,13 +85,13 @@ if(NOT PROTOBUF_FOUND)
 endif()
 if(NOT CBLAS_FOUND)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/openblas")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/openblas")
   copy(openblas_lib
     SRCS ${CBLAS_INSTALL_DIR}/lib ${CBLAS_INSTALL_DIR}/include
     DSTS ${dst_dir} ${dst_dir}
   )
 elseif (WITH_MKLML)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mklml")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mklml")
   copy(mklml_lib
     SRCS ${MKLML_LIB} ${MKLML_IOMP_LIB} ${MKLML_INC_DIR}
     DSTS ${dst_dir}/lib ${dst_dir}/lib ${dst_dir}
@@ -99,7 +99,7 @@ elseif (WITH_MKLML)
 endif()
 if(WITH_MKLDNN)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/mkldnn")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/mkldnn")
   copy(mkldnn_lib
     SRCS ${MKLDNN_INC_DIR} ${MKLDNN_SHARED_LIB}
     DSTS ${dst_dir} ${dst_dir}/lib
@@ -107,17 +107,17 @@ if(WITH_MKLDNN)
 endif()
 if(NOT MOBILE_INFERENCE AND NOT RPI)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappy")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappy")
   copy(snappy_lib
     SRCS ${SNAPPY_INCLUDE_DIR} ${SNAPPY_LIBRARIES}
     DSTS ${dst_dir} ${dst_dir}/lib)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/snappystream")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/snappystream")
   copy(snappystream_lib
     SRCS ${SNAPPYSTREAM_INCLUDE_DIR} ${SNAPPYSTREAM_LIBRARIES}
     DSTS ${dst_dir} ${dst_dir}/lib)
-  set(dst_dir "${CMAKE_INSTALL_PREFIX}/third_party/install/zlib")
+  set(dst_dir "${FLUID_INSTALL_DIR}/third_party/install/zlib")
   copy(zlib_lib
     SRCS ${ZLIB_INCLUDE_DIR} ${ZLIB_LIBRARIES}
     DSTS ${dst_dir} ${dst_dir}/lib)
@@ -125,7 +125,7 @@ endif()
 # paddle fluid module
 set(src_dir "${PADDLE_SOURCE_DIR}/paddle/fluid")
-set(dst_dir "${CMAKE_INSTALL_PREFIX}/paddle/fluid")
+set(dst_dir "${FLUID_INSTALL_DIR}/paddle/fluid")
 set(module "framework")
 copy(framework_lib DEPS framework_py_proto
   SRCS ${src_dir}/${module}/*.h ${src_dir}/${module}/details/*.h ${PADDLE_BINARY_DIR}/paddle/fluid/framework/framework.pb.h
@@ -165,7 +165,7 @@ copy(pybind_lib
 # CMakeCache Info
 copy(cmake_cache
   SRCS ${CMAKE_CURRENT_BINARY_DIR}/CMakeCache.txt
-  DSTS ${CMAKE_INSTALL_PREFIX})
+  DSTS ${FLUID_INSTALL_DIR})
 add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})
@@ -173,7 +173,7 @@ add_custom_target(inference_lib_dist DEPENDS ${inference_lib_dist_dep})
 execute_process(
   COMMAND ${GIT_EXECUTABLE} log --pretty=format:%H -1
   OUTPUT_VARIABLE PADDLE_GIT_COMMIT)
-set(version_file ${CMAKE_INSTALL_PREFIX}/version.txt)
+set(version_file ${FLUID_INSTALL_DIR}/version.txt)
 file(WRITE ${version_file}
   "GIT COMMIT ID: ${PADDLE_GIT_COMMIT}\n"
   "WITH_MKL: ${WITH_MKL}\n"
...
@@ -35,7 +35,7 @@ PaddlePaddle needs to be built inside a Docker environment, which saves you from separately installing ...
 # 2. Optional step: build the Docker image used to compile PaddlePaddle from source
 docker build -t paddle:dev .
 # 3. Run the following command to build a CPU-only binary
-docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
 # 4. Or use the image built in the optional step above (step 2 must be run first)
 docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
...
@@ -34,7 +34,7 @@ Or you can build your own image from source as the optional step below:
 # 2. Optional: build development docker image from source
 docker build -t paddle:dev .
 # 3. Run the following command to build a CPU-Only binaries
-docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/docker/build.sh
+docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddlepaddle/paddle_manylinux_devel:cuda8.0_cudnn5 bash -x /paddle/paddle/scripts/paddle_build.sh build
 # 4. Or, use your built Docker image to build PaddlePaddle (must run step 2)
 docker run -it -v $PWD:/paddle -e "WITH_GPU=OFF" -e "WITH_TESTING=OFF" paddle:dev
...
 nv_test(test_op_converter SRCS test_op_converter.cc mul_op.cc conv2d_op.cc DEPS ${FLUID_CORE_MODULES})
 nv_test(test_trt_activation_op SRCS test_activation_op.cc activation_op.cc io_converter.cc
-        DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine)
+        DEPS ${FLUID_CORE_MODULES} activation_op tensorrt_engine
+        SERIAL)
 nv_test(test_io_converter SRCS test_io_converter.cc io_converter.cc DEPS dynload_cuda dynamic_loader lod_tensor)
@@ -21,6 +21,7 @@ DEFINE_string(fp16_dirname, "", "Directory of the float16 inference model.");
 DEFINE_int32(batch_size, 1, "Batch size of input data");
 DEFINE_int32(repeat, 1, "Running the inference program repeat times");
 DEFINE_bool(skip_cpu, false, "Skip the cpu test");
+DEFINE_bool(use_mkldnn, false, "Use MKLDNN to run inference");
 TEST(inference, image_classification) {
   if (FLAGS_dirname.empty() || FLAGS_batch_size < 1 || FLAGS_repeat < 1) {
@@ -58,8 +59,10 @@ TEST(inference, image_classification) {
     // Run inference on CPU
     LOG(INFO) << "--- CPU Runs: ---";
     LOG(INFO) << "Batch size is " << FLAGS_batch_size;
+    LOG(INFO) << "FLAGS_use_mkldnn: " << FLAGS_use_mkldnn;
     TestInference<paddle::platform::CPUPlace, false, true>(
-        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined);
+        dirname, cpu_feeds, cpu_fetchs1, FLAGS_repeat, is_combined,
+        FLAGS_use_mkldnn);
     LOG(INFO) << output1.dims();
   }
...
@@ -133,11 +133,24 @@ std::vector<std::vector<int64_t>> GetFeedTargetShapes(
   return feed_target_shapes;
 }
+void EnableMKLDNN(
+    const std::unique_ptr<paddle::framework::ProgramDesc>& program) {
+  for (size_t bid = 0; bid < program->Size(); ++bid) {
+    auto* block = program->MutableBlock(bid);
+    for (auto* op : block->AllOps()) {
+      if (op->HasAttr("use_mkldnn")) {
+        op->SetAttr("use_mkldnn", true);
+      }
+    }
+  }
+}
+
 template <typename Place, bool CreateVars = true, bool PrepareContext = false>
 void TestInference(const std::string& dirname,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_feeds,
                    const std::vector<paddle::framework::LoDTensor*>& cpu_fetchs,
-                   const int repeat = 1, const bool is_combined = false) {
+                   const int repeat = 1, const bool is_combined = false,
+                   const bool use_mkldnn = false) {
   // 1. Define place, executor, scope
   auto place = Place();
   auto executor = paddle::framework::Executor(place);
@@ -169,6 +182,9 @@ void TestInference(const std::string& dirname,
         "init_program",
         paddle::platform::DeviceContextPool::Instance().Get(place));
     inference_program = InitProgram(&executor, scope, dirname, is_combined);
+    if (use_mkldnn) {
+      EnableMKLDNN(inference_program);
+    }
   }
   // Disable the profiler and print the timing information
   paddle::platform::DisableProfiler(paddle::platform::EventSortingKey::kDefault,
...
@@ -202,10 +202,12 @@ if(WITH_DISTRIBUTE)
     op_library(send_barrier_op DEPS ${DISTRIBUTE_DEPS})
     set_source_files_properties(send_barrier_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     set_source_files_properties(send_recv_op_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op listen_and_serv_op sum_op executor)
+    cc_test(test_send_recv SRCS send_recv_op_test.cc DEPS prefetch_op send_op
+            listen_and_serv_op sum_op executor SERIAL)
     if(WITH_GPU)
         set_source_files_properties(test_send_nccl_id.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
-        cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op listen_and_serv_op executor)
+        cc_test(test_send_nccl_id SRCS test_send_nccl_id.cc DEPS send_op
+                listen_and_serv_op executor SERIAL)
         op_library(gen_nccl_id_op DEPS nccl_common sendrecvop_grpc)
         set_source_files_properties(gen_nccl_id_op.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
     else()
...
@@ -4,6 +4,8 @@ if(WITH_DISTRIBUTE)
   set(DISTRIBUTE_COMPILE_FLAGS "-Wno-non-virtual-dtor -Wno-error=non-virtual-dtor -Wno-error=delete-non-virtual-dtor")
   set_source_files_properties(serde_test.cc grpc_server_test.cc PROPERTIES COMPILE_FLAGS ${DISTRIBUTE_COMPILE_FLAGS})
   cc_test(serde_test SRCS serde_test.cc variable_response.cc DEPS grpc++_unsecure grpc_unsecure gpr
-          cares zlib protobuf sendrecvop_grpc)
+          cares zlib protobuf sendrecvop_grpc SERIAL)
-  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor proto_desc lookup_table_op)
+  cc_test(grpc_server_test SRCS grpc_server_test.cc DEPS sendrecvop_grpc
+          grpc++_unsecure grpc_unsecure gpr cares zlib protobuf executor
+          proto_desc lookup_table_op SERIAL)
 endif()
@@ -186,8 +186,7 @@ class WarpCTCKernel : public framework::OpKernel<T> {
     // warpctc accesses labels in CPU memory
     Tensor warpctc_label;
-    TensorCopy(*label, platform::CPUPlace(), ctx.device_context(),
-               &warpctc_label);
+    TensorCopySync(*label, platform::CPUPlace(), &warpctc_label);
     const int* warpctc_label_data = warpctc_label.data<int>();
     // warpctc stores loss in CPU memory
     Tensor warpctc_loss;
...
@@ -7,7 +7,7 @@
 # WITH_MKLDNN=ON|OFF
 PADDLE_LIB=/paddle/lib/dir
-cmake .. -DCMAKE_INSTALL_PREFIX=$PADDLE_LIB \
+cmake .. -DFLUID_INSTALL_DIR=$PADDLE_LIB \
       -DCMAKE_BUILD_TYPE=Release \
       -DWITH_FLUID_ONLY=ON \
      -DWITH_GPU=OFF \
@@ -42,7 +42,7 @@ cd build
 # WITH_MKLDNN=ON|OFF
 PADDLE_LIB=/paddle/lib/dir
-# PADDLE_LIB is the same with CMAKE_INSTALL_PREFIX when building the lib
+# PADDLE_LIB is the same with FLUID_INSTALL_DIR when building the lib
 cmake .. -DPADDLE_LIB=$PADDLE_LIB \
       -DWITH_MKLDNN=OFF \
       -DWITH_MKL=OFF
...
@@ -493,6 +493,7 @@ function gen_fluid_inference_lib() {
     ========================================
 EOF
         make -j `nproc` inference_lib_dist
+        tar -cf ${PADDLE_ROOT}/build/fluid.tgz ${PADDLE_ROOT}/build/fluid_install_dir
     fi
 }
...
@@ -1098,7 +1098,7 @@ class ConditionalBlock(object):
         input_set = set([ipt.name for ipt in self.inputs])
         param_list = [
-            parent_block.var(each_name) for each_name in params
+            parent_block.var_recursive(each_name) for each_name in params
             if each_name not in input_set
         ]
...
@@ -699,8 +699,8 @@ def dynamic_gru(input,
 def gru_unit(input,
              hidden,
              size,
-             weight=None,
-             bias=None,
+             param_attr=None,
+             bias_attr=None,
              activation='tanh',
              gate_activation='sigmoid'):
     """
@@ -731,8 +731,8 @@ def gru_unit(input,
         input (Variable): The fc transformed input value of current step.
         hidden (Variable): The hidden value of lstm unit from previous step.
         size (integer): The input dimension value.
-        weight (ParamAttr): The weight parameters for gru unit. Default: None
-        bias (ParamAttr): The bias parameters for gru unit. Default: None
+        param_attr (ParamAttr): The weight parameters for gru unit. Default: None
+        bias_attr (ParamAttr): The bias parameters for gru unit. Default: None
         activation (string): The activation type for cell (actNode).
             Default: 'tanh'
         gate_activation (string): The activation type for gates (actGate).
@@ -764,34 +764,31 @@ def gru_unit(input,
     size = size / 3
     # create weight
-    if weight is None:
-        weight = helper.create_parameter(
-            attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+    weight = helper.create_parameter(
+        attr=helper.param_attr, shape=[size, 3 * size], dtype=dtype)
+    gate = helper.create_tmp_variable(dtype)
+    reset_hidden_pre = helper.create_tmp_variable(dtype)
+    updated_hidden = helper.create_tmp_variable(dtype)
+    inputs = {'Input': input, 'HiddenPrev': hidden, 'Weight': weight}
     # create bias
-    if bias is None:
+    if helper.bias_attr:
         bias_size = [1, 3 * size]
         bias = helper.create_parameter(
             attr=helper.bias_attr, shape=bias_size, dtype=dtype, is_bias=True)
+        inputs['Bias'] = bias
-    gate = helper.create_tmp_variable(dtype)
-    reset_hidden_pre = helper.create_tmp_variable(dtype)
-    updated_hidden = helper.create_tmp_variable(dtype)
     helper.append_op(
         type='gru_unit',
-        inputs={'Input': input,
-                'HiddenPrev': hidden,
-                'Weight': weight},
+        inputs=inputs,
         outputs={
             'Gate': gate,
             'ResetHiddenPrev': reset_hidden_pre,
             'Hidden': updated_hidden,
         },
         attrs={
-            'activation': 0,
-            'gate_activation': 1,
+            'activation': 2,  # tanh
+            'gate_activation': 1,  # sigmoid
         })
     return updated_hidden, reset_hidden_pre, gate
...
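For context on the gru_unit signature change above (weight/bias replaced by param_attr/bias_attr), here is a minimal usage sketch of the new API. The tensor names, shapes, and the use of fluid.ParamAttr below are illustrative assumptions, not part of this commit:

import paddle.fluid as fluid

# Hypothetical step input that has already been fc-transformed to width
# 3 * hidden_dim, plus the hidden state from the previous step (hidden_dim = 32).
x_fc = fluid.layers.data(name='x_fc', shape=[96], dtype='float32')
pre_hidden = fluid.layers.data(name='pre_hidden', shape=[32], dtype='float32')

# After this change, parameters are configured via param_attr/bias_attr instead
# of passing parameter Variables directly through weight=/bias=.
updated_hidden, reset_hidden_pre, gate = fluid.layers.gru_unit(
    input=x_fc,
    hidden=pre_hidden,
    size=96,  # input dimension, i.e. 3 * hidden_dim (the layer divides it by 3)
    param_attr=fluid.ParamAttr(name='gru_unit_w'),
    bias_attr=fluid.ParamAttr(name='gru_unit_b'),
    activation='tanh',
    gate_activation='sigmoid')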
@@ -26,7 +26,7 @@ list(REMOVE_ITEM TEST_OPS decorators) # decorators is a helper python file, not
 function(py_test_modules TARGET_NAME)
   if(WITH_TESTING)
-    set(options "")
+    set(options SERIAL)
     set(oneValueArgs "")
     set(multiValueArgs MODULES DEPS ENVS)
     cmake_parse_arguments(py_test_modules "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
@@ -34,6 +34,9 @@ function(py_test_modules TARGET_NAME)
       COMMAND env PYTHONPATH=${PADDLE_BINARY_DIR}/python ${py_test_modules_ENVS}
       ${PYTHON_EXECUTABLE} ${PADDLE_SOURCE_DIR}/tools/test_runner.py ${py_test_modules_MODULES}
       WORKING_DIRECTORY ${CMAKE_CURRENT_BINARY_DIR})
+    if (py_test_modules_SERIAL)
+      set_property(TEST ${TARGET_NAME} PROPERTY SERIAL 1)
+    endif()
   endif()
 endfunction()
@@ -81,7 +84,7 @@ endif(WITH_FAST_BUNDLE_TEST)
 py_test_modules(test_sequence_expand MODULES test_sequence_expand)
 # tests with high overhead
 py_test_modules(test_parallel_executor MODULES test_parallel_executor)
-py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR})
+py_test_modules(test_warpctc_op MODULES test_warpctc_op ENVS FLAGS_warpctc_dir=${WARPCTC_LIB_DIR} SERIAL)
 py_test_modules(test_train_dyn_rnn MODULES test_dyn_rnn)
 py_test_modules(test_mul_op MODULES test_mul_op)
 py_test_modules(test_network_with_dtype MODULES test_network_with_dtype)
@@ -106,4 +109,4 @@ py_test_modules(test_registry MODULES test_registry)
 py_test_modules(test_fetch_var MODULES test_fetch_var)
 py_test_modules(test_dynrnn_static_input MODULES test_dynrnn_static_input)
 py_test_modules(test_parallel_op MODULES test_parallel_op)
-py_test_modules(test_dist_train MODULES test_dist_train)
+py_test_modules(test_dist_train MODULES test_dist_train SERIAL)