diff --git a/paddle/fluid/framework/details/multi_devices_graph_pass.cc b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
index 67d29a42d750441cfe422db0d7b75c0064f3c9ac..3dc177a8cb7a1e994aca5304240f1eb61ba23f02 100644
--- a/paddle/fluid/framework/details/multi_devices_graph_pass.cc
+++ b/paddle/fluid/framework/details/multi_devices_graph_pass.cc
@@ -648,6 +648,12 @@ int MultiDevSSAGraphBuilder::GetVarDeviceID(
     const ir::Graph &graph, const std::string &varname,
     const std::unordered_map<std::string, int> &sharded_var_device) const {
   auto got = sharded_var_device.find(varname);
+  if (got == sharded_var_device.end()) {
+    auto pos = varname.find(framework::kNewGradSuffix);
+    if (pos != std::string::npos) {
+      got = sharded_var_device.find(varname.substr(0, pos));
+    }
+  }
   return got == sharded_var_device.end() ? -1 : got->second;
 }
 
diff --git a/paddle/fluid/operators/math/CMakeLists.txt b/paddle/fluid/operators/math/CMakeLists.txt
index 7f9a55acf84a7c2a85f57cfbd6390686150e7bdf..cc3cc9787a3926eea2f9a1620eead9823a7d77c5 100644
--- a/paddle/fluid/operators/math/CMakeLists.txt
+++ b/paddle/fluid/operators/math/CMakeLists.txt
@@ -57,6 +57,9 @@ math_library(sequence_padding)
 math_library(sequence_pooling DEPS math_function)
 math_library(sequence_scale)
 math_library(softmax DEPS math_function)
+if (NOT WIN32)
+    math_library(matrix_bit_code)
+endif (NOT WIN32)
 math_library(unpooling)
 math_library(vol2col)
 
@@ -73,11 +76,12 @@ endif()
 cc_test(concat_test SRCS concat_test.cc DEPS concat_and_split)
 cc_test(cpu_vec_test SRCS cpu_vec_test.cc DEPS blas cpu_info)
 if (NOT WIN32)
-    math_library(matrix_bit_code)
     set(JIT_KERNEL_SRCS jit_kernel.cc jit_kernel_blas.cc jit_kernel_exp.cc jit_kernel_rnn.cc jit_kernel_crf_decode.cc)
     set(JIT_KERNEL_DEPS cpu_info cblas gflags enforce)
     if(WITH_XBYAK)
         list(APPEND JIT_KERNEL_SRCS jit_gen.cc jit_code.cc)
         list(APPEND JIT_KERNEL_DEPS xbyak)
     endif()
+    cc_library(jit_kernel SRCS ${JIT_KERNEL_SRCS} DEPS ${JIT_KERNEL_DEPS})
+    cc_test(jit_kernel_test SRCS jit_kernel_test.cc DEPS jit_kernel)
 endif (NOT WIN32)
diff --git a/paddle/fluid/operators/math/selected_rows_functor.h b/paddle/fluid/operators/math/selected_rows_functor.h
index b24ffb57acd4f4ef4430a1bdff30d2f96e349803..6d146d39d6d07678e859b82b25ba60ed7661546d 100644
--- a/paddle/fluid/operators/math/selected_rows_functor.h
+++ b/paddle/fluid/operators/math/selected_rows_functor.h
@@ -64,6 +64,8 @@ struct SelectedRowsSumTo {
                   framework::SelectedRows* input2);
 };
 
+// FIXME: The result of SelectedRowsAddToTensor may be non-deterministic,
+// because it uses CudaAtomicAdd.
 // input2 = input1 + input2
 template <typename DeviceContext, typename T>
 struct SelectedRowsAddToTensor {
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index d7676f89ab5e781f910f98d03e72d5f7c1023a9a..2f5fef36c423736666695c07ebf69d812c3488ed 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -367,7 +367,12 @@ function run_test() {
     Running unit tests ...
     ========================================
 EOF
-        ctest --output-on-failure
+        if [ ${TESTING_DEBUG_MODE:-OFF} == "ON" ] ; then
+            ctest -V
+        else
+            ctest --output-on-failure
+        fi
+
         # make install should also be test when unittest
         make install -j `nproc`
         pip install ${INSTALL_PREFIX:-/paddle/build}/opt/paddle/share/wheels/*.whl