diff --git a/.gitignore b/.gitignore
index b92bb9cc129659fa502b4a9b55548992412e5429..90138f996cf9cacc3c1cbff0cf2600eefca3f305 100644
--- a/.gitignore
+++ b/.gitignore
@@ -25,5 +25,6 @@ third_party/
 bazel-*
 third_party/
 
+build_*
 # clion workspace.
 cmake-build-*
diff --git a/CMakeLists.txt b/CMakeLists.txt
index 24262c1821dab88bdf7202349fcde0e9dd6a0820..df00e977ebb547980e69ee421779c57717d771a9 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -72,7 +72,7 @@ option(WITH_INFERENCE    "Compile fluid inference library"              ON)
 option(WITH_INFERENCE_API_TEST   "Test fluid inference high-level api interface"  OFF)
 option(WITH_SYSTEM_BLAS   "Use system blas library"           OFF)
 option(PY_VERSION       "Compile PaddlePaddle with python3 support"     ${PY_VERSION})
-option(WITH_FAST_MATH   "Make use of fast math library"                 OFF)
+option(WITH_FAST_MATH   "Make use of fast math library, might affect the precision to some extent" ON)
 
 # PY_VERSION
 if(NOT PY_VERSION)
diff --git a/paddle/fluid/API.spec b/paddle/fluid/API.spec
index 6418da2a7e51c51575ff56aeabedff5452458fbc..c6dd919a93d119723b389d3a695f0af82d711a06 100644
--- a/paddle/fluid/API.spec
+++ b/paddle/fluid/API.spec
@@ -198,6 +198,9 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
 paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
 paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.has_inf ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.has_nan ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
+paddle.fluid.layers.isfinite ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None))
 paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
 paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
diff --git a/paddle/fluid/framework/data_type.h b/paddle/fluid/framework/data_type.h
index 8ad2fb5f3ffd9641932bbbb024a31e81d31dc9bb..d5be43b33edab7871e1bba930a4fc6cd1e293825 100644
--- a/paddle/fluid/framework/data_type.h
+++ b/paddle/fluid/framework/data_type.h
@@ -17,7 +17,6 @@ limitations under the License. */
 #include <typeindex>
 #include "paddle/fluid/framework/framework.pb.h"
 #include "paddle/fluid/platform/enforce.h"
-
 #include "paddle/fluid/platform/float16.h"
 
 namespace paddle {
diff --git a/paddle/fluid/framework/op_desc.cc b/paddle/fluid/framework/op_desc.cc
index 17f942571d0141537e992be9ab73847d2a794698..b29ac44699463312a1fdcea55e003daa75997302 100644
--- a/paddle/fluid/framework/op_desc.cc
+++ b/paddle/fluid/framework/op_desc.cc
@@ -50,6 +50,27 @@ class CompileTimeInferShapeContext : public InferShapeContext {
   const std::vector<std::string> &Outputs(
       const std::string &name) const override;
 
+  void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
+                size_t j = 0) override {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    const std::string &input_n = Inputs(in)[i];
+    const std::string &output_n = Outputs(out)[j];
+
+    PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
+                   in, i);
+    PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
+                   "The %s[%d] is @EMPTY@", out, j);
+
+    auto *in_var = block_.FindVarRecursive(input_n);
+    auto *out_var = block_.FindVarRecursive(output_n);
+
+    PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
+                   "The type of %s and %s is not the same.", input_n, output_n);
+
+    SetDim(output_n, GetDim(input_n));
+  }
+
   void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
                 size_t j = 0) const override {
     PADDLE_ENFORCE_LT(i, Inputs(in).size());
diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc
index 6666dd8e60a509c3f82f66f832ecce03811e9fb6..9f930065324f13f5aa79c214e820fb6fc2f3a166 100644
--- a/paddle/fluid/framework/operator.cc
+++ b/paddle/fluid/framework/operator.cc
@@ -542,6 +542,36 @@ class RuntimeInferShapeContext : public InferShapeContext {
     return op_.Outputs(name);
   }
 
+  void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
+                size_t j = 0) override {
+    PADDLE_ENFORCE_LT(i, Inputs(in).size());
+    PADDLE_ENFORCE_LT(j, Outputs(out).size());
+    const std::string& input_n = Inputs(in)[i];
+    const std::string& output_n = Outputs(out)[j];
+
+    Variable* in_var = scope_.FindVar(input_n);
+    Variable* out_var = scope_.FindVar(output_n);
+    PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
+                   "The type of %s and %s is not the same.", output_n,
+                   GetDim(input_n));
+
+    if (in_var->IsType<framework::SelectedRows>()) {
+      auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
+      auto out_sele_rows = out_var->GetMutable<framework::SelectedRows>();
+      out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
+      out_sele_rows->set_rows(in_sele_rows.rows());
+      out_sele_rows->set_height(in_sele_rows.height());
+    } else if (in_var->IsType<framework::LoDTensor>()) {
+      auto& in_lod_tensor = in_var->Get<framework::LoDTensor>();
+      auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
+      out_lod_tensor->Resize(in_lod_tensor.dims());
+    } else {
+      PADDLE_THROW(
+          "Currently, the input type of ShareDim only can be LoDTensor "
+          "or SelectedRows.");
+    }
+  }
+
   void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
                 size_t j = 0) const override {
     const std::vector<std::string>& inputs = Inputs(in);
diff --git a/paddle/fluid/framework/shape_inference.h b/paddle/fluid/framework/shape_inference.h
index 5f497cafa0f75f7c23d550ef767d55274de7c900..280bc19dce7b604d67aefdc572de96b479b8d2d7 100644
--- a/paddle/fluid/framework/shape_inference.h
+++ b/paddle/fluid/framework/shape_inference.h
@@ -56,6 +56,9 @@ class InferShapeContext {
   virtual const std::vector<std::string> &Outputs(
       const std::string &name) const = 0;
 
+  virtual void ShareDim(const std::string &in, const std::string &out,
+                        size_t i = 0, size_t j = 0) = 0;
+
   virtual void ShareLoD(const std::string &in, const std::string &out,
                         size_t i = 0, size_t j = 0) const = 0;
 
diff --git a/paddle/fluid/framework/tensor_util.cc b/paddle/fluid/framework/tensor_util.cc
index 05c4a17a01c6fabe48f3fe18544c13153feb0673..1d7a2eb5b38255531880fe3d2e5321024caf0c6b 100644
--- a/paddle/fluid/framework/tensor_util.cc
+++ b/paddle/fluid/framework/tensor_util.cc
@@ -165,10 +165,12 @@ inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
 }
 
 template <typename Predicate>
-struct AnyVisitor : public boost::static_visitor<bool> {
+class AnyVisitor : public boost::static_visitor<bool> {
+ private:
   const framework::Tensor& tensor_;
   Predicate predicate_;
 
+ public:
   AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
       : tensor_(tensor), predicate_(std::move(predicate)) {}
 
@@ -206,6 +208,27 @@ struct AnyVisitor : public boost::static_visitor<bool> {
   }
 };
 
+template <typename Predicate>
+class AnyOutVisitor : public boost::static_visitor<> {
+ private:
+  const framework::Tensor& tensor_;
+  mutable framework::Tensor* out_;
+  Predicate predicate_;
+
+ public:
+  AnyOutVisitor(const framework::Tensor& tensor, Predicate predicate,
+                framework::Tensor* out)
+      : tensor_(tensor), out_(out), predicate_(std::move(predicate)) {}
+
+  template <typename Place>
+  void operator()(const Place& place) const {
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
+    out_->Resize({1});
+    out_->mutable_data<bool>(place);
+    AnyImpl(predicate_, tensor_, *ctx, out_);
+  }
+};
+
 template <typename Predicate>
 inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
   AnyVisitor<Predicate> visitor(tensor, predicate);
@@ -213,6 +236,14 @@ inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
   return platform::VisitPlace(place, visitor);
 }
 
+template <typename Predicate>
+inline void Any(const framework::Tensor& tensor, Predicate predicate,
+                framework::Tensor* out) {
+  AnyOutVisitor<Predicate> visitor(tensor, predicate, out);
+  auto place = tensor.place();
+  platform::VisitPlace(place, visitor);
+}
+
 struct ContainsNANPredicate {
   template <typename T>
   auto operator()(const T& eigen_vec) const
@@ -227,6 +258,12 @@ bool TensorContainsNAN(const framework::Tensor& tensor) {
   return Any(tensor, predicate);
 }
 
+void TensorContainsNAN(const framework::Tensor& tensor,
+                       framework::Tensor* out) {
+  ContainsNANPredicate predicate;
+  Any(tensor, predicate, out);
+}
+
 struct ContainsInfPredicate {
   template <typename T>
   auto operator()(const T& eigen_vec) const
@@ -241,6 +278,71 @@ bool TensorContainsInf(const framework::Tensor& tensor) {
   return Any(tensor, predicate);
 }
 
+void TensorContainsInf(const framework::Tensor& tensor,
+                       framework::Tensor* out) {
+  ContainsInfPredicate predicate;
+  Any(tensor, predicate, out);
+}
+
+// NOTE(dzhwinter):
+// Isfinite need a AllVisitor to loop through all the elements.
+// We choose two cuda call instead of one allvisitor. The AllVisitor
+// should be implemented if the performance hurts.
+bool TensorIsfinite(const framework::Tensor& tensor) {
+  ContainsInfPredicate pred_inf;
+  ContainsNANPredicate pred_nan;
+  return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
+}
+
+#ifdef PADDLE_WITH_CUDA
+template <typename T>
+static inline void __global__ BothFalse(const T* cmp, T* out) {
+  out[0] = (!cmp[0]) && (!out[0]);
+}
+#endif
+
+struct BothFalseVisitor : public boost::static_visitor<> {
+  const framework::Tensor& in_;
+  mutable framework::Tensor* out_;
+  BothFalseVisitor(const framework::Tensor& in, framework::Tensor* out)
+      : in_(in), out_(out) {}
+
+  template <typename Place>
+  void operator()(const Place& place) const {
+    VisitorImpl(place);
+  }
+
+  void VisitorImpl(const platform::CUDAPlace& gpu) const {
+#ifdef PADDLE_WITH_CUDA
+    auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
+    BothFalse<bool><<<1, 1, 0, ctx->stream()>>>(in_.data<bool>(),
+                                                out_->mutable_data<bool>(gpu));
+#endif
+  }
+
+  void VisitorImpl(const platform::CPUPlace& cpu) const {
+    bool lhs = !in_.data<bool>()[0];
+    bool rhs = !out_->mutable_data<bool>(cpu)[0];
+    out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
+  }
+
+  void VisitorImpl(
+      const platform::CUDAPinnedPlace& cpu /* equals to cpu*/) const {
+    bool lhs = !in_.data<bool>()[0];
+    bool rhs = !out_->mutable_data<bool>(cpu)[0];
+    out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
+  }
+};
+
+void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
+  framework::Tensor tmp;
+  TensorContainsInf(tensor, &tmp);
+  TensorContainsNAN(tensor, out);
+  BothFalseVisitor visitor(tmp, out);
+  auto place = tensor.place();
+  platform::VisitPlace(place, visitor);
+}
+
 void TensorToStream(std::ostream& os, const Tensor& tensor,
                     const platform::DeviceContext& dev_ctx) {
   {  // the 1st field, uint32_t version
diff --git a/paddle/fluid/framework/tensor_util.h b/paddle/fluid/framework/tensor_util.h
index 4457382ade37a12f5f3613fc4113fbf1f6f91124..cab6d9b67e4e64335be0a386bfffb7ebe4373b3e 100644
--- a/paddle/fluid/framework/tensor_util.h
+++ b/paddle/fluid/framework/tensor_util.h
@@ -57,8 +57,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
 template <typename T>
 void TesnorToVector(const Tensor& src, std::vector<T>* dst);
 
+// copy the result bool to cpu
 bool TensorContainsNAN(const framework::Tensor& tensor);
 bool TensorContainsInf(const framework::Tensor& tensor);
+bool TensorIsfinite(const framework::Tensor& tensor);
+
+// store the result bool in gpu tensor, async operation. Faster than above ones.
+void TensorContainsNAN(const framework::Tensor& tensor, framework::Tensor* out);
+void TensorContainsInf(const framework::Tensor& tensor, framework::Tensor* out);
+void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out);
 
 void TensorToStream(std::ostream& os, const Tensor& tensor,
                     const platform::DeviceContext& dev_ctx);
diff --git a/paddle/fluid/framework/tensor_util_test.cc b/paddle/fluid/framework/tensor_util_test.cc
index 6e10885890cd2d4a0d77834944b37e291197b637..a1e5b967a86d10f3439db662af54bb82888027b9 100644
--- a/paddle/fluid/framework/tensor_util_test.cc
+++ b/paddle/fluid/framework/tensor_util_test.cc
@@ -36,7 +36,7 @@ TEST(TensorCopy, Tensor) {
   TensorCopy(src_tensor, *cpu_place, &dst_tensor);
 
   const int* dst_ptr = dst_tensor.data<int>();
-  ASSERT_NE(src_ptr, dst_ptr);
+  EXPECT_NE(src_ptr, dst_ptr);
   for (size_t i = 0; i < 9; ++i) {
     EXPECT_EQ(src_ptr[i], dst_ptr[i]);
   }
@@ -47,7 +47,7 @@ TEST(TensorCopy, Tensor) {
   TensorCopy(slice_tensor, *cpu_place, &dst_tensor);
   const int* slice_ptr = slice_tensor.data<int>();
   dst_ptr = dst_tensor.data<int>();
-  ASSERT_NE(dst_ptr, slice_ptr);
+  EXPECT_NE(dst_ptr, slice_ptr);
   for (size_t i = 0; i < 3; ++i) {
     EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
   }
@@ -77,7 +77,7 @@ TEST(TensorCopy, Tensor) {
     // Sync before Compare Tensors
     gpu_ctx.Wait();
     const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, dst_ptr);
+    EXPECT_NE(src_ptr, dst_ptr);
     for (size_t i = 0; i < 9; ++i) {
       EXPECT_EQ(src_ptr[i], dst_ptr[i]);
     }
@@ -94,7 +94,7 @@ TEST(TensorCopy, Tensor) {
     gpu_ctx.Wait();
     const int* slice_ptr = slice_tensor.data<int>();
     dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(dst_ptr, slice_ptr);
+    EXPECT_NE(dst_ptr, slice_ptr);
     for (size_t i = 0; i < 3; ++i) {
       EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
     }
@@ -117,7 +117,7 @@ TEST(TensorFromVector, Tensor) {
     // Compare Tensors
     const int* cpu_ptr = cpu_tensor.data<int>();
     const int* src_ptr = src_vec.data();
-    ASSERT_NE(src_ptr, cpu_ptr);
+    EXPECT_NE(src_ptr, cpu_ptr);
     for (size_t i = 0; i < 9; ++i) {
       EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
     }
@@ -127,7 +127,7 @@ TEST(TensorFromVector, Tensor) {
     paddle::framework::TensorFromVector<int>(src_vec, &cpu_tensor);
     cpu_ptr = cpu_tensor.data<int>();
     src_ptr = src_vec.data();
-    ASSERT_NE(src_ptr, cpu_ptr);
+    EXPECT_NE(src_ptr, cpu_ptr);
     for (size_t i = 0; i < 5; ++i) {
       EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
     }
@@ -161,8 +161,8 @@ TEST(TensorFromVector, Tensor) {
     const int* src_ptr = src_vec.data();
     const int* cpu_ptr = cpu_tensor.data<int>();
     const int* dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    ASSERT_NE(src_ptr, dst_ptr);
+    EXPECT_NE(src_ptr, cpu_ptr);
+    EXPECT_NE(src_ptr, dst_ptr);
     for (size_t i = 0; i < 9; ++i) {
       EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
       EXPECT_EQ(src_ptr[i], dst_ptr[i]);
@@ -181,8 +181,8 @@ TEST(TensorFromVector, Tensor) {
     src_ptr = src_vec.data();
     cpu_ptr = cpu_tensor.data<int>();
     dst_ptr = dst_tensor.data<int>();
-    ASSERT_NE(src_ptr, cpu_ptr);
-    ASSERT_NE(src_ptr, dst_ptr);
+    EXPECT_NE(src_ptr, cpu_ptr);
+    EXPECT_NE(src_ptr, dst_ptr);
     for (size_t i = 0; i < 5; ++i) {
       EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
       EXPECT_EQ(src_ptr[i], dst_ptr[i]);
@@ -235,9 +235,9 @@ TEST(TensorContainsNAN, CPU) {
     buf[0] = 0.0;
     buf[1] = NAN;
     buf[2] = 0.0;
-    ASSERT_TRUE(paddle::framework::TensorContainsNAN(src));
+    EXPECT_TRUE(paddle::framework::TensorContainsNAN(src));
     buf[1] = 0.0;
-    ASSERT_FALSE(paddle::framework::TensorContainsNAN(src));
+    EXPECT_FALSE(paddle::framework::TensorContainsNAN(src));
   }
 
   {
@@ -248,9 +248,9 @@ TEST(TensorContainsNAN, CPU) {
     buf[0] = 0.0;
     buf[1].x = 0x7fff;
     buf[2] = 0.0;
-    ASSERT_TRUE(paddle::framework::TensorContainsNAN(src));
+    EXPECT_TRUE(paddle::framework::TensorContainsNAN(src));
     buf[1] = 0.0;
-    ASSERT_FALSE(paddle::framework::TensorContainsNAN(src));
+    EXPECT_FALSE(paddle::framework::TensorContainsNAN(src));
   }
 }
 
@@ -261,9 +261,9 @@ TEST(TensorContainsInf, CPU) {
     buf[0] = 1.0;
     buf[1] = INFINITY;
     buf[2] = 0.0;
-    ASSERT_TRUE(paddle::framework::TensorContainsInf(src));
+    EXPECT_TRUE(paddle::framework::TensorContainsInf(src));
     buf[1] = 1.0;
-    ASSERT_FALSE(paddle::framework::TensorContainsInf(src));
+    EXPECT_FALSE(paddle::framework::TensorContainsInf(src));
   }
 
   {
@@ -274,9 +274,55 @@ TEST(TensorContainsInf, CPU) {
     buf[0] = 1.0;
     buf[1].x = 0x7c00;
     buf[2] = 0.0;
-    ASSERT_TRUE(paddle::framework::TensorContainsInf(src));
+    EXPECT_TRUE(paddle::framework::TensorContainsInf(src));
     buf[1] = 1.0;
-    ASSERT_FALSE(paddle::framework::TensorContainsInf(src));
+    EXPECT_FALSE(paddle::framework::TensorContainsInf(src));
+  }
+}
+
+TEST(TensorIsfinite, CPU) {
+  {
+    paddle::framework::Tensor src, out;
+    double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
+    buf[0] = 1.0;
+    buf[1] = INFINITY;
+    buf[2] = 0.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], false);
+    buf[1] = 1.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], true);
+  }
+
+  {
+    paddle::framework::Tensor src, out;
+    double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
+    buf[0] = 1.0;
+    buf[1] = NAN;
+    buf[2] = 0.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], false);
+    buf[1] = 1.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], true);
+  }
+
+  {
+    paddle::framework::Tensor src, out;
+    paddle::platform::float16* buf =
+        src.mutable_data<paddle::platform::float16>(
+            {3}, paddle::platform::CPUPlace());
+    buf[0] = 1.0;
+    buf[1].x = 0x7c00;
+    buf[2] = 0.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], false);
+    buf[1] = 1.0;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], true);
+    buf[1].x = 0x7fff;
+    paddle::framework::TensorIsfinite(src, &out);
+    EXPECT_EQ(out.data<bool>()[0], false);
   }
 }
 
@@ -299,9 +345,9 @@ TEST(Tensor, FromAndToStream) {
     TensorFromStream(iss, &dst_tensor, cpu_ctx);
     int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
     for (int i = 0; i < 5; ++i) {
-      ASSERT_EQ(dst_ptr[i], array[i]);
+      EXPECT_EQ(dst_ptr[i], array[i]);
     }
-    ASSERT_EQ(dst_tensor.dims(), src_tensor.dims());
+    EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
     delete place;
   }
 #ifdef PADDLE_WITH_CUDA
@@ -323,7 +369,7 @@ TEST(Tensor, FromAndToStream) {
 
     int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
     for (int i = 0; i < 6; ++i) {
-      ASSERT_EQ(dst_ptr[i], array[i]);
+      EXPECT_EQ(dst_ptr[i], array[i]);
     }
     delete gpu_place;
   }
diff --git a/paddle/fluid/framework/tensor_util_test.cu b/paddle/fluid/framework/tensor_util_test.cu
index b4cff1e6c2293fa44f0fd0bb398a538c08dd4fb1..a51f74199e714b8606c9766c57bc6b1dc4c73c65 100644
--- a/paddle/fluid/framework/tensor_util_test.cu
+++ b/paddle/fluid/framework/tensor_util_test.cu
@@ -27,9 +27,9 @@ static __global__ void FillNAN(float* buf) {
 }
 
 static __global__ void FillInf(float* buf) {
-  buf[0] = 0.0;
-  buf[1] = INFINITY;
-  buf[2] = 0.5;
+  buf[0] = INFINITY;
+  buf[1] = 0.1;
+  buf[2] = 0.2;
 }
 
 static __global__ void FillNAN(platform::float16* buf) {
@@ -44,6 +44,18 @@ static __global__ void FillInf(platform::float16* buf) {
   buf[2] = 0.5;
 }
 
+static __global__ void FillFinite(float* buf) {
+  buf[0] = 0.0;
+  buf[1] = 0.1;
+  buf[2] = 0.2;
+}
+
+static __global__ void FillFinite(platform::float16* buf) {
+  buf[0] = 0.0;
+  buf[1] = 0.1;
+  buf[2] = 0.2;
+}
+
 TEST(TensorContainsNAN, GPU) {
   paddle::platform::CUDAPlace gpu(0);
   auto& pool = paddle::platform::DeviceContextPool::Instance();
@@ -86,5 +98,163 @@ TEST(TensorContainsInf, GPU) {
   }
 }
 
+TEST(TensorIsfinite, GPU) {
+  paddle::platform::CUDAPlace gpu(0);
+  using paddle::platform::float16;
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  // contains inf
+  {
+    Tensor tensor;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(!TensorIsfinite(tensor));
+  }
+  {
+    Tensor tensor;
+    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(!TensorIsfinite(tensor));
+  }
+
+  // contains nan
+  {
+    Tensor tensor;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(!TensorIsfinite(tensor));
+  }
+  {
+    Tensor tensor;
+    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(!TensorIsfinite(tensor));
+  }
+
+  // all element are finite
+  {
+    Tensor tensor;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(TensorIsfinite(tensor));
+  }
+  {
+    Tensor tensor;
+    float16* buf = tensor.mutable_data<float16>({3}, gpu);
+    FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    EXPECT_TRUE(TensorIsfinite(tensor));
+  }
+}
+
+TEST(TensorContainsInf, GPUWithoutWait) {
+  paddle::platform::CUDAPlace gpu(0);
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  {
+    Tensor tensor, out;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorContainsInf(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    ASSERT_EQ(tmp.data<bool>()[0], true);
+  }
+  {
+    Tensor tensor, out;
+    paddle::platform::float16* buf =
+        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorContainsInf(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    ASSERT_EQ(tmp.data<bool>()[0], true);
+  }
+}
+
+TEST(TensorContainsNAN, GPUWithoutWait) {
+  paddle::platform::CUDAPlace gpu(0);
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  {
+    Tensor tensor, out;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorContainsNAN(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    ASSERT_EQ(tmp.data<bool>()[0], true);
+  }
+  {
+    Tensor tensor, out;
+    paddle::platform::float16* buf =
+        tensor.mutable_data<paddle::platform::float16>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorContainsNAN(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    ASSERT_EQ(tmp.data<bool>()[0], true);
+  }
+}
+
+TEST(TensorIsfinite, GPUWithoutWait) {
+  paddle::platform::CUDAPlace gpu(0);
+  auto& pool = paddle::platform::DeviceContextPool::Instance();
+  auto* cuda_ctx = pool.GetByPlace(gpu);
+  {
+    Tensor tensor, out;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorIsfinite(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    EXPECT_EQ(tmp.data<bool>()[0], false);
+  }
+  {
+    Tensor tensor, out;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorIsfinite(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    EXPECT_EQ(tmp.data<bool>()[0], false);
+  }
+  {
+    Tensor tensor, out;
+    float* buf = tensor.mutable_data<float>({3}, gpu);
+    FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
+    cuda_ctx->Wait();
+    TensorIsfinite(tensor, &out);
+    platform::CPUPlace cpu;
+    Tensor tmp;
+    TensorCopy(out, cpu, *cuda_ctx, &tmp);
+    cuda_ctx->Wait();
+    EXPECT_EQ(tmp.data<bool>()[0], true);
+  }
+}
+
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
index 360f924810a570422db5a00b13939813fa73e2fa..8058d7e881025b1d806efe187d4523adadff367d 100644
--- a/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
+++ b/paddle/fluid/inference/api/demo_ci/simple_on_word2vec.cc
@@ -22,8 +22,8 @@ limitations under the License. */
 #include <algorithm>
 #include <memory>
 #include <thread>  //NOLINT
+
 #include "paddle/fluid/inference/paddle_inference_api.h"
-#include "paddle/fluid/platform/enforce.h"
 
 DEFINE_string(dirname, "", "Directory of the inference model.");
 DEFINE_bool(use_gpu, false, "Whether use gpu.");
@@ -62,17 +62,17 @@ void Main(bool use_gpu) {
     CHECK(predictor->Run(slots, &outputs));
 
     //# 4. Get output.
-    PADDLE_ENFORCE(outputs.size(), 1UL);
+    CHECK_EQ(outputs.size(), 1UL);
     // Check the output buffer size and result of each tid.
-    PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+    CHECK_EQ(outputs.front().data.length(), 33168UL);
     float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
                        0.000932706};
     const size_t num_elements = outputs.front().data.length() / sizeof(float);
     // The outputs' buffers are in CPU memory.
     for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
          i++) {
-      PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
-                     result[i]);
+      CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
+                 0.001);
     }
   }
 }
@@ -108,9 +108,9 @@ void MainThreads(int num_threads, bool use_gpu) {
         CHECK(predictor->Run(inputs, &outputs));
 
         // 4. Get output.
-        PADDLE_ENFORCE(outputs.size(), 1UL);
+        CHECK_EQ(outputs.size(), 1UL);
         // Check the output buffer size and result of each tid.
-        PADDLE_ENFORCE(outputs.front().data.length(), 33168UL);
+        CHECK_EQ(outputs.front().data.length(), 33168UL);
         float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
                            0.000932706};
         const size_t num_elements =
@@ -118,8 +118,8 @@ void MainThreads(int num_threads, bool use_gpu) {
         // The outputs' buffers are in CPU memory.
         for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
              i++) {
-          PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i],
-                         result[i]);
+          CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i],
+                     result[i], 0.001);
         }
       }
     });
diff --git a/paddle/fluid/inference/api/demo_ci/vis_demo.cc b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
index 3800d49b34738d5a272033d75cb415ae9ad1fb8f..fb59cea457027854a099574c867299450690e61c 100644
--- a/paddle/fluid/inference/api/demo_ci/vis_demo.cc
+++ b/paddle/fluid/inference/api/demo_ci/vis_demo.cc
@@ -17,11 +17,12 @@ limitations under the License. */
  */
 
 #include <gflags/gflags.h>
-#include <glog/logging.h>  // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files.
+#include <glog/logging.h>  // use glog instead of CHECK to avoid importing other paddle header files.
 #include <fstream>
 #include <iostream>
+
+// #include "paddle/fluid/platform/enforce.h"
 #include "paddle/fluid/inference/demo_ci/utils.h"
-#include "paddle/fluid/platform/enforce.h"
 
 #ifdef PADDLE_WITH_CUDA
 DECLARE_double(fraction_of_gpu_memory_to_use);
@@ -78,18 +79,17 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
   size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
   VLOG(3) << "predictor output numel " << numel;
   VLOG(3) << "reference output numel " << refer.data.size();
-  PADDLE_ENFORCE_EQ(numel, refer.data.size());
+  CHECK_EQ(numel, refer.data.size());
   switch (output.dtype) {
     case PaddleDType::INT64: {
       for (size_t i = 0; i < numel; ++i) {
-        PADDLE_ENFORCE_EQ(static_cast<int64_t*>(output.data.data())[i],
-                          refer.data[i]);
+        CHECK_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
       }
       break;
     }
     case PaddleDType::FLOAT32:
       for (size_t i = 0; i < numel; ++i) {
-        PADDLE_ENFORCE_LT(
+        CHECK_LT(
             fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
             1e-5);
       }
diff --git a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
index 8add7a59da613564d657a6fce329a089bab7e799..290fb007d8ba94a2d121947fe67c6474586ac0e0 100644
--- a/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_resnet50_tester.cc
@@ -27,9 +27,6 @@ void SetConfig(AnalysisConfig *cfg) {
   cfg->device = 0;
   cfg->enable_ir_optim = true;
   cfg->specify_input_name = true;
-#ifdef PADDLE_WITH_MKLDNN
-  cfg->_use_mkldnn = true;
-#endif
 }
 
 void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
diff --git a/paddle/fluid/operators/activation_op.cc b/paddle/fluid/operators/activation_op.cc
index c091476d6d132db17a656d5c8dee65e3a88d9ac2..bbf52bea1358c32596ab6f14eeaa419735d19fc6 100644
--- a/paddle/fluid/operators/activation_op.cc
+++ b/paddle/fluid/operators/activation_op.cc
@@ -80,7 +80,7 @@ class ActivationOp : public framework::OperatorWithKernel {
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 
@@ -91,12 +91,26 @@ class ActivationOp : public framework::OperatorWithKernel {
   }
 };
 
+class ActivationOpInferVarType : public framework::VarTypeInference {
+ public:
+  void operator()(const framework::OpDesc& op_desc,
+                  framework::BlockDesc* block) const override {
+    auto x_name = op_desc.Input("X")[0];
+    auto out_name = op_desc.Output("Out")[0];
+    auto& x = block->FindRecursiveOrCreateVar(x_name);
+    auto& out = block->FindRecursiveOrCreateVar(out_name);
+    out.SetType(x.GetType());
+    out.SetDataType(x.GetDataType());
+  }
+};
+
 class ActivationOpGrad : public framework::OperatorWithKernel {
  public:
   using framework::OperatorWithKernel::OperatorWithKernel;
 
   void InferShape(framework::InferShapeContext* ctx) const override {
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out"));
+    ctx->ShareDim("Out", framework::GradVarName("X"));
+    ctx->ShareLoD("Out", framework::GradVarName("X"));
   }
 
  protected:
@@ -525,12 +539,14 @@ namespace ops = paddle::operators;
 #define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE)        \
   REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
                     ::paddle::operators::OP_NAME##OpMaker,          \
+                    ::paddle::operators::ActivationOpInferVarType,  \
                     ::paddle::operators::OP_NAME##GradMaker);       \
   REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
 
 #define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE)                    \
   REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp,     \
                     ::paddle::operators::OP_NAME##OpMaker,              \
+                    ::paddle::operators::ActivationOpInferVarType,      \
                     ::paddle::framework::DefaultGradOpDescMaker<true>); \
   REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
 
diff --git a/paddle/fluid/operators/argsort_op.cc b/paddle/fluid/operators/argsort_op.cc
index a2f5a2545701991263c1ef842e9275b1edbfd2ca..d25160f4232b5a621d16b9f469f56bd5aa7c88e3 100644
--- a/paddle/fluid/operators/argsort_op.cc
+++ b/paddle/fluid/operators/argsort_op.cc
@@ -42,8 +42,8 @@ class ArgsortOp : public framework::OperatorWithKernel {
                    "-rank(Input(X)) (%d).",
                    axis, num_dims);
 
-    ctx->SetOutputDim("Out", in_dims);
-    ctx->SetOutputDim("Indices", in_dims);
+    ctx->ShareDim("X", "Out");
+    ctx->ShareDim("X", "Indices");
     ctx->ShareLoD("X", "Out");
     ctx->ShareLoD("X", "Indices");
   }
diff --git a/paddle/fluid/operators/conv_shift_op.cc b/paddle/fluid/operators/conv_shift_op.cc
index f2549e814d6f3b5674fe2eec1139f1c3dc6fa0b4..08506ddd18ed35831702814e70962cb36ec958b1 100644
--- a/paddle/fluid/operators/conv_shift_op.cc
+++ b/paddle/fluid/operators/conv_shift_op.cc
@@ -44,7 +44,7 @@ class ConvShiftOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE_LE(y_dims[1], x_dims[1],
                       "The 2nd dimension of Input(Y) should be less than or "
                       "equal to the 2nd dimension of Input(X).");
-    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
diff --git a/paddle/fluid/operators/cub_reduce.h b/paddle/fluid/operators/cub_reduce.h
index 16fdad775f7befaac04b1ac59a601f04e0ab2bdc..afd3922b8d6537ee16dc5041a838858089adbdb1 100644
--- a/paddle/fluid/operators/cub_reduce.h
+++ b/paddle/fluid/operators/cub_reduce.h
@@ -22,6 +22,7 @@
 
 #include <cub/cub.cuh>  // NOLINT
 #include "paddle/fluid/framework/tensor.h"
+#include "paddle/fluid/framework/tensor_util.h"
 
 namespace paddle {
 namespace operators {
@@ -293,7 +294,12 @@ void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
   }
   auto x_data = x.data<Tx>();
   auto y_data = y->mutable_data<Ty>(x.place());
-  if (reduce_num == 1) return;
+  if (reduce_num == 1) {
+    auto out_dims = y->dims();
+    framework::TensorCopy(x, y->place(), y);
+    y->Resize(out_dims);
+    return;
+  }
 
 #define CUB_BLOCK_DIM_CASE(block_dim)                                    \
   case block_dim: {                                                      \
diff --git a/paddle/fluid/operators/elementwise_op.h b/paddle/fluid/operators/elementwise_op.h
index 94df11bee70dec44f19ee9ffff04ca92d5990ee8..7e5975ead64ab39a9c618a33e300c4fce55a5b22 100644
--- a/paddle/fluid/operators/elementwise_op.h
+++ b/paddle/fluid/operators/elementwise_op.h
@@ -41,7 +41,8 @@ class ElementwiseOp : public framework::OperatorWithKernel {
     auto y_dim = ctx->GetInputDim("Y");
     PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
                       "Rank of first input must >= rank of second input.");
-    ctx->SetOutputDim("Out", x_dim);
+
+    ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 
@@ -70,6 +71,7 @@ class ElementwiseOpInferVarType : public framework::VarTypeInference {
     auto& x = block->FindRecursiveOrCreateVar(x_name);
     auto& out = block->FindRecursiveOrCreateVar(out_name);
     out.SetType(x.GetType());
+    out.SetDataType(x.GetDataType());
   }
 };
 
@@ -157,10 +159,12 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
     auto x_grad_name = framework::GradVarName("X");
     auto y_grad_name = framework::GradVarName("Y");
     if (ctx->HasOutput(x_grad_name)) {
-      ctx->SetOutputDim(x_grad_name, x_dims);
+      ctx->ShareDim("X", /*->*/ x_grad_name);
+      ctx->ShareLoD("X", /*->*/ x_grad_name);
     }
     if (ctx->HasOutput(y_grad_name)) {
-      ctx->SetOutputDim(y_grad_name, y_dims);
+      ctx->ShareDim("Y", /*->*/ y_grad_name);
+      ctx->ShareLoD("Y", /*->*/ y_grad_name);
     }
   }
 
@@ -193,14 +197,15 @@ class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
 
     auto x_grad_name = framework::GradVarName("X");
     if (ctx->HasOutput(x_grad_name)) {
-      auto out_dims = ctx->GetInputDim(framework::GradVarName("Out"));
-      ctx->SetOutputDim(x_grad_name, out_dims);
+      ctx->ShareDim(framework::GradVarName("Out"), /*->*/ x_grad_name);
+      ctx->ShareLoD(framework::GradVarName("Out"), /*->*/ x_grad_name);
     }
     auto y_grad_name = framework::GradVarName("Y");
     if (ctx->HasOutput(y_grad_name)) {
       PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
-      auto y_dims = ctx->GetInputDim("Y");
-      ctx->SetOutputDim(y_grad_name, y_dims);
+
+      ctx->ShareDim("Y", /*->*/ y_grad_name);
+      ctx->ShareLoD("Y", /*->*/ y_grad_name);
     }
   }
 };
diff --git a/paddle/fluid/operators/fake_dequantize_op.cc b/paddle/fluid/operators/fake_dequantize_op.cc
index 2008e7027524ffd1f80a6eede015801b8a0b0254..5d6488c67e0db440c8d4609736523643dd666dcc 100644
--- a/paddle/fluid/operators/fake_dequantize_op.cc
+++ b/paddle/fluid/operators/fake_dequantize_op.cc
@@ -48,7 +48,8 @@ class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
                    "Input(X) of FakeDequantizeMaxAbsOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of FakeDequantizeMaxAbsOp should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+
+    ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
diff --git a/paddle/fluid/operators/isfinite_op.cc b/paddle/fluid/operators/isfinite_op.cc
new file mode 100644
index 0000000000000000000000000000000000000000..248c7793560db99c0af06421bf74808422016061
--- /dev/null
+++ b/paddle/fluid/operators/isfinite_op.cc
@@ -0,0 +1,113 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/operators/isfinite_op.h"
+#include <string>
+#include <vector>
+
+namespace paddle {
+namespace operators {
+
+class OverflowOp : public framework::OperatorWithKernel {
+ public:
+  OverflowOp(const std::string &type, const framework::VariableNameMap &inputs,
+             const framework::VariableNameMap &outputs,
+             const framework::AttributeMap &attrs)
+      : OperatorWithKernel(type, inputs, outputs, attrs) {}
+
+  void InferShape(framework::InferShapeContext *ctx) const override {
+    PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
+    PADDLE_ENFORCE(ctx->HasOutput("Out"),
+                   "Output(Out) of OverflowOp should not be null.");
+
+    ctx->SetOutputDim("Out", {1});
+  }
+
+ protected:
+  framework::OpKernelType GetExpectedKernelType(
+      const framework::ExecutionContext &ctx) const override {
+    int dtype = -1;
+    auto *x_var = ctx.InputVar("X");
+    if (x_var->IsType<framework::LoDTensor>()) {
+      dtype = framework::ToDataType(x_var->Get<framework::LoDTensor>().type());
+    } else if (x_var->IsType<framework::SelectedRows>()) {
+      dtype = framework::ToDataType(
+          x_var->Get<framework::SelectedRows>().value().type());
+    } else {
+      PADDLE_THROW("Cannot find the input data type by all input data");
+    }
+    return framework::OpKernelType(framework::proto::VarType::Type(dtype),
+                                   ctx.GetPlace());
+  }
+};
+
+class OverflowOpMaker : public framework::OpProtoAndCheckerMaker {
+ public:
+  void Make() override {
+    AddInput("X", "(Tensor) The input tensors of overflow operator.");
+    AddOutput("Out",
+              "(Tensor) 1-dim tensor, contains a bool scalar. The output "
+              "tensor of overflow operator.");
+    AddComment(string::Sprintf(R"DOC(
+Overflow operator.
+
+$$Out = any(X)$$
+
+If any X contains Inf or Nan, the Out will generate a indicator.
+Out = Inf if any X contains Inf,
+Out = Nan if any X contains Nan,
+Out = 0 if no Inf/Nan detected.
+If X contains both Inf/Nan, it will return the first indicator it meeted.
+)DOC",
+                               GetName(), GetComments()));
+  }
+
+ protected:
+  virtual std::string GetName() const = 0;
+  virtual std::string GetComments() const = 0;
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+namespace ops = paddle::operators;
+
+#define REGISTER_OP_MAKER(op_type, comment)             \
+  namespace paddle {                                    \
+  namespace operators {                                 \
+  class _##op_type##OverflowOpMaker                     \
+      : public ::paddle::operators::OverflowOpMaker {   \
+   protected:                                           \
+    std::string GetName() const { return #op_type; }    \
+    std::string GetComments() const { return comment; } \
+  };                                                    \
+  }                                                     \
+  }                                                     \
+  REGISTER_OPERATOR(op_type, ops::OverflowOp,           \
+                    ops::_##op_type##OverflowOpMaker,   \
+                    paddle::framework::EmptyGradOpMaker)
+
+#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor)                      \
+  REGISTER_OP_CPU_KERNEL(                                                   \
+      op_type, ops::OverflowKernel<paddle::platform::CPUDeviceContext, int, \
+                                   ops::functor>,                           \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, float,        \
+                          ops::functor>,                                    \
+      ops::OverflowKernel<paddle::platform::CPUDeviceContext, double,       \
+                          ops::functor>);
+
+REGISTER_OP_MAKER(isinf, "isinf(X)");
+REGISTER_OP_MAKER(isnan, "isnan(X)");
+REGISTER_OP_MAKER(isfinite, "isfinite(X)");
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_OVERFLOW_CPU_KERNEL);
diff --git a/paddle/fluid/operators/isfinite_op.cu b/paddle/fluid/operators/isfinite_op.cu
new file mode 100644
index 0000000000000000000000000000000000000000..8d1268b18c6fec03063051f545075209a6fcde27
--- /dev/null
+++ b/paddle/fluid/operators/isfinite_op.cu
@@ -0,0 +1,33 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#define EIGEN_USE_GPU
+#include "paddle/fluid/operators/isfinite_op.h"
+#include "paddle/fluid/platform/float16.h"
+
+namespace ops = paddle::operators;
+namespace plat = paddle::platform;
+
+#define REGISTER_OVERFLOW_CUDA_KERNEL(op_type, functor)                       \
+  REGISTER_OP_CUDA_KERNEL(                                                    \
+      op_type, ops::OverflowKernel<paddle::platform::CUDADeviceContext, int,  \
+                                   ops::functor>,                             \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, float,         \
+                          ops::functor>,                                      \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, double,        \
+                          ops::functor>,                                      \
+      ops::OverflowKernel<paddle::platform::CUDADeviceContext, plat::float16, \
+                          ops::functor>);
+
+FOR_EACH_KERNEL_FUNCTOR(REGISTER_OVERFLOW_CUDA_KERNEL);
diff --git a/paddle/fluid/operators/isfinite_op.h b/paddle/fluid/operators/isfinite_op.h
new file mode 100644
index 0000000000000000000000000000000000000000..83b080856366ac3332c5856a19b721893bb80eb3
--- /dev/null
+++ b/paddle/fluid/operators/isfinite_op.h
@@ -0,0 +1,71 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <vector>
+#include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/framework/op_registry.h"
+#include "paddle/fluid/framework/tensor_util.h"
+#include "paddle/fluid/platform/float16.h"
+#include "paddle/fluid/platform/transform.h"
+
+namespace paddle {
+namespace operators {
+
+struct InfinityFunctor {
+  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
+    framework::TensorContainsInf(tensor, out);
+  }
+};
+
+struct NANFunctor {
+  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
+    framework::TensorContainsNAN(tensor, out);
+  }
+};
+
+struct IsfiniteFunctor {
+  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
+    framework::TensorIsfinite(tensor, out);
+  }
+};
+
+template <typename DeviceContext, typename T, typename Functor>
+class OverflowKernel : public framework::OpKernel<T> {
+ public:
+  virtual void Compute(const framework::ExecutionContext& ctx) const {
+    auto* x = ctx.InputVar("X");
+    auto* out = ctx.Output<framework::Tensor>("Out");
+    out->mutable_data<T>(ctx.GetPlace());
+    Functor functor;
+    if (x->IsType<framework::LoDTensor>()) {
+      auto* in = ctx.Input<framework::Tensor>("X");
+      functor(*in, out);
+    } else if (x->IsType<framework::SelectedRows>()) {
+      auto& in = ctx.Input<framework::SelectedRows>("X")->value();
+      functor(in, out);
+    } else {
+      PADDLE_THROW("Unsupported input type.");
+    }
+  }
+};
+
+}  // namespace operators
+}  // namespace paddle
+
+#define FOR_EACH_KERNEL_FUNCTOR(__macro) \
+  __macro(isinf, InfinityFunctor);       \
+  __macro(isnan, NANFunctor);            \
+  __macro(isfinite, IsfiniteFunctor);
diff --git a/paddle/fluid/operators/lookup_table_op.cc b/paddle/fluid/operators/lookup_table_op.cc
index d77b095c5d783a2a9fab87eb8b458117a6a3d225..b9ac54e446811889b647397ae1fbb11c28f46777 100644
--- a/paddle/fluid/operators/lookup_table_op.cc
+++ b/paddle/fluid/operators/lookup_table_op.cc
@@ -137,6 +137,7 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
               << " is set to LoDTensor";
       block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
     }
+    block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
   }
 };
 
diff --git a/paddle/fluid/operators/prelu_op.cc b/paddle/fluid/operators/prelu_op.cc
index e0c4c81bdd5b5d0af3bafe632a2fa033efd08050..58cfbb76e93a1c15c9b7cf9f9e596066c29b7ebb 100644
--- a/paddle/fluid/operators/prelu_op.cc
+++ b/paddle/fluid/operators/prelu_op.cc
@@ -49,7 +49,7 @@ class PReluOp : public framework::OperatorWithKernel {
     } else {
       PADDLE_THROW("Unkown mode %s", mode);
     }
-    ctx->SetOutputDim("Out", x_dim);
+    ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 
diff --git a/paddle/fluid/operators/rnn_memory_helper_op.cc b/paddle/fluid/operators/rnn_memory_helper_op.cc
index 13df1d4b4bb6c240610f96ccc8f223fc984d63f7..0fb7776fd9dbf437673820c7cf9411644272626c 100644
--- a/paddle/fluid/operators/rnn_memory_helper_op.cc
+++ b/paddle/fluid/operators/rnn_memory_helper_op.cc
@@ -54,7 +54,7 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {
                    "Input(X) of rnn_memory_helper op should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output of rnn_memory_helper op should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+    ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
diff --git a/paddle/fluid/operators/sequence_conv_op.cc b/paddle/fluid/operators/sequence_conv_op.cc
index ec6cb24350ae276724aae339590d40be1e9ea400..95a21a5d3ee6d8037431083edc25d1cddf05dedb 100644
--- a/paddle/fluid/operators/sequence_conv_op.cc
+++ b/paddle/fluid/operators/sequence_conv_op.cc
@@ -90,8 +90,8 @@ class SequenceConvGradOp : public framework::OperatorWithKernel {
                         ctx->GetInputDim("PaddingData"));
     }
     if (ctx->HasOutput(framework::GradVarName("X"))) {
-      ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
-      ctx->ShareLoD("X", framework::GradVarName("X"));
+      ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
+      ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
     }
     if (ctx->HasOutput(framework::GradVarName("Filter"))) {
       ctx->SetOutputDim(framework::GradVarName("Filter"),
diff --git a/paddle/fluid/operators/sequence_pool_op.cc b/paddle/fluid/operators/sequence_pool_op.cc
index 5c6fd13d42e43e3502a1cab85a56e019420c708d..15d3f064eb7b025dc9a85b2aabad24186061cbd4 100644
--- a/paddle/fluid/operators/sequence_pool_op.cc
+++ b/paddle/fluid/operators/sequence_pool_op.cc
@@ -102,8 +102,9 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
     for (int64_t i = 1; i < og_dims.size(); ++i) {
       PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch.");
     }
-    ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
-    ctx->ShareLoD("X", framework::GradVarName("X"));
+
+    ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
+    ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
   }
 
  protected:
diff --git a/paddle/fluid/operators/sequence_reshape_op.cc b/paddle/fluid/operators/sequence_reshape_op.cc
index ef5e6f3210234d59298fcf04c812390643c693d0..31d28d723498892f287246ba228df757d5b9f6c8 100644
--- a/paddle/fluid/operators/sequence_reshape_op.cc
+++ b/paddle/fluid/operators/sequence_reshape_op.cc
@@ -92,7 +92,7 @@ class SequenceReshapeGradOp : public framework::OperatorWithKernel {
     PADDLE_ENFORCE(ctx->HasInput("X"),
                    "Input(X) of SequenceReshapeGradOp should  not be null.");
 
-    ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X"));
+    ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
     ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
   }
 };
diff --git a/paddle/fluid/operators/sequence_softmax_op.cc b/paddle/fluid/operators/sequence_softmax_op.cc
index c44f8206eb5079fef969e3e527552512eebd0f1a..ada3e0c8dbba38729c2b9c8b02335327835f2ef4 100644
--- a/paddle/fluid/operators/sequence_softmax_op.cc
+++ b/paddle/fluid/operators/sequence_softmax_op.cc
@@ -27,7 +27,8 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
                    "Input(X) of SequenceSoftmaxOp should not be null.");
     PADDLE_ENFORCE(ctx->HasOutput("Out"),
                    "Output(Out) of SequenceSoftmaxOp should not be null.");
-    ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
+
+    ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 
diff --git a/paddle/fluid/operators/shrink_rnn_memory_op.cc b/paddle/fluid/operators/shrink_rnn_memory_op.cc
index 29d2fb989754f5621222768a279a1c898ea1c355..e1c74c3a2f89235ba92c396d1a548271bb7d939d 100644
--- a/paddle/fluid/operators/shrink_rnn_memory_op.cc
+++ b/paddle/fluid/operators/shrink_rnn_memory_op.cc
@@ -151,9 +151,9 @@ class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase {
   void operator()(framework::InferShapeContext *context) const override {
     PADDLE_ENFORCE(context->HasInput("X"));
     PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X")));
-    context->SetOutputDim(framework::GradVarName("X"),
-                          context->GetInputDim("X"));
-    context->ShareLoD("X", framework::GradVarName("X"));
+
+    context->ShareDim("X", /*->*/ framework::GradVarName("X"));
+    context->ShareLoD("X", /*->*/ framework::GradVarName("X"));
   }
 };
 
diff --git a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
index c3b0fe32098cb4b41ccc155db58809ef9f1bf46b..193de05422bb78572c0e5eaf4cd46744c3bcb113 100644
--- a/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
+++ b/paddle/fluid/operators/sigmoid_cross_entropy_with_logits_op.cc
@@ -40,7 +40,7 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
                       "The 2nd dimension of Input(X) and Input(Label) should "
                       "be equal.");
 
-    ctx->SetOutputDim("Out", x_dims);
+    ctx->ShareDim("X", /*->*/ "Out");
     ctx->ShareLoD("X", /*->*/ "Out");
   }
 };
diff --git a/paddle/fluid/pybind/pybind.cc b/paddle/fluid/pybind/pybind.cc
index 295af1c5837d70c32b522cc47c8c3e12d5bd61c7..311cd944603e9bdfefef4daa3a9c690df5b30235 100644
--- a/paddle/fluid/pybind/pybind.cc
+++ b/paddle/fluid/pybind/pybind.cc
@@ -620,7 +620,23 @@ All parameter, weight, gradient are variables in Paddle.
 
   // -- python binds for parallel executor.
   py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
-  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy");
+  py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
+    ExecutionStrategy allows the user to more preciously control how to run
+    the program in ParallelExecutor by setting the property.
+
+    The available properties include:
+        use_cuda (bool): Whether to use CUDA or not. Default True.
+        num_threads (int): The number of threads that used to run the
+            operators in ParallelExecutor. If it is not set, it will be
+            set in ParallelExecutor according to the device count.
+            Default 0.
+        allow_op_delay (bool): Whether to delay the communication operators
+            to run. Default False.
+        num_iteration_per_drop_scope (int): how many iterations between
+            the two dropping local scopes. Default 100.
+
+        )DOC");
+
   exec_strategy.def(py::init())
       .def_property(
           "num_threads",
@@ -658,7 +674,25 @@ All parameter, weight, gradient are variables in Paddle.
                                   : ExecutionStrategy::kDefault;
       });
 
-  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy");
+  py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
+    BuildStrategy allows the user to more preciously control how to
+    build the SSA Graph in ParallelExecutor by setting the property.
+
+    The available properties include:
+        reduce_strategy (str): There are two reduce strategies, 'AllReduce'
+            and 'Reduce'. If you want that all parameters will be optimized
+            on all devices, you can choose 'AllReduce'; if you choose
+            'Reduce', all parameters will be evenly allocated to different
+            devices for optimization, and then broadcast the optimized
+            parameter to other devices. Default 'AllReduce'.
+        gradient_scale_strategy (str): There are two ways of defining loss@grad,
+            'CoeffNumDevice' and 'Customized'. By default, ParallelExecutor
+            sets the loss@grad according to the number of devices. If you want
+            to customize loss@grad, you can choose 'Customized'.
+            Default 'CoeffNumDevice'.
+        debug_graphviz_path (str): Whether to write the SSA Graph to file in the
+            form of graphviz. It is useful for debugging. Default "".
+)DOC");
 
   py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
       .value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
diff --git a/paddle/scripts/paddle_build.sh b/paddle/scripts/paddle_build.sh
index b434c9f08e96abe16c1194e69af40392e6e4ca0c..e133323ae420ba68d90215767ab940aed744acd6 100755
--- a/paddle/scripts/paddle_build.sh
+++ b/paddle/scripts/paddle_build.sh
@@ -600,7 +600,7 @@ EOF
     if [[ ${WITH_GPU} == "ON"  ]]; then
         NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} || true"
     else
-        NCCL_DEPS=""
+        NCCL_DEPS="true"
     fi
 
     if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
diff --git a/python/CMakeLists.txt b/python/CMakeLists.txt
index 1c5ded943b3814688af1f177503d3bdc35073c3f..0d29f2ad209296688582924ae16e495930830bd4 100644
--- a/python/CMakeLists.txt
+++ b/python/CMakeLists.txt
@@ -60,7 +60,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
     COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
     COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
-    COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
+    COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib.* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
     DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
 
 set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
diff --git a/python/paddle/dataset/flowers.py b/python/paddle/dataset/flowers.py
index 0d4e7f1ee46ff97912d010cdb268cc4898d99f58..57c5e83c82d216f55a33e568849d87689f86270f 100644
--- a/python/paddle/dataset/flowers.py
+++ b/python/paddle/dataset/flowers.py
@@ -35,16 +35,15 @@ import itertools
 import functools
 from .common import download
 import tarfile
-import six
 import scipy.io as scio
 from paddle.dataset.image import *
 from paddle.reader import *
+from paddle import compat as cpt
 import os
 import numpy as np
 from multiprocessing import cpu_count
 import six
 from six.moves import cPickle as pickle
-from six.moves import zip
 __all__ = ['train', 'test', 'valid']
 
 DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
@@ -126,9 +125,11 @@ def reader_creator(data_file,
                         batch = pickle.load(f)
                     else:
                         batch = pickle.load(f, encoding='bytes')
+                if six.PY3:
+                    batch = cpt.to_text(batch)
                 data = batch['data']
                 labels = batch['label']
-                for sample, label in zip(data, batch['label']):
+                for sample, label in six.moves.zip(data, batch['label']):
                     yield sample, int(label) - 1
             if not cycle:
                 break
diff --git a/python/paddle/fluid/layers/tensor.py b/python/paddle/fluid/layers/tensor.py
index 04e71497aa762e390c4123c0bf3d7f111a772dd4..44b92af7acc012f89b271c74b026d18e1a4075f8 100644
--- a/python/paddle/fluid/layers/tensor.py
+++ b/python/paddle/fluid/layers/tensor.py
@@ -24,21 +24,10 @@ from .layer_function_generator import templatedoc
 import numpy
 
 __all__ = [
-    'create_tensor',
-    'create_parameter',
-    'create_global_var',
-    'cast',
-    'concat',
-    'sums',
-    'assign',
-    'fill_constant_batch_size_like',
-    'fill_constant',
-    'argmin',
-    'argmax',
-    'argsort',
-    'ones',
-    'zeros',
-    'reverse',
+    'create_tensor', 'create_parameter', 'create_global_var', 'cast', 'concat',
+    'sums', 'assign', 'fill_constant_batch_size_like', 'fill_constant',
+    'argmin', 'argmax', 'argsort', 'ones', 'zeros', 'reverse', 'has_inf',
+    'has_nan', 'isfinite'
 ]
 
 
@@ -652,3 +641,52 @@ def load_combine(out, file_path):
         inputs={},
         output={"Out": out},
         args={"file_path": file_path})
+
+
+def has_inf(x):
+    """
+    Test if any of x contains an infinity number
+
+    Args:
+       x(variable): The Tensor/LoDTensor to be checked.
+
+    Returns:
+        Variable: The tensor variable storing the output, only a bool value.
+    """
+    helper = LayerHelper("isinf", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(type="isinf", inputs={"X": x}, outputs={"Out": out})
+    return out
+
+
+def has_nan(x):
+    """
+    Test if any of x contains a NAN
+
+    Args:
+       x(variable): The Tensor/LoDTensor to be checked.
+
+    Returns:
+        Variable: The tensor variable storing the output, only a bool value.
+    """
+    helper = LayerHelper("isnan", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(type="isnan", inputs={"X": x}, outputs={"Out": out})
+    return out
+
+
+def isfinite(x):
+    """
+    Test if any of x contains an infinity/NAN number. If all the elements are finite,
+    returns true, else false.
+
+    Args:
+       x(variable): The Tensor/LoDTensor to be checked.
+
+    Returns:
+        Variable: The tensor variable storing the output, contains a bool value.
+    """
+    helper = LayerHelper("isfinite", **locals())
+    out = helper.create_tmp_variable(dtype=x.dtype)
+    helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out})
+    return out
diff --git a/python/paddle/fluid/lod_tensor.py b/python/paddle/fluid/lod_tensor.py
index a9de09f31f4ed04ba1aa003e85b25fc5a91557e4..b91566fa6fb2449a8becc694b978c30039bf30ed 100644
--- a/python/paddle/fluid/lod_tensor.py
+++ b/python/paddle/fluid/lod_tensor.py
@@ -74,7 +74,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
         assert [
             new_recursive_seq_lens
         ] == recursive_seq_lens, "data and recursive_seq_lens do not match"
-        flattened_data = np.concatenate(data, axis=0).astype("int64")
+        flattened_data = np.concatenate(data, axis=0)
         flattened_data = flattened_data.reshape([len(flattened_data), 1])
         return create_lod_tensor(flattened_data, recursive_seq_lens, place)
     elif isinstance(data, np.ndarray):
diff --git a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
index 775c2253ab3b27708b745b85fc007fcb504d1aed..6a129b6df9bf1830fdf5eb5cb9ae0c5e4f7bb4ec 100644
--- a/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
+++ b/python/paddle/fluid/tests/unittests/test_elementwise_mul_op.py
@@ -16,6 +16,8 @@ from __future__ import print_function
 import unittest
 import numpy as np
 from op_test import OpTest
+import paddle.fluid.core as core
+from paddle.fluid.op import Operator
 
 
 class ElementwiseMulOp(OpTest):
@@ -115,5 +117,56 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
         }
 
 
+class TestElementWiseMulSelectedRows(OpTest):
+    def setUp(self):
+        self.rows = [0, 1, 2, 3, 4, 5, 6]
+        self.feature = 12
+        self.height = 100
+        self.input_shape = (len(self.rows), self.feature)
+
+    def prepare_input(self, scope, place):
+        self.input = {
+            "X": np.random.random(self.input_shape).astype("float32"),
+            "Y": np.random.random(self.input_shape).astype("float32")
+        }
+
+        def init_input(in_name):
+            x_selected_rows = scope.var(in_name).get_selected_rows()
+            x_selected_rows.set_height(self.height)
+            x_selected_rows.set_rows(self.rows)
+            x_array = self.input[in_name]
+            x_tensor = x_selected_rows.get_tensor()
+            x_tensor.set(x_array, place)
+
+        init_input("X")
+        init_input("Y")
+
+    def create_out_selected_row(self, scope):
+        return scope.var('Out').get_selected_rows()
+
+    def check_result(self, out_selected_rows):
+        assert out_selected_rows.height() == self.height
+        assert out_selected_rows.rows() == self.rows
+        out_tensor = np.array(out_selected_rows.get_tensor())
+        assert out_tensor.shape == self.input_shape
+
+    def check_with_place(self, place):
+        scope = core.Scope()
+        self.prepare_input(scope, place)
+
+        out_selected_rows = self.create_out_selected_row(scope)
+        out_selected_rows.set_height(0)
+        out_selected_rows.set_rows([])
+
+        elementwise_mul = Operator("elementwise_mul", X='X', Y='Y', Out='Out')
+        elementwise_mul.run(scope, place)
+        self.check_result(out_selected_rows)
+
+    def test_elewisemul_with_selected_rows_input(self):
+        places = [core.CPUPlace()]
+        for place in places:
+            self.check_with_place(place)
+
+
 if __name__ == '__main__':
     unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_isfinite_op.py b/python/paddle/fluid/tests/unittests/test_isfinite_op.py
new file mode 100644
index 0000000000000000000000000000000000000000..d96ae15c7288c9a8d585d8d70d2aa8922b8f22b3
--- /dev/null
+++ b/python/paddle/fluid/tests/unittests/test_isfinite_op.py
@@ -0,0 +1,97 @@
+# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+#     http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+
+import unittest
+import numpy as np
+from op_test import OpTest
+
+
+class TestInf(OpTest):
+    def setUp(self):
+        self.op_type = "isinf"
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        x[0] = np.inf
+        x[-1] = np.inf
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.array(True).astype(self.dtype)}
+
+    def init_dtype(self):
+        pass
+
+    def test_output(self):
+        self.check_output()
+
+
+class TestFP16Inf(TestInf):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestNAN(OpTest):
+    def setUp(self):
+        self.op_type = "isnan"
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        x[0] = np.nan
+        x[-1] = np.nan
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.array(True).astype(self.dtype)}
+
+    def init_dtype(self):
+        pass
+
+    def test_output(self):
+        self.check_output()
+
+
+class TestFP16NAN(TestNAN):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+class TestIsfinite(OpTest):
+    def setUp(self):
+        self.op_type = "isfinite"
+        self.dtype = np.float32
+        self.init_dtype()
+
+        x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
+        x[0] = np.inf
+        x[-1] = np.nan
+        out = np.isinf(x) | np.isnan(x)
+
+        self.inputs = {'X': x}
+        self.outputs = {'Out': np.array(False).astype(self.dtype)}
+
+    def init_dtype(self):
+        pass
+
+    def test_output(self):
+        self.check_output()
+
+
+class TestFP16Isfinite(TestIsfinite):
+    def init_dtype(self):
+        self.dtype = np.float16
+
+
+if __name__ == '__main__':
+    unittest.main()
diff --git a/python/paddle/fluid/tests/unittests/test_reduce_op.py b/python/paddle/fluid/tests/unittests/test_reduce_op.py
index 328f0f0011381b77cccb8b2d9b266aa53b259473..8fc8125a773543eea768783155ad152c475535b5 100644
--- a/python/paddle/fluid/tests/unittests/test_reduce_op.py
+++ b/python/paddle/fluid/tests/unittests/test_reduce_op.py
@@ -243,5 +243,87 @@ class TestKeepDimReduceSumMultiAxises(OpTest):
         self.check_grad(['X'], 'Out')
 
 
+class TestReduceSumWithDimOne(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((10, 1, 1)).astype("float64")}
+        self.attrs = {'dim': [1, 2], 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                        keepdims=True)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestReduceSumWithNumelOne(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((1, 1)).astype("float64")}
+        self.attrs = {'dim': [1], 'keep_dim': False}
+        self.outputs = {
+            'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
+                                        keepdims=False)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestReduceMeanWithDimOne(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_mean"
+        self.inputs = {'X': np.random.random((10, 1, 1)).astype("float64")}
+        self.attrs = {'dim': [1], 'keep_dim': False}
+        self.outputs = {
+            'Out': self.inputs['X'].mean(
+                axis=tuple(self.attrs['dim']), keepdims=False)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestReduceMeanWithNumelOne(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_mean"
+        self.inputs = {'X': np.random.random((1, 1)).astype("float64")}
+        self.attrs = {'dim': [1], 'keep_dim': True}
+        self.outputs = {
+            'Out': self.inputs['X'].mean(
+                axis=tuple(self.attrs['dim']), keepdims=True)
+        }
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
+class TestReduceAll(OpTest):
+    def setUp(self):
+        self.op_type = "reduce_sum"
+        self.inputs = {'X': np.random.random((1, 1, 1)).astype("float64")}
+        self.attrs = {'reduce_all': True, 'keep_dim': False}
+        self.outputs = {'Out': self.inputs['X'].sum()}
+
+    def test_check_output(self):
+        self.check_output()
+
+    def test_check_grad(self):
+        self.check_grad(['X'], 'Out')
+
+
 if __name__ == '__main__':
     unittest.main()