Commit 98ffde41 authored by shippingwang

Merge branch 'develop' of https://github.com/PaddlePaddle/Paddle into develop

...@@ -25,5 +25,6 @@ third_party/ ...@@ -25,5 +25,6 @@ third_party/
bazel-* bazel-*
third_party/ third_party/
build_*
# clion workspace. # clion workspace.
cmake-build-* cmake-build-*
...@@ -72,7 +72,7 @@ option(WITH_INFERENCE "Compile fluid inference library" ON) ...@@ -72,7 +72,7 @@ option(WITH_INFERENCE "Compile fluid inference library" ON)
option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF) option(WITH_INFERENCE_API_TEST "Test fluid inference high-level api interface" OFF)
option(WITH_SYSTEM_BLAS "Use system blas library" OFF) option(WITH_SYSTEM_BLAS "Use system blas library" OFF)
option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION}) option(PY_VERSION "Compile PaddlePaddle with python3 support" ${PY_VERSION})
option(WITH_FAST_MATH "Make use of fast math library" OFF) option(WITH_FAST_MATH "Make use of fast math library, might affect the precision to some extent" ON)
# PY_VERSION # PY_VERSION
if(NOT PY_VERSION) if(NOT PY_VERSION)
......
...@@ -198,6 +198,9 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None ...@@ -198,6 +198,9 @@ paddle.fluid.layers.argsort ArgSpec(args=['input', 'axis', 'name'], varargs=None
paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.ones ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,)) paddle.fluid.layers.zeros ArgSpec(args=['shape', 'dtype', 'force_cpu'], varargs=None, keywords=None, defaults=(False,))
paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.reverse ArgSpec(args=['x', 'axis'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.has_inf ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.has_nan ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.isfinite ArgSpec(args=['x'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None)) paddle.fluid.layers.While.__init__ ArgSpec(args=['self', 'cond', 'is_test', 'name'], varargs=None, keywords=None, defaults=(False, None))
paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None) paddle.fluid.layers.While.block ArgSpec(args=['self'], varargs=None, keywords=None, defaults=None)
paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,)) paddle.fluid.layers.Switch.__init__ ArgSpec(args=['self', 'name'], varargs=None, keywords=None, defaults=(None,))
......
...@@ -17,7 +17,6 @@ limitations under the License. */ ...@@ -17,7 +17,6 @@ limitations under the License. */
#include <typeindex> #include <typeindex>
#include "paddle/fluid/framework/framework.pb.h" #include "paddle/fluid/framework/framework.pb.h"
#include "paddle/fluid/platform/enforce.h" #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/float16.h" #include "paddle/fluid/platform/float16.h"
namespace paddle { namespace paddle {
......
...@@ -50,6 +50,27 @@ class CompileTimeInferShapeContext : public InferShapeContext { ...@@ -50,6 +50,27 @@ class CompileTimeInferShapeContext : public InferShapeContext {
const std::vector<std::string> &Outputs( const std::vector<std::string> &Outputs(
const std::string &name) const override; const std::string &name) const override;
void ShareDim(const std::string &in, const std::string &out, size_t i = 0,
size_t j = 0) override {
PADDLE_ENFORCE_LT(i, Inputs(in).size());
PADDLE_ENFORCE_LT(j, Outputs(out).size());
const std::string &input_n = Inputs(in)[i];
const std::string &output_n = Outputs(out)[j];
PADDLE_ENFORCE(input_n != framework::kEmptyVarName, "The %s[%d] is @EMPTY@",
in, i);
PADDLE_ENFORCE(output_n != framework::kEmptyVarName,
"The %s[%d] is @EMPTY@", out, j);
auto *in_var = block_.FindVarRecursive(input_n);
auto *out_var = block_.FindVarRecursive(output_n);
PADDLE_ENFORCE(in_var->GetType() == out_var->GetType(),
"The type of %s and %s is not the same.", input_n, output_n);
SetDim(output_n, GetDim(input_n));
}
void ShareLoD(const std::string &in, const std::string &out, size_t i = 0, void ShareLoD(const std::string &in, const std::string &out, size_t i = 0,
size_t j = 0) const override { size_t j = 0) const override {
PADDLE_ENFORCE_LT(i, Inputs(in).size()); PADDLE_ENFORCE_LT(i, Inputs(in).size());
......
...@@ -542,6 +542,36 @@ class RuntimeInferShapeContext : public InferShapeContext { ...@@ -542,6 +542,36 @@ class RuntimeInferShapeContext : public InferShapeContext {
return op_.Outputs(name); return op_.Outputs(name);
} }
void ShareDim(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) override {
PADDLE_ENFORCE_LT(i, Inputs(in).size());
PADDLE_ENFORCE_LT(j, Outputs(out).size());
const std::string& input_n = Inputs(in)[i];
const std::string& output_n = Outputs(out)[j];
Variable* in_var = scope_.FindVar(input_n);
Variable* out_var = scope_.FindVar(output_n);
PADDLE_ENFORCE(in_var->Type() == out_var->Type(),
"The type of %s and %s is not the same.", output_n,
input_n);
if (in_var->IsType<framework::SelectedRows>()) {
auto& in_sele_rows = in_var->Get<framework::SelectedRows>();
auto out_sele_rows = out_var->GetMutable<framework::SelectedRows>();
out_sele_rows->mutable_value()->Resize(in_sele_rows.value().dims());
out_sele_rows->set_rows(in_sele_rows.rows());
out_sele_rows->set_height(in_sele_rows.height());
} else if (in_var->IsType<framework::LoDTensor>()) {
auto& in_lod_tensor = in_var->Get<framework::LoDTensor>();
auto* out_lod_tensor = out_var->GetMutable<framework::LoDTensor>();
out_lod_tensor->Resize(in_lod_tensor.dims());
} else {
PADDLE_THROW(
"Currently, the input type of ShareDim only can be LoDTensor "
"or SelectedRows.");
}
}
void ShareLoD(const std::string& in, const std::string& out, size_t i = 0, void ShareLoD(const std::string& in, const std::string& out, size_t i = 0,
size_t j = 0) const override { size_t j = 0) const override {
const std::vector<std::string>& inputs = Inputs(in); const std::vector<std::string>& inputs = Inputs(in);
......
...@@ -56,6 +56,9 @@ class InferShapeContext { ...@@ -56,6 +56,9 @@ class InferShapeContext {
virtual const std::vector<std::string> &Outputs( virtual const std::vector<std::string> &Outputs(
const std::string &name) const = 0; const std::string &name) const = 0;
virtual void ShareDim(const std::string &in, const std::string &out,
size_t i = 0, size_t j = 0) = 0;
virtual void ShareLoD(const std::string &in, const std::string &out, virtual void ShareLoD(const std::string &in, const std::string &out,
size_t i = 0, size_t j = 0) const = 0; size_t i = 0, size_t j = 0) const = 0;
......
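For orientation, a minimal sketch (hypothetical MyOp, not part of this commit) of how an operator's InferShape can use the new ShareDim interface alongside ShareLoD; the operator changes later in this diff follow the same pattern:

class MyOp : public framework::OperatorWithKernel {
 public:
  using framework::OperatorWithKernel::OperatorWithKernel;

  void InferShape(framework::InferShapeContext* ctx) const override {
    PADDLE_ENFORCE(ctx->HasInput("X"), "Input(X) should not be null.");
    PADDLE_ENFORCE(ctx->HasOutput("Out"), "Output(Out) should not be null.");
    // Propagate the shape (and, at runtime, the SelectedRows metadata) and
    // the LoD from X to Out instead of calling SetOutputDim manually.
    ctx->ShareDim("X", /*->*/ "Out");
    ctx->ShareLoD("X", /*->*/ "Out");
  }
};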
...@@ -165,10 +165,12 @@ inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor, ...@@ -165,10 +165,12 @@ inline void AnyImpl(Predicate predicate, const framework::Tensor& tensor,
} }
template <typename Predicate> template <typename Predicate>
struct AnyVisitor : public boost::static_visitor<bool> { class AnyVisitor : public boost::static_visitor<bool> {
private:
const framework::Tensor& tensor_; const framework::Tensor& tensor_;
Predicate predicate_; Predicate predicate_;
public:
AnyVisitor(const framework::Tensor& tensor, Predicate predicate) AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {} : tensor_(tensor), predicate_(std::move(predicate)) {}
...@@ -206,6 +208,27 @@ struct AnyVisitor : public boost::static_visitor<bool> { ...@@ -206,6 +208,27 @@ struct AnyVisitor : public boost::static_visitor<bool> {
} }
}; };
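// AnyOutVisitor mirrors AnyVisitor, but instead of returning the reduced bool
// to the host it writes the result into a 1-element `out` tensor on the same
// place, which keeps the check asynchronous on the device.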
template <typename Predicate>
class AnyOutVisitor : public boost::static_visitor<> {
private:
const framework::Tensor& tensor_;
mutable framework::Tensor* out_;
Predicate predicate_;
public:
AnyOutVisitor(const framework::Tensor& tensor, Predicate predicate,
framework::Tensor* out)
: tensor_(tensor), out_(out), predicate_(std::move(predicate)) {}
template <typename Place>
void operator()(const Place& place) const {
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
out_->Resize({1});
out_->mutable_data<bool>(place);
AnyImpl(predicate_, tensor_, *ctx, out_);
}
};
template <typename Predicate> template <typename Predicate>
inline bool Any(const framework::Tensor& tensor, Predicate predicate) { inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
AnyVisitor<Predicate> visitor(tensor, predicate); AnyVisitor<Predicate> visitor(tensor, predicate);
...@@ -213,6 +236,14 @@ inline bool Any(const framework::Tensor& tensor, Predicate predicate) { ...@@ -213,6 +236,14 @@ inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
return platform::VisitPlace(place, visitor); return platform::VisitPlace(place, visitor);
} }
template <typename Predicate>
inline void Any(const framework::Tensor& tensor, Predicate predicate,
framework::Tensor* out) {
AnyOutVisitor<Predicate> visitor(tensor, predicate, out);
auto place = tensor.place();
platform::VisitPlace(place, visitor);
}
struct ContainsNANPredicate { struct ContainsNANPredicate {
template <typename T> template <typename T>
auto operator()(const T& eigen_vec) const auto operator()(const T& eigen_vec) const
...@@ -227,6 +258,12 @@ bool TensorContainsNAN(const framework::Tensor& tensor) { ...@@ -227,6 +258,12 @@ bool TensorContainsNAN(const framework::Tensor& tensor) {
return Any(tensor, predicate); return Any(tensor, predicate);
} }
void TensorContainsNAN(const framework::Tensor& tensor,
framework::Tensor* out) {
ContainsNANPredicate predicate;
Any(tensor, predicate, out);
}
struct ContainsInfPredicate { struct ContainsInfPredicate {
template <typename T> template <typename T>
auto operator()(const T& eigen_vec) const auto operator()(const T& eigen_vec) const
...@@ -241,6 +278,71 @@ bool TensorContainsInf(const framework::Tensor& tensor) { ...@@ -241,6 +278,71 @@ bool TensorContainsInf(const framework::Tensor& tensor) {
return Any(tensor, predicate); return Any(tensor, predicate);
} }
void TensorContainsInf(const framework::Tensor& tensor,
framework::Tensor* out) {
ContainsInfPredicate predicate;
Any(tensor, predicate, out);
}
// NOTE(dzhwinter):
// Isfinite needs an AllVisitor to loop through all the elements.
// We use two CUDA calls instead of one AllVisitor. The AllVisitor
// should be implemented if performance becomes a problem.
bool TensorIsfinite(const framework::Tensor& tensor) {
ContainsInfPredicate pred_inf;
ContainsNANPredicate pred_nan;
return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
}
#ifdef PADDLE_WITH_CUDA
template <typename T>
static inline void __global__ BothFalse(const T* cmp, T* out) {
out[0] = (!cmp[0]) && (!out[0]);
}
#endif
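// BothFalseVisitor combines the two partial results: `in_` holds the
// contains-Inf flag and `out_` already holds the contains-NAN flag; after the
// visit, out_[0] == !in_[0] && !out_[0], i.e. true only when the tensor is
// finite.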
struct BothFalseVisitor : public boost::static_visitor<> {
const framework::Tensor& in_;
mutable framework::Tensor* out_;
BothFalseVisitor(const framework::Tensor& in, framework::Tensor* out)
: in_(in), out_(out) {}
template <typename Place>
void operator()(const Place& place) const {
VisitorImpl(place);
}
void VisitorImpl(const platform::CUDAPlace& gpu) const {
#ifdef PADDLE_WITH_CUDA
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
BothFalse<bool><<<1, 1, 0, ctx->stream()>>>(in_.data<bool>(),
out_->mutable_data<bool>(gpu));
#endif
}
void VisitorImpl(const platform::CPUPlace& cpu) const {
bool lhs = !in_.data<bool>()[0];
bool rhs = !out_->mutable_data<bool>(cpu)[0];
out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
}
void VisitorImpl(
const platform::CUDAPinnedPlace& cpu /* equals to cpu*/) const {
bool lhs = !in_.data<bool>()[0];
bool rhs = !out_->mutable_data<bool>(cpu)[0];
out_->mutable_data<bool>(cpu)[0] = lhs && rhs;
}
};
void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
framework::Tensor tmp;
TensorContainsInf(tensor, &tmp);
TensorContainsNAN(tensor, out);
BothFalseVisitor visitor(tmp, out);
auto place = tensor.place();
platform::VisitPlace(place, visitor);
}
void TensorToStream(std::ostream& os, const Tensor& tensor, void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx) { const platform::DeviceContext& dev_ctx) {
{ // the 1st field, uint32_t version { // the 1st field, uint32_t version
......
...@@ -57,8 +57,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx, ...@@ -57,8 +57,15 @@ void TensorToVector(const Tensor& src, const platform::DeviceContext& ctx,
template <typename T> template <typename T>
void TesnorToVector(const Tensor& src, std::vector<T>* dst); void TesnorToVector(const Tensor& src, std::vector<T>* dst);
// copy the result bool to cpu
bool TensorContainsNAN(const framework::Tensor& tensor); bool TensorContainsNAN(const framework::Tensor& tensor);
bool TensorContainsInf(const framework::Tensor& tensor); bool TensorContainsInf(const framework::Tensor& tensor);
bool TensorIsfinite(const framework::Tensor& tensor);
// store the result bool in a gpu tensor; async operation, faster than the synchronous versions above.
void TensorContainsNAN(const framework::Tensor& tensor, framework::Tensor* out);
void TensorContainsInf(const framework::Tensor& tensor, framework::Tensor* out);
void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out);
void TensorToStream(std::ostream& os, const Tensor& tensor, void TensorToStream(std::ostream& os, const Tensor& tensor,
const platform::DeviceContext& dev_ctx); const platform::DeviceContext& dev_ctx);
......
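As a quick illustration of the two flavours declared above, a minimal sketch assuming a GPU tensor t and a CUDADeviceContext* cuda_ctx (hypothetical names; the GPUWithoutWait tests later in this diff use the same pattern):

// Synchronous: blocks and returns the answer on the host.
bool finite = paddle::framework::TensorIsfinite(t);

// Asynchronous: the bool result stays in a GPU tensor; copy it back and
// Wait() before reading it on the host.
paddle::framework::Tensor flag, flag_cpu;
paddle::framework::TensorIsfinite(t, &flag);
paddle::framework::TensorCopy(flag, paddle::platform::CPUPlace(), *cuda_ctx, &flag_cpu);
cuda_ctx->Wait();
bool finite_async = flag_cpu.data<bool>()[0];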
...@@ -36,7 +36,7 @@ TEST(TensorCopy, Tensor) { ...@@ -36,7 +36,7 @@ TEST(TensorCopy, Tensor) {
TensorCopy(src_tensor, *cpu_place, &dst_tensor); TensorCopy(src_tensor, *cpu_place, &dst_tensor);
const int* dst_ptr = dst_tensor.data<int>(); const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr); EXPECT_NE(src_ptr, dst_ptr);
for (size_t i = 0; i < 9; ++i) { for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]); EXPECT_EQ(src_ptr[i], dst_ptr[i]);
} }
...@@ -47,7 +47,7 @@ TEST(TensorCopy, Tensor) { ...@@ -47,7 +47,7 @@ TEST(TensorCopy, Tensor) {
TensorCopy(slice_tensor, *cpu_place, &dst_tensor); TensorCopy(slice_tensor, *cpu_place, &dst_tensor);
const int* slice_ptr = slice_tensor.data<int>(); const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>(); dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr); EXPECT_NE(dst_ptr, slice_ptr);
for (size_t i = 0; i < 3; ++i) { for (size_t i = 0; i < 3; ++i) {
EXPECT_EQ(dst_ptr[i], slice_ptr[i]); EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
} }
...@@ -77,7 +77,7 @@ TEST(TensorCopy, Tensor) { ...@@ -77,7 +77,7 @@ TEST(TensorCopy, Tensor) {
// Sync before Compare Tensors // Sync before Compare Tensors
gpu_ctx.Wait(); gpu_ctx.Wait();
const int* dst_ptr = dst_tensor.data<int>(); const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, dst_ptr); EXPECT_NE(src_ptr, dst_ptr);
for (size_t i = 0; i < 9; ++i) { for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], dst_ptr[i]); EXPECT_EQ(src_ptr[i], dst_ptr[i]);
} }
...@@ -94,7 +94,7 @@ TEST(TensorCopy, Tensor) { ...@@ -94,7 +94,7 @@ TEST(TensorCopy, Tensor) {
gpu_ctx.Wait(); gpu_ctx.Wait();
const int* slice_ptr = slice_tensor.data<int>(); const int* slice_ptr = slice_tensor.data<int>();
dst_ptr = dst_tensor.data<int>(); dst_ptr = dst_tensor.data<int>();
ASSERT_NE(dst_ptr, slice_ptr); EXPECT_NE(dst_ptr, slice_ptr);
for (size_t i = 0; i < 3; ++i) { for (size_t i = 0; i < 3; ++i) {
EXPECT_EQ(dst_ptr[i], slice_ptr[i]); EXPECT_EQ(dst_ptr[i], slice_ptr[i]);
} }
...@@ -117,7 +117,7 @@ TEST(TensorFromVector, Tensor) { ...@@ -117,7 +117,7 @@ TEST(TensorFromVector, Tensor) {
// Compare Tensors // Compare Tensors
const int* cpu_ptr = cpu_tensor.data<int>(); const int* cpu_ptr = cpu_tensor.data<int>();
const int* src_ptr = src_vec.data(); const int* src_ptr = src_vec.data();
ASSERT_NE(src_ptr, cpu_ptr); EXPECT_NE(src_ptr, cpu_ptr);
for (size_t i = 0; i < 9; ++i) { for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], cpu_ptr[i]); EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
} }
...@@ -127,7 +127,7 @@ TEST(TensorFromVector, Tensor) { ...@@ -127,7 +127,7 @@ TEST(TensorFromVector, Tensor) {
paddle::framework::TensorFromVector<int>(src_vec, &cpu_tensor); paddle::framework::TensorFromVector<int>(src_vec, &cpu_tensor);
cpu_ptr = cpu_tensor.data<int>(); cpu_ptr = cpu_tensor.data<int>();
src_ptr = src_vec.data(); src_ptr = src_vec.data();
ASSERT_NE(src_ptr, cpu_ptr); EXPECT_NE(src_ptr, cpu_ptr);
for (size_t i = 0; i < 5; ++i) { for (size_t i = 0; i < 5; ++i) {
EXPECT_EQ(src_ptr[i], cpu_ptr[i]); EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
} }
...@@ -161,8 +161,8 @@ TEST(TensorFromVector, Tensor) { ...@@ -161,8 +161,8 @@ TEST(TensorFromVector, Tensor) {
const int* src_ptr = src_vec.data(); const int* src_ptr = src_vec.data();
const int* cpu_ptr = cpu_tensor.data<int>(); const int* cpu_ptr = cpu_tensor.data<int>();
const int* dst_ptr = dst_tensor.data<int>(); const int* dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, cpu_ptr); EXPECT_NE(src_ptr, cpu_ptr);
ASSERT_NE(src_ptr, dst_ptr); EXPECT_NE(src_ptr, dst_ptr);
for (size_t i = 0; i < 9; ++i) { for (size_t i = 0; i < 9; ++i) {
EXPECT_EQ(src_ptr[i], cpu_ptr[i]); EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
EXPECT_EQ(src_ptr[i], dst_ptr[i]); EXPECT_EQ(src_ptr[i], dst_ptr[i]);
...@@ -181,8 +181,8 @@ TEST(TensorFromVector, Tensor) { ...@@ -181,8 +181,8 @@ TEST(TensorFromVector, Tensor) {
src_ptr = src_vec.data(); src_ptr = src_vec.data();
cpu_ptr = cpu_tensor.data<int>(); cpu_ptr = cpu_tensor.data<int>();
dst_ptr = dst_tensor.data<int>(); dst_ptr = dst_tensor.data<int>();
ASSERT_NE(src_ptr, cpu_ptr); EXPECT_NE(src_ptr, cpu_ptr);
ASSERT_NE(src_ptr, dst_ptr); EXPECT_NE(src_ptr, dst_ptr);
for (size_t i = 0; i < 5; ++i) { for (size_t i = 0; i < 5; ++i) {
EXPECT_EQ(src_ptr[i], cpu_ptr[i]); EXPECT_EQ(src_ptr[i], cpu_ptr[i]);
EXPECT_EQ(src_ptr[i], dst_ptr[i]); EXPECT_EQ(src_ptr[i], dst_ptr[i]);
...@@ -235,9 +235,9 @@ TEST(TensorContainsNAN, CPU) { ...@@ -235,9 +235,9 @@ TEST(TensorContainsNAN, CPU) {
buf[0] = 0.0; buf[0] = 0.0;
buf[1] = NAN; buf[1] = NAN;
buf[2] = 0.0; buf[2] = 0.0;
ASSERT_TRUE(paddle::framework::TensorContainsNAN(src)); EXPECT_TRUE(paddle::framework::TensorContainsNAN(src));
buf[1] = 0.0; buf[1] = 0.0;
ASSERT_FALSE(paddle::framework::TensorContainsNAN(src)); EXPECT_FALSE(paddle::framework::TensorContainsNAN(src));
} }
{ {
...@@ -248,9 +248,9 @@ TEST(TensorContainsNAN, CPU) { ...@@ -248,9 +248,9 @@ TEST(TensorContainsNAN, CPU) {
buf[0] = 0.0; buf[0] = 0.0;
buf[1].x = 0x7fff; buf[1].x = 0x7fff;
buf[2] = 0.0; buf[2] = 0.0;
ASSERT_TRUE(paddle::framework::TensorContainsNAN(src)); EXPECT_TRUE(paddle::framework::TensorContainsNAN(src));
buf[1] = 0.0; buf[1] = 0.0;
ASSERT_FALSE(paddle::framework::TensorContainsNAN(src)); EXPECT_FALSE(paddle::framework::TensorContainsNAN(src));
} }
} }
...@@ -261,9 +261,9 @@ TEST(TensorContainsInf, CPU) { ...@@ -261,9 +261,9 @@ TEST(TensorContainsInf, CPU) {
buf[0] = 1.0; buf[0] = 1.0;
buf[1] = INFINITY; buf[1] = INFINITY;
buf[2] = 0.0; buf[2] = 0.0;
ASSERT_TRUE(paddle::framework::TensorContainsInf(src)); EXPECT_TRUE(paddle::framework::TensorContainsInf(src));
buf[1] = 1.0; buf[1] = 1.0;
ASSERT_FALSE(paddle::framework::TensorContainsInf(src)); EXPECT_FALSE(paddle::framework::TensorContainsInf(src));
} }
{ {
...@@ -274,9 +274,55 @@ TEST(TensorContainsInf, CPU) { ...@@ -274,9 +274,55 @@ TEST(TensorContainsInf, CPU) {
buf[0] = 1.0; buf[0] = 1.0;
buf[1].x = 0x7c00; buf[1].x = 0x7c00;
buf[2] = 0.0; buf[2] = 0.0;
ASSERT_TRUE(paddle::framework::TensorContainsInf(src)); EXPECT_TRUE(paddle::framework::TensorContainsInf(src));
buf[1] = 1.0; buf[1] = 1.0;
ASSERT_FALSE(paddle::framework::TensorContainsInf(src)); EXPECT_FALSE(paddle::framework::TensorContainsInf(src));
}
}
TEST(TensorIsfinite, CPU) {
{
paddle::framework::Tensor src, out;
double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
buf[0] = 1.0;
buf[1] = INFINITY;
buf[2] = 0.0;
paddle::framework::TensorIsfinite(src, &out);
EXPECT_EQ(out.data<bool>()[0], false);
buf[1] = 1.0;
paddle::framework::TensorIsfinite(src, &out);
EXPECT_EQ(out.data<bool>()[0], true);
}
{
paddle::framework::Tensor src, out;
double* buf = src.mutable_data<double>({3}, paddle::platform::CPUPlace());
buf[0] = 1.0;
buf[1] = NAN;
buf[2] = 0.0;
paddle::framework::TensorIsfinite(src, &out);
EXPECT_EQ(out.data<bool>()[0], false);
buf[1] = 1.0;
paddle::framework::TensorIsfinite(src, &out);
EXPECT_EQ(out.data<bool>()[0], true);
}
{
paddle::framework::Tensor src, out;
paddle::platform::float16* buf =
src.mutable_data<paddle::platform::float16>(
{3}, paddle::platform::CPUPlace());
buf[0] = 1.0;
buf[1].x = 0x7c00;
buf[2] = 0.0;
paddle::framework::TensorIsfinite(src, &out);
EXPECT_EQ(out.data<bool>()[0], false);
buf[1] = 1.0;
paddle::framework::TensorIsfinite(src, &out);
EXPECT_EQ(out.data<bool>()[0], true);
buf[1].x = 0x7fff;
paddle::framework::TensorIsfinite(src, &out);
EXPECT_EQ(out.data<bool>()[0], false);
} }
} }
...@@ -299,9 +345,9 @@ TEST(Tensor, FromAndToStream) { ...@@ -299,9 +345,9 @@ TEST(Tensor, FromAndToStream) {
TensorFromStream(iss, &dst_tensor, cpu_ctx); TensorFromStream(iss, &dst_tensor, cpu_ctx);
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace()); int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 5; ++i) { for (int i = 0; i < 5; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]); EXPECT_EQ(dst_ptr[i], array[i]);
} }
ASSERT_EQ(dst_tensor.dims(), src_tensor.dims()); EXPECT_EQ(dst_tensor.dims(), src_tensor.dims());
delete place; delete place;
} }
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
...@@ -323,7 +369,7 @@ TEST(Tensor, FromAndToStream) { ...@@ -323,7 +369,7 @@ TEST(Tensor, FromAndToStream) {
int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace()); int* dst_ptr = dst_tensor.mutable_data<int>(platform::CPUPlace());
for (int i = 0; i < 6; ++i) { for (int i = 0; i < 6; ++i) {
ASSERT_EQ(dst_ptr[i], array[i]); EXPECT_EQ(dst_ptr[i], array[i]);
} }
delete gpu_place; delete gpu_place;
} }
......
...@@ -27,9 +27,9 @@ static __global__ void FillNAN(float* buf) { ...@@ -27,9 +27,9 @@ static __global__ void FillNAN(float* buf) {
} }
static __global__ void FillInf(float* buf) { static __global__ void FillInf(float* buf) {
buf[0] = 0.0; buf[0] = INFINITY;
buf[1] = INFINITY; buf[1] = 0.1;
buf[2] = 0.5; buf[2] = 0.2;
} }
static __global__ void FillNAN(platform::float16* buf) { static __global__ void FillNAN(platform::float16* buf) {
...@@ -44,6 +44,18 @@ static __global__ void FillInf(platform::float16* buf) { ...@@ -44,6 +44,18 @@ static __global__ void FillInf(platform::float16* buf) {
buf[2] = 0.5; buf[2] = 0.5;
} }
static __global__ void FillFinite(float* buf) {
buf[0] = 0.0;
buf[1] = 0.1;
buf[2] = 0.2;
}
static __global__ void FillFinite(platform::float16* buf) {
buf[0] = 0.0;
buf[1] = 0.1;
buf[2] = 0.2;
}
TEST(TensorContainsNAN, GPU) { TEST(TensorContainsNAN, GPU) {
paddle::platform::CUDAPlace gpu(0); paddle::platform::CUDAPlace gpu(0);
auto& pool = paddle::platform::DeviceContextPool::Instance(); auto& pool = paddle::platform::DeviceContextPool::Instance();
...@@ -86,5 +98,163 @@ TEST(TensorContainsInf, GPU) { ...@@ -86,5 +98,163 @@ TEST(TensorContainsInf, GPU) {
} }
} }
TEST(TensorIsfinite, GPU) {
paddle::platform::CUDAPlace gpu(0);
using paddle::platform::float16;
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
// contains inf
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
// contains nan
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
EXPECT_TRUE(!TensorIsfinite(tensor));
}
// all element are finite
{
Tensor tensor;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
EXPECT_TRUE(TensorIsfinite(tensor));
}
{
Tensor tensor;
float16* buf = tensor.mutable_data<float16>({3}, gpu);
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
EXPECT_TRUE(TensorIsfinite(tensor));
}
}
TEST(TensorContainsInf, GPUWithoutWait) {
paddle::platform::CUDAPlace gpu(0);
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
TensorContainsInf(tensor, &out);
platform::CPUPlace cpu;
Tensor tmp;
TensorCopy(out, cpu, *cuda_ctx, &tmp);
cuda_ctx->Wait();
ASSERT_EQ(tmp.data<bool>()[0], true);
}
{
Tensor tensor, out;
paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
TensorContainsInf(tensor, &out);
platform::CPUPlace cpu;
Tensor tmp;
TensorCopy(out, cpu, *cuda_ctx, &tmp);
cuda_ctx->Wait();
ASSERT_EQ(tmp.data<bool>()[0], true);
}
}
TEST(TensorContainsNAN, GPUWithoutWait) {
paddle::platform::CUDAPlace gpu(0);
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
TensorContainsNAN(tensor, &out);
platform::CPUPlace cpu;
Tensor tmp;
TensorCopy(out, cpu, *cuda_ctx, &tmp);
cuda_ctx->Wait();
ASSERT_EQ(tmp.data<bool>()[0], true);
}
{
Tensor tensor, out;
paddle::platform::float16* buf =
tensor.mutable_data<paddle::platform::float16>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
TensorContainsNAN(tensor, &out);
platform::CPUPlace cpu;
Tensor tmp;
TensorCopy(out, cpu, *cuda_ctx, &tmp);
cuda_ctx->Wait();
ASSERT_EQ(tmp.data<bool>()[0], true);
}
}
TEST(TensorIsfinite, GPUWithoutWait) {
paddle::platform::CUDAPlace gpu(0);
auto& pool = paddle::platform::DeviceContextPool::Instance();
auto* cuda_ctx = pool.GetByPlace(gpu);
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillInf<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
TensorIsfinite(tensor, &out);
platform::CPUPlace cpu;
Tensor tmp;
TensorCopy(out, cpu, *cuda_ctx, &tmp);
cuda_ctx->Wait();
EXPECT_EQ(tmp.data<bool>()[0], false);
}
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillNAN<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
TensorIsfinite(tensor, &out);
platform::CPUPlace cpu;
Tensor tmp;
TensorCopy(out, cpu, *cuda_ctx, &tmp);
cuda_ctx->Wait();
EXPECT_EQ(tmp.data<bool>()[0], false);
}
{
Tensor tensor, out;
float* buf = tensor.mutable_data<float>({3}, gpu);
FillFinite<<<1, 1, 0, cuda_ctx->stream()>>>(buf);
cuda_ctx->Wait();
TensorIsfinite(tensor, &out);
platform::CPUPlace cpu;
Tensor tmp;
TensorCopy(out, cpu, *cuda_ctx, &tmp);
cuda_ctx->Wait();
EXPECT_EQ(tmp.data<bool>()[0], true);
}
}
} // namespace framework } // namespace framework
} // namespace paddle } // namespace paddle
...@@ -22,8 +22,8 @@ limitations under the License. */ ...@@ -22,8 +22,8 @@ limitations under the License. */
#include <algorithm> #include <algorithm>
#include <memory> #include <memory>
#include <thread> //NOLINT #include <thread> //NOLINT
#include "paddle/fluid/inference/paddle_inference_api.h" #include "paddle/fluid/inference/paddle_inference_api.h"
#include "paddle/fluid/platform/enforce.h"
DEFINE_string(dirname, "", "Directory of the inference model."); DEFINE_string(dirname, "", "Directory of the inference model.");
DEFINE_bool(use_gpu, false, "Whether use gpu."); DEFINE_bool(use_gpu, false, "Whether use gpu.");
...@@ -62,17 +62,17 @@ void Main(bool use_gpu) { ...@@ -62,17 +62,17 @@ void Main(bool use_gpu) {
CHECK(predictor->Run(slots, &outputs)); CHECK(predictor->Run(slots, &outputs));
//# 4. Get output. //# 4. Get output.
PADDLE_ENFORCE(outputs.size(), 1UL); CHECK_EQ(outputs.size(), 1UL);
// Check the output buffer size and result of each tid. // Check the output buffer size and result of each tid.
PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); CHECK_EQ(outputs.front().data.length(), 33168UL);
float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
0.000932706}; 0.000932706};
const size_t num_elements = outputs.front().data.length() / sizeof(float); const size_t num_elements = outputs.front().data.length() / sizeof(float);
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements); for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
i++) { i++) {
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i], CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i], result[i],
result[i]); 0.001);
} }
} }
} }
...@@ -108,9 +108,9 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -108,9 +108,9 @@ void MainThreads(int num_threads, bool use_gpu) {
CHECK(predictor->Run(inputs, &outputs)); CHECK(predictor->Run(inputs, &outputs));
// 4. Get output. // 4. Get output.
PADDLE_ENFORCE(outputs.size(), 1UL); CHECK_EQ(outputs.size(), 1UL);
// Check the output buffer size and result of each tid. // Check the output buffer size and result of each tid.
PADDLE_ENFORCE(outputs.front().data.length(), 33168UL); CHECK_EQ(outputs.front().data.length(), 33168UL);
float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815, float result[5] = {0.00129761, 0.00151112, 0.000423564, 0.00108815,
0.000932706}; 0.000932706};
const size_t num_elements = const size_t num_elements =
...@@ -118,8 +118,8 @@ void MainThreads(int num_threads, bool use_gpu) { ...@@ -118,8 +118,8 @@ void MainThreads(int num_threads, bool use_gpu) {
// The outputs' buffers are in CPU memory. // The outputs' buffers are in CPU memory.
for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements); for (size_t i = 0; i < std::min(static_cast<size_t>(5), num_elements);
i++) { i++) {
PADDLE_ENFORCE(static_cast<float*>(outputs.front().data.data())[i], CHECK_NEAR(static_cast<float*>(outputs.front().data.data())[i],
result[i]); result[i], 0.001);
} }
} }
}); });
......
...@@ -17,11 +17,12 @@ limitations under the License. */ ...@@ -17,11 +17,12 @@ limitations under the License. */
*/ */
#include <gflags/gflags.h> #include <gflags/gflags.h>
#include <glog/logging.h> // use glog instead of PADDLE_ENFORCE to avoid importing other paddle header files. #include <glog/logging.h> // use glog instead of CHECK to avoid importing other paddle header files.
#include <fstream> #include <fstream>
#include <iostream> #include <iostream>
// #include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/inference/demo_ci/utils.h" #include "paddle/fluid/inference/demo_ci/utils.h"
#include "paddle/fluid/platform/enforce.h"
#ifdef PADDLE_WITH_CUDA #ifdef PADDLE_WITH_CUDA
DECLARE_double(fraction_of_gpu_memory_to_use); DECLARE_double(fraction_of_gpu_memory_to_use);
...@@ -78,18 +79,17 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) { ...@@ -78,18 +79,17 @@ void CheckOutput(const std::string& referfile, const PaddleTensor& output) {
size_t numel = output.data.length() / PaddleDtypeSize(output.dtype); size_t numel = output.data.length() / PaddleDtypeSize(output.dtype);
VLOG(3) << "predictor output numel " << numel; VLOG(3) << "predictor output numel " << numel;
VLOG(3) << "reference output numel " << refer.data.size(); VLOG(3) << "reference output numel " << refer.data.size();
PADDLE_ENFORCE_EQ(numel, refer.data.size()); CHECK_EQ(numel, refer.data.size());
switch (output.dtype) { switch (output.dtype) {
case PaddleDType::INT64: { case PaddleDType::INT64: {
for (size_t i = 0; i < numel; ++i) { for (size_t i = 0; i < numel; ++i) {
PADDLE_ENFORCE_EQ(static_cast<int64_t*>(output.data.data())[i], CHECK_EQ(static_cast<int64_t*>(output.data.data())[i], refer.data[i]);
refer.data[i]);
} }
break; break;
} }
case PaddleDType::FLOAT32: case PaddleDType::FLOAT32:
for (size_t i = 0; i < numel; ++i) { for (size_t i = 0; i < numel; ++i) {
PADDLE_ENFORCE_LT( CHECK_LT(
fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]), fabs(static_cast<float*>(output.data.data())[i] - refer.data[i]),
1e-5); 1e-5);
} }
......
...@@ -27,9 +27,6 @@ void SetConfig(AnalysisConfig *cfg) { ...@@ -27,9 +27,6 @@ void SetConfig(AnalysisConfig *cfg) {
cfg->device = 0; cfg->device = 0;
cfg->enable_ir_optim = true; cfg->enable_ir_optim = true;
cfg->specify_input_name = true; cfg->specify_input_name = true;
#ifdef PADDLE_WITH_MKLDNN
cfg->_use_mkldnn = true;
#endif
} }
void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) { void SetInput(std::vector<std::vector<PaddleTensor>> *inputs) {
......
...@@ -80,7 +80,7 @@ class ActivationOp : public framework::OperatorWithKernel { ...@@ -80,7 +80,7 @@ class ActivationOp : public framework::OperatorWithKernel {
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
...@@ -91,12 +91,26 @@ class ActivationOp : public framework::OperatorWithKernel { ...@@ -91,12 +91,26 @@ class ActivationOp : public framework::OperatorWithKernel {
} }
}; };
class ActivationOpInferVarType : public framework::VarTypeInference {
public:
void operator()(const framework::OpDesc& op_desc,
framework::BlockDesc* block) const override {
auto x_name = op_desc.Input("X")[0];
auto out_name = op_desc.Output("Out")[0];
auto& x = block->FindRecursiveOrCreateVar(x_name);
auto& out = block->FindRecursiveOrCreateVar(out_name);
out.SetType(x.GetType());
out.SetDataType(x.GetDataType());
}
};
class ActivationOpGrad : public framework::OperatorWithKernel { class ActivationOpGrad : public framework::OperatorWithKernel {
public: public:
using framework::OperatorWithKernel::OperatorWithKernel; using framework::OperatorWithKernel::OperatorWithKernel;
void InferShape(framework::InferShapeContext* ctx) const override { void InferShape(framework::InferShapeContext* ctx) const override {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("Out")); ctx->ShareDim("Out", framework::GradVarName("X"));
ctx->ShareLoD("Out", framework::GradVarName("X"));
} }
protected: protected:
...@@ -525,12 +539,14 @@ namespace ops = paddle::operators; ...@@ -525,12 +539,14 @@ namespace ops = paddle::operators;
#define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ #define REGISTER_INPLACE_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \
REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
::paddle::operators::OP_NAME##OpMaker, \ ::paddle::operators::OP_NAME##OpMaker, \
::paddle::operators::ActivationOpInferVarType, \
::paddle::operators::OP_NAME##GradMaker); \ ::paddle::operators::OP_NAME##GradMaker); \
REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad) REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
#define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \ #define REGISTER_ACTIVATION_OP(OP_NAME, KERNEL_TYPE) \
REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \ REGISTER_OPERATOR(KERNEL_TYPE, ::paddle::operators::ActivationOp, \
::paddle::operators::OP_NAME##OpMaker, \ ::paddle::operators::OP_NAME##OpMaker, \
::paddle::operators::ActivationOpInferVarType, \
::paddle::framework::DefaultGradOpDescMaker<true>); \ ::paddle::framework::DefaultGradOpDescMaker<true>); \
REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad) REGISTER_OPERATOR(KERNEL_TYPE##_grad, ::paddle::operators::ActivationOpGrad)
......
...@@ -42,8 +42,8 @@ class ArgsortOp : public framework::OperatorWithKernel { ...@@ -42,8 +42,8 @@ class ArgsortOp : public framework::OperatorWithKernel {
"-rank(Input(X)) (%d).", "-rank(Input(X)) (%d).",
axis, num_dims); axis, num_dims);
ctx->SetOutputDim("Out", in_dims); ctx->ShareDim("X", "Out");
ctx->SetOutputDim("Indices", in_dims); ctx->ShareDim("X", "Indices");
ctx->ShareLoD("X", "Out"); ctx->ShareLoD("X", "Out");
ctx->ShareLoD("X", "Indices"); ctx->ShareLoD("X", "Indices");
} }
......
...@@ -44,7 +44,7 @@ class ConvShiftOp : public framework::OperatorWithKernel { ...@@ -44,7 +44,7 @@ class ConvShiftOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE_LE(y_dims[1], x_dims[1], PADDLE_ENFORCE_LE(y_dims[1], x_dims[1],
"The 2nd dimension of Input(Y) should be less than or " "The 2nd dimension of Input(Y) should be less than or "
"equal to the 2nd dimension of Input(X)."); "equal to the 2nd dimension of Input(X).");
ctx->SetOutputDim("Out", x_dims); ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
}; };
......
...@@ -22,6 +22,7 @@ ...@@ -22,6 +22,7 @@
#include <cub/cub.cuh> // NOLINT #include <cub/cub.cuh> // NOLINT
#include "paddle/fluid/framework/tensor.h" #include "paddle/fluid/framework/tensor.h"
#include "paddle/fluid/framework/tensor_util.h"
namespace paddle { namespace paddle {
namespace operators { namespace operators {
...@@ -293,7 +294,12 @@ void TensorReduce(const framework::Tensor& x, framework::Tensor* y, ...@@ -293,7 +294,12 @@ void TensorReduce(const framework::Tensor& x, framework::Tensor* y,
} }
auto x_data = x.data<Tx>(); auto x_data = x.data<Tx>();
auto y_data = y->mutable_data<Ty>(x.place()); auto y_data = y->mutable_data<Ty>(x.place());
if (reduce_num == 1) return; if (reduce_num == 1) {
auto out_dims = y->dims();
framework::TensorCopy(x, y->place(), y);
y->Resize(out_dims);
return;
}
#define CUB_BLOCK_DIM_CASE(block_dim) \ #define CUB_BLOCK_DIM_CASE(block_dim) \
case block_dim: { \ case block_dim: { \
......
...@@ -41,7 +41,8 @@ class ElementwiseOp : public framework::OperatorWithKernel { ...@@ -41,7 +41,8 @@ class ElementwiseOp : public framework::OperatorWithKernel {
auto y_dim = ctx->GetInputDim("Y"); auto y_dim = ctx->GetInputDim("Y");
PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(), PADDLE_ENFORCE_GE(x_dim.size(), y_dim.size(),
"Rank of first input must >= rank of second input."); "Rank of first input must >= rank of second input.");
ctx->SetOutputDim("Out", x_dim);
ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
...@@ -70,6 +71,7 @@ class ElementwiseOpInferVarType : public framework::VarTypeInference { ...@@ -70,6 +71,7 @@ class ElementwiseOpInferVarType : public framework::VarTypeInference {
auto& x = block->FindRecursiveOrCreateVar(x_name); auto& x = block->FindRecursiveOrCreateVar(x_name);
auto& out = block->FindRecursiveOrCreateVar(out_name); auto& out = block->FindRecursiveOrCreateVar(out_name);
out.SetType(x.GetType()); out.SetType(x.GetType());
out.SetDataType(x.GetDataType());
} }
}; };
...@@ -157,10 +159,12 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel { ...@@ -157,10 +159,12 @@ class ElementwiseOpGrad : public framework::OperatorWithKernel {
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
auto y_grad_name = framework::GradVarName("Y"); auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(x_grad_name)) { if (ctx->HasOutput(x_grad_name)) {
ctx->SetOutputDim(x_grad_name, x_dims); ctx->ShareDim("X", /*->*/ x_grad_name);
ctx->ShareLoD("X", /*->*/ x_grad_name);
} }
if (ctx->HasOutput(y_grad_name)) { if (ctx->HasOutput(y_grad_name)) {
ctx->SetOutputDim(y_grad_name, y_dims); ctx->ShareDim("Y", /*->*/ y_grad_name);
ctx->ShareLoD("Y", /*->*/ y_grad_name);
} }
} }
...@@ -193,14 +197,15 @@ class ElementwiseOpExplicitGrad : public ElementwiseOpGrad { ...@@ -193,14 +197,15 @@ class ElementwiseOpExplicitGrad : public ElementwiseOpGrad {
auto x_grad_name = framework::GradVarName("X"); auto x_grad_name = framework::GradVarName("X");
if (ctx->HasOutput(x_grad_name)) { if (ctx->HasOutput(x_grad_name)) {
auto out_dims = ctx->GetInputDim(framework::GradVarName("Out")); ctx->ShareDim(framework::GradVarName("Out"), /*->*/ x_grad_name);
ctx->SetOutputDim(x_grad_name, out_dims); ctx->ShareLoD(framework::GradVarName("Out"), /*->*/ x_grad_name);
} }
auto y_grad_name = framework::GradVarName("Y"); auto y_grad_name = framework::GradVarName("Y");
if (ctx->HasOutput(y_grad_name)) { if (ctx->HasOutput(y_grad_name)) {
PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null"); PADDLE_ENFORCE(ctx->HasInput("Y"), "Input(Y) should not be null");
auto y_dims = ctx->GetInputDim("Y");
ctx->SetOutputDim(y_grad_name, y_dims); ctx->ShareDim("Y", /*->*/ y_grad_name);
ctx->ShareLoD("Y", /*->*/ y_grad_name);
} }
} }
}; };
......
...@@ -48,7 +48,8 @@ class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel { ...@@ -48,7 +48,8 @@ class FakeDequantizeMaxAbsOp : public framework::OperatorWithKernel {
"Input(X) of FakeDequantizeMaxAbsOp should not be null."); "Input(X) of FakeDequantizeMaxAbsOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of FakeDequantizeMaxAbsOp should not be null."); "Output(Out) of FakeDequantizeMaxAbsOp should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
}; };
......
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/operators/isfinite_op.h"
#include <string>
#include <vector>
namespace paddle {
namespace operators {
class OverflowOp : public framework::OperatorWithKernel {
public:
OverflowOp(const std::string &type, const framework::VariableNameMap &inputs,
const framework::VariableNameMap &outputs,
const framework::AttributeMap &attrs)
: OperatorWithKernel(type, inputs, outputs, attrs) {}
void InferShape(framework::InferShapeContext *ctx) const override {
PADDLE_ENFORCE(ctx->HasInputs("X"), "Inputs(X) should not be null");
PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of OverflowOp should not be null.");
ctx->SetOutputDim("Out", {1});
}
protected:
framework::OpKernelType GetExpectedKernelType(
const framework::ExecutionContext &ctx) const override {
int dtype = -1;
auto *x_var = ctx.InputVar("X");
if (x_var->IsType<framework::LoDTensor>()) {
dtype = framework::ToDataType(x_var->Get<framework::LoDTensor>().type());
} else if (x_var->IsType<framework::SelectedRows>()) {
dtype = framework::ToDataType(
x_var->Get<framework::SelectedRows>().value().type());
} else {
PADDLE_THROW("Cannot find the input data type by all input data");
}
return framework::OpKernelType(framework::proto::VarType::Type(dtype),
ctx.GetPlace());
}
};
class OverflowOpMaker : public framework::OpProtoAndCheckerMaker {
public:
void Make() override {
AddInput("X", "(Tensor) The input tensors of overflow operator.");
AddOutput("Out",
"(Tensor) 1-dim tensor, contains a bool scalar. The output "
"tensor of overflow operator.");
AddComment(string::Sprintf(R"DOC(
Overflow operator.
$$Out = any(X)$$
If any X contains Inf or Nan, Out will contain an indicator.
Out = Inf if any X contains Inf,
Out = Nan if any X contains Nan,
Out = 0 if no Inf/Nan is detected.
If X contains both Inf and Nan, it returns the first indicator it meets.
)DOC",
GetName(), GetComments()));
}
protected:
virtual std::string GetName() const = 0;
virtual std::string GetComments() const = 0;
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;
#define REGISTER_OP_MAKER(op_type, comment) \
namespace paddle { \
namespace operators { \
class _##op_type##OverflowOpMaker \
: public ::paddle::operators::OverflowOpMaker { \
protected: \
std::string GetName() const { return #op_type; } \
std::string GetComments() const { return comment; } \
}; \
} \
} \
REGISTER_OPERATOR(op_type, ops::OverflowOp, \
ops::_##op_type##OverflowOpMaker, \
paddle::framework::EmptyGradOpMaker)
#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor) \
REGISTER_OP_CPU_KERNEL( \
op_type, ops::OverflowKernel<paddle::platform::CPUDeviceContext, int, \
ops::functor>, \
ops::OverflowKernel<paddle::platform::CPUDeviceContext, float, \
ops::functor>, \
ops::OverflowKernel<paddle::platform::CPUDeviceContext, double, \
ops::functor>);
REGISTER_OP_MAKER(isinf, "isinf(X)");
REGISTER_OP_MAKER(isnan, "isnan(X)");
REGISTER_OP_MAKER(isfinite, "isfinite(X)");
FOR_EACH_KERNEL_FUNCTOR(REGISTER_OVERFLOW_CPU_KERNEL);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#define EIGEN_USE_GPU
#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/float16.h"
namespace ops = paddle::operators;
namespace plat = paddle::platform;
#define REGISTER_OVERFLOW_CUDA_KERNEL(op_type, functor) \
REGISTER_OP_CUDA_KERNEL( \
op_type, ops::OverflowKernel<paddle::platform::CUDADeviceContext, int, \
ops::functor>, \
ops::OverflowKernel<paddle::platform::CUDADeviceContext, float, \
ops::functor>, \
ops::OverflowKernel<paddle::platform::CUDADeviceContext, double, \
ops::functor>, \
ops::OverflowKernel<paddle::platform::CUDADeviceContext, plat::float16, \
ops::functor>);
FOR_EACH_KERNEL_FUNCTOR(REGISTER_OVERFLOW_CUDA_KERNEL);
// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <vector>
#include "paddle/fluid/framework/eigen.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h"
namespace paddle {
namespace operators {
struct InfinityFunctor {
void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
framework::TensorContainsInf(tensor, out);
}
};
struct NANFunctor {
void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
framework::TensorContainsNAN(tensor, out);
}
};
struct IsfiniteFunctor {
void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
framework::TensorIsfinite(tensor, out);
}
};
template <typename DeviceContext, typename T, typename Functor>
class OverflowKernel : public framework::OpKernel<T> {
public:
virtual void Compute(const framework::ExecutionContext& ctx) const {
auto* x = ctx.InputVar("X");
auto* out = ctx.Output<framework::Tensor>("Out");
out->mutable_data<T>(ctx.GetPlace());
Functor functor;
if (x->IsType<framework::LoDTensor>()) {
auto* in = ctx.Input<framework::Tensor>("X");
functor(*in, out);
} else if (x->IsType<framework::SelectedRows>()) {
auto& in = ctx.Input<framework::SelectedRows>("X")->value();
functor(in, out);
} else {
PADDLE_THROW("Unsupported input type.");
}
}
};
} // namespace operators
} // namespace paddle
#define FOR_EACH_KERNEL_FUNCTOR(__macro) \
__macro(isinf, InfinityFunctor); \
__macro(isnan, NANFunctor); \
__macro(isfinite, IsfiniteFunctor);
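For readability, FOR_EACH_KERNEL_FUNCTOR(REGISTER_OVERFLOW_CPU_KERNEL) in isfinite_op.cc above expands, roughly (whitespace simplified), into one registration per functor:

REGISTER_OP_CPU_KERNEL(
    isinf,
    ops::OverflowKernel<paddle::platform::CPUDeviceContext, int, ops::InfinityFunctor>,
    ops::OverflowKernel<paddle::platform::CPUDeviceContext, float, ops::InfinityFunctor>,
    ops::OverflowKernel<paddle::platform::CPUDeviceContext, double, ops::InfinityFunctor>);
// ... and analogously for isnan with NANFunctor and isfinite with IsfiniteFunctor.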
...@@ -137,6 +137,7 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference { ...@@ -137,6 +137,7 @@ class LookupTableOpGradVarTypeInference : public framework::VarTypeInference {
<< " is set to LoDTensor"; << " is set to LoDTensor";
block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR); block->Var(out_var_name)->SetType(framework::proto::VarType::LOD_TENSOR);
} }
block->Var(out_var_name)->SetDataType(block->Var("W")->GetDataType());
} }
}; };
......
...@@ -49,7 +49,7 @@ class PReluOp : public framework::OperatorWithKernel { ...@@ -49,7 +49,7 @@ class PReluOp : public framework::OperatorWithKernel {
} else { } else {
PADDLE_THROW("Unkown mode %s", mode); PADDLE_THROW("Unkown mode %s", mode);
} }
ctx->SetOutputDim("Out", x_dim); ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
......
...@@ -54,7 +54,7 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase { ...@@ -54,7 +54,7 @@ class RNNMemoryHelperOpShapeInference : public framework::InferShapeBase {
"Input(X) of rnn_memory_helper op should not be null."); "Input(X) of rnn_memory_helper op should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output of rnn_memory_helper op should not be null."); "Output of rnn_memory_helper op should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X")); ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
}; };
......
...@@ -90,8 +90,8 @@ class SequenceConvGradOp : public framework::OperatorWithKernel { ...@@ -90,8 +90,8 @@ class SequenceConvGradOp : public framework::OperatorWithKernel {
ctx->GetInputDim("PaddingData")); ctx->GetInputDim("PaddingData"));
} }
if (ctx->HasOutput(framework::GradVarName("X"))) { if (ctx->HasOutput(framework::GradVarName("X"))) {
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
ctx->ShareLoD("X", framework::GradVarName("X")); ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
} }
if (ctx->HasOutput(framework::GradVarName("Filter"))) { if (ctx->HasOutput(framework::GradVarName("Filter"))) {
ctx->SetOutputDim(framework::GradVarName("Filter"), ctx->SetOutputDim(framework::GradVarName("Filter"),
......
...@@ -102,8 +102,9 @@ class SequencePoolGradOp : public framework::OperatorWithKernel { ...@@ -102,8 +102,9 @@ class SequencePoolGradOp : public framework::OperatorWithKernel {
for (int64_t i = 1; i < og_dims.size(); ++i) { for (int64_t i = 1; i < og_dims.size(); ++i) {
PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch."); PADDLE_ENFORCE_EQ(og_dims[i], x_dims[i], "The dimension mismatch.");
} }
ctx->SetOutputDim(framework::GradVarName("X"), x_dims);
ctx->ShareLoD("X", framework::GradVarName("X")); ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
} }
protected: protected:
......
...@@ -92,7 +92,7 @@ class SequenceReshapeGradOp : public framework::OperatorWithKernel { ...@@ -92,7 +92,7 @@ class SequenceReshapeGradOp : public framework::OperatorWithKernel {
PADDLE_ENFORCE(ctx->HasInput("X"), PADDLE_ENFORCE(ctx->HasInput("X"),
"Input(X) of SequenceReshapeGradOp should not be null."); "Input(X) of SequenceReshapeGradOp should not be null.");
ctx->SetOutputDim(framework::GradVarName("X"), ctx->GetInputDim("X")); ctx->ShareDim("X", /*->*/ framework::GradVarName("X"));
ctx->ShareLoD("X", /*->*/ framework::GradVarName("X")); ctx->ShareLoD("X", /*->*/ framework::GradVarName("X"));
} }
}; };
......
...@@ -27,7 +27,8 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel { ...@@ -27,7 +27,8 @@ class SequenceSoftmaxOp : public framework::OperatorWithKernel {
"Input(X) of SequenceSoftmaxOp should not be null."); "Input(X) of SequenceSoftmaxOp should not be null.");
PADDLE_ENFORCE(ctx->HasOutput("Out"), PADDLE_ENFORCE(ctx->HasOutput("Out"),
"Output(Out) of SequenceSoftmaxOp should not be null."); "Output(Out) of SequenceSoftmaxOp should not be null.");
ctx->SetOutputDim("Out", ctx->GetInputDim("X"));
ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
......
...@@ -151,9 +151,9 @@ class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase { ...@@ -151,9 +151,9 @@ class ShrinkRNNMemoryGradInferShape : public framework::InferShapeBase {
void operator()(framework::InferShapeContext *context) const override { void operator()(framework::InferShapeContext *context) const override {
PADDLE_ENFORCE(context->HasInput("X")); PADDLE_ENFORCE(context->HasInput("X"));
PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X"))); PADDLE_ENFORCE(context->HasOutput(framework::GradVarName("X")));
context->SetOutputDim(framework::GradVarName("X"),
context->GetInputDim("X")); context->ShareDim("X", /*->*/ framework::GradVarName("X"));
context->ShareLoD("X", framework::GradVarName("X")); context->ShareLoD("X", /*->*/ framework::GradVarName("X"));
} }
}; };
......
...@@ -40,7 +40,7 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel { ...@@ -40,7 +40,7 @@ class SigmoidCrossEntropyWithLogitsOp : public framework::OperatorWithKernel {
"The 2nd dimension of Input(X) and Input(Label) should " "The 2nd dimension of Input(X) and Input(Label) should "
"be equal."); "be equal.");
ctx->SetOutputDim("Out", x_dims); ctx->ShareDim("X", /*->*/ "Out");
ctx->ShareLoD("X", /*->*/ "Out"); ctx->ShareLoD("X", /*->*/ "Out");
} }
}; };
......
...@@ -620,7 +620,23 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -620,7 +620,23 @@ All parameter, weight, gradient are variables in Paddle.
// -- python binds for parallel executor. // -- python binds for parallel executor.
py::class_<ParallelExecutor> pe(m, "ParallelExecutor"); py::class_<ParallelExecutor> pe(m, "ParallelExecutor");
py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy"); py::class_<ExecutionStrategy> exec_strategy(pe, "ExecutionStrategy", R"DOC(
ExecutionStrategy allows the user to more precisely control how to run
the program in ParallelExecutor by setting its properties.
The available properties include:
use_cuda (bool): Whether to use CUDA or not. Default True.
num_threads (int): The number of threads used to run the
operators in ParallelExecutor. If it is not set, ParallelExecutor
will set it according to the device count.
Default 0.
allow_op_delay (bool): Whether to delay the communication operators
to run. Default False.
num_iteration_per_drop_scope (int): the number of iterations between
dropping the local scopes. Default 100.
)DOC");
exec_strategy.def(py::init()) exec_strategy.def(py::init())
.def_property( .def_property(
"num_threads", "num_threads",
...@@ -658,7 +674,25 @@ All parameter, weight, gradient are variables in Paddle. ...@@ -658,7 +674,25 @@ All parameter, weight, gradient are variables in Paddle.
: ExecutionStrategy::kDefault; : ExecutionStrategy::kDefault;
}); });
py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy"); py::class_<BuildStrategy> build_strategy(pe, "BuildStrategy", R"DOC(
BuildStrategy allows the user to more precisely control how to
build the SSA Graph in ParallelExecutor by setting its properties.
The available properties include:
reduce_strategy (str): There are two reduce strategies, 'AllReduce'
and 'Reduce'. If you want all parameters to be optimized
on all devices, choose 'AllReduce'; if you choose
'Reduce', the parameters are evenly partitioned across the
devices for optimization, and the optimized parameters are
then broadcast to the other devices. Default 'AllReduce'.
gradient_scale_strategy (str): There are two ways of defining loss@grad,
'CoeffNumDevice' and 'Customized'. By default, ParallelExecutor
sets loss@grad according to the number of devices. If you want
to customize loss@grad, choose 'Customized'.
Default 'CoeffNumDevice'.
debug_graphviz_path (str): The path to which the SSA Graph is written in
graphviz format; useful for debugging. Default "".
)DOC");
py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy") py::enum_<BuildStrategy::ReduceStrategy>(build_strategy, "ReduceStrategy")
.value("Reduce", BuildStrategy::ReduceStrategy::kReduce) .value("Reduce", BuildStrategy::ReduceStrategy::kReduce)
......
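For reference, a minimal Python sketch of how these two strategies are typically configured and handed to ParallelExecutor; it assumes a toy regression network built in the default program, and the layer sizes, thread count, and CPU place are illustrative, not part of this change:

import paddle.fluid as fluid

# build a small network so there is a loss to parallelize
x = fluid.layers.data(name='x', shape=[13], dtype='float32')
y = fluid.layers.data(name='y', shape=[1], dtype='float32')
pred = fluid.layers.fc(input=x, size=1)
loss = fluid.layers.mean(fluid.layers.square_error_cost(input=pred, label=y))
fluid.optimizer.SGD(learning_rate=0.01).minimize(loss)

exec_strategy = fluid.ExecutionStrategy()
exec_strategy.num_threads = 2                       # 0 lets ParallelExecutor pick a value
exec_strategy.num_iteration_per_drop_scope = 100    # drop local scopes every 100 iterations

build_strategy = fluid.BuildStrategy()
build_strategy.reduce_strategy = fluid.BuildStrategy.ReduceStrategy.Reduce
build_strategy.debug_graphviz_path = ""             # set a path here to dump the SSA Graph

exe = fluid.Executor(fluid.CPUPlace())
exe.run(fluid.default_startup_program())
pe = fluid.ParallelExecutor(use_cuda=False,
                            loss_name=loss.name,
                            exec_strategy=exec_strategy,
                            build_strategy=build_strategy)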
...@@ -600,7 +600,7 @@ EOF ...@@ -600,7 +600,7 @@ EOF
if [[ ${WITH_GPU} == "ON" ]]; then if [[ ${WITH_GPU} == "ON" ]]; then
NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} || true" NCCL_DEPS="apt-get install -y --allow-downgrades libnccl2=2.2.13-1+cuda${CUDA_MAJOR} libnccl-dev=2.2.13-1+cuda${CUDA_MAJOR} || true"
else else
NCCL_DEPS="" NCCL_DEPS="true"
fi fi
if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then if [[ ${WITH_FLUID_ONLY:-OFF} == "OFF" ]]; then
......
...@@ -60,7 +60,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ...@@ -60,7 +60,7 @@ add_custom_command(OUTPUT ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel COMMAND env ${py_env} ${PYTHON_EXECUTABLE} setup.py bdist_wheel
COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp COMMAND ${CMAKE_COMMAND} -E touch ${PADDLE_PYTHON_BUILD_DIR}/.timestamp
COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND ${CMAKE_COMMAND} -E remove_directory ${PADDLE_PYTHON_BUILD_DIR}/lib-python
COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib* ${PADDLE_PYTHON_BUILD_DIR}/lib-python COMMAND ${CMAKE_COMMAND} -E copy_directory ${PADDLE_PYTHON_BUILD_DIR}/lib.* ${PADDLE_PYTHON_BUILD_DIR}/lib-python
DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER}) DEPENDS gen_proto_py copy_paddle_pybind ${FLUID_CORE} framework_py_proto profiler_py_proto ${PY_FILES} ${external_project_dependencies} ${COPY_PADDLE_MASTER})
set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS}) set(paddle_python_deps ${PADDLE_PYTHON_BUILD_DIR}/.timestamp ${MKL_DEPENDS})
......
...@@ -35,16 +35,15 @@ import itertools ...@@ -35,16 +35,15 @@ import itertools
import functools import functools
from .common import download from .common import download
import tarfile import tarfile
import six
import scipy.io as scio import scipy.io as scio
from paddle.dataset.image import * from paddle.dataset.image import *
from paddle.reader import * from paddle.reader import *
from paddle import compat as cpt
import os import os
import numpy as np import numpy as np
from multiprocessing import cpu_count from multiprocessing import cpu_count
import six import six
from six.moves import cPickle as pickle from six.moves import cPickle as pickle
from six.moves import zip
__all__ = ['train', 'test', 'valid'] __all__ = ['train', 'test', 'valid']
DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz' DATA_URL = 'http://paddlemodels.cdn.bcebos.com/flowers/102flowers.tgz'
...@@ -126,9 +125,11 @@ def reader_creator(data_file, ...@@ -126,9 +125,11 @@ def reader_creator(data_file,
batch = pickle.load(f) batch = pickle.load(f)
else: else:
batch = pickle.load(f, encoding='bytes') batch = pickle.load(f, encoding='bytes')
if six.PY3:
batch = cpt.to_text(batch)
data = batch['data'] data = batch['data']
labels = batch['label'] labels = batch['label']
for sample, label in zip(data, batch['label']): for sample, label in six.moves.zip(data, batch['label']):
yield sample, int(label) - 1 yield sample, int(label) - 1
if not cycle: if not cycle:
break break
......
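The hunk above follows the usual Python 2/3 pickle-compatibility pattern; a standalone sketch of the same pattern is below (the file name batch_0.pkl is hypothetical):

import six
from six.moves import cPickle as pickle
from paddle import compat as cpt

with open('batch_0.pkl', 'rb') as f:   # hypothetical pickled batch file
    if six.PY2:
        batch = pickle.load(f)
    else:
        # Python 3 needs encoding='bytes' for pickles written by Python 2;
        # cpt.to_text then turns the loaded bytes back into text, as in the hunk above
        batch = pickle.load(f, encoding='bytes')
        batch = cpt.to_text(batch)
data, labels = batch['data'], batch['label']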
...@@ -24,21 +24,10 @@ from .layer_function_generator import templatedoc ...@@ -24,21 +24,10 @@ from .layer_function_generator import templatedoc
import numpy import numpy
__all__ = [ __all__ = [
'create_tensor', 'create_tensor', 'create_parameter', 'create_global_var', 'cast', 'concat',
'create_parameter', 'sums', 'assign', 'fill_constant_batch_size_like', 'fill_constant',
'create_global_var', 'argmin', 'argmax', 'argsort', 'ones', 'zeros', 'reverse', 'has_inf',
'cast', 'has_nan', 'isfinite'
'concat',
'sums',
'assign',
'fill_constant_batch_size_like',
'fill_constant',
'argmin',
'argmax',
'argsort',
'ones',
'zeros',
'reverse',
] ]
...@@ -652,3 +641,52 @@ def load_combine(out, file_path): ...@@ -652,3 +641,52 @@ def load_combine(out, file_path):
inputs={}, inputs={},
output={"Out": out}, output={"Out": out},
args={"file_path": file_path}) args={"file_path": file_path})
def has_inf(x):
"""
Test if any element of x is infinity.
Args:
x (Variable): The Tensor/LoDTensor to be checked.
Returns:
Variable: A tensor variable holding the result, a single bool value.
"""
helper = LayerHelper("isinf", **locals())
out = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(type="isinf", inputs={"X": x}, outputs={"Out": out})
return out
def has_nan(x):
"""
Test if any element of x is NaN.
Args:
x (Variable): The Tensor/LoDTensor to be checked.
Returns:
Variable: A tensor variable holding the result, a single bool value.
"""
helper = LayerHelper("isnan", **locals())
out = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(type="isnan", inputs={"X": x}, outputs={"Out": out})
return out
def isfinite(x):
"""
Test whether all elements of x are finite. Returns True if x contains
no infinity and no NaN, otherwise False.
Args:
x (Variable): The Tensor/LoDTensor to be checked.
Returns:
Variable: A tensor variable holding the result, a single bool value.
"""
helper = LayerHelper("isfinite", **locals())
out = helper.create_tmp_variable(dtype=x.dtype)
helper.append_op(type="isfinite", inputs={"X": x}, outputs={"Out": out})
return out
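A minimal usage sketch for the three new layers, checking a fed array for bad values; the tensor shape and the values fed in are illustrative:

import numpy as np
import paddle.fluid as fluid

x = fluid.layers.data(name='x', shape=[2, 3], dtype='float32', append_batch_size=False)
inf_flag = fluid.layers.has_inf(x)
nan_flag = fluid.layers.has_nan(x)
finite_flag = fluid.layers.isfinite(x)

exe = fluid.Executor(fluid.CPUPlace())
feed_x = np.array([[1.0, np.inf, 3.0], [4.0, 5.0, np.nan]]).astype('float32')
inf_v, nan_v, finite_v = exe.run(fluid.default_main_program(),
                                 feed={'x': feed_x},
                                 fetch_list=[inf_flag, nan_flag, finite_flag])
# inf_v and nan_v should be truthy and finite_v falsy, since feed_x holds an inf and a NaN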
...@@ -74,7 +74,7 @@ def create_lod_tensor(data, recursive_seq_lens, place): ...@@ -74,7 +74,7 @@ def create_lod_tensor(data, recursive_seq_lens, place):
assert [ assert [
new_recursive_seq_lens new_recursive_seq_lens
] == recursive_seq_lens, "data and recursive_seq_lens do not match" ] == recursive_seq_lens, "data and recursive_seq_lens do not match"
flattened_data = np.concatenate(data, axis=0).astype("int64") flattened_data = np.concatenate(data, axis=0)
flattened_data = flattened_data.reshape([len(flattened_data), 1]) flattened_data = flattened_data.reshape([len(flattened_data), 1])
return create_lod_tensor(flattened_data, recursive_seq_lens, place) return create_lod_tensor(flattened_data, recursive_seq_lens, place)
elif isinstance(data, np.ndarray): elif isinstance(data, np.ndarray):
......
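For context, a small sketch of the two input forms create_lod_tensor accepts; after this change, list input keeps the dtype numpy infers instead of being cast to int64 (the values below are illustrative):

import numpy as np
import paddle.fluid as fluid

place = fluid.CPUPlace()

# a nested list: flattened with np.concatenate, dtype inferred by numpy
t1 = fluid.create_lod_tensor([[1, 2], [3, 4, 5]], [[2, 3]], place)

# an already-flattened numpy array plus the recursive sequence lengths
data = np.random.rand(5, 1).astype('float32')
t2 = fluid.create_lod_tensor(data, [[2, 3]], place)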
...@@ -16,6 +16,8 @@ from __future__ import print_function ...@@ -16,6 +16,8 @@ from __future__ import print_function
import unittest import unittest
import numpy as np import numpy as np
from op_test import OpTest from op_test import OpTest
import paddle.fluid.core as core
from paddle.fluid.op import Operator
class ElementwiseMulOp(OpTest): class ElementwiseMulOp(OpTest):
...@@ -115,5 +117,56 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp): ...@@ -115,5 +117,56 @@ class TestElementwiseMulOp_broadcast_3(ElementwiseMulOp):
} }
class TestElementWiseMulSelectedRows(OpTest):
def setUp(self):
self.rows = [0, 1, 2, 3, 4, 5, 6]
self.feature = 12
self.height = 100
self.input_shape = (len(self.rows), self.feature)
def prepare_input(self, scope, place):
self.input = {
"X": np.random.random(self.input_shape).astype("float32"),
"Y": np.random.random(self.input_shape).astype("float32")
}
def init_input(in_name):
x_selected_rows = scope.var(in_name).get_selected_rows()
x_selected_rows.set_height(self.height)
x_selected_rows.set_rows(self.rows)
x_array = self.input[in_name]
x_tensor = x_selected_rows.get_tensor()
x_tensor.set(x_array, place)
init_input("X")
init_input("Y")
def create_out_selected_row(self, scope):
return scope.var('Out').get_selected_rows()
def check_result(self, out_selected_rows):
assert out_selected_rows.height() == self.height
assert out_selected_rows.rows() == self.rows
out_tensor = np.array(out_selected_rows.get_tensor())
assert out_tensor.shape == self.input_shape
def check_with_place(self, place):
scope = core.Scope()
self.prepare_input(scope, place)
out_selected_rows = self.create_out_selected_row(scope)
out_selected_rows.set_height(0)
out_selected_rows.set_rows([])
# run the C++ operator directly on the SelectedRows inputs prepared above
elementwise_mul = Operator("elementwise_mul", X='X', Y='Y', Out='Out')
elementwise_mul.run(scope, place)
self.check_result(out_selected_rows)
def test_elewisemul_with_selected_rows_input(self):
places = [core.CPUPlace()]
for place in places:
self.check_with_place(place)
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import unittest
import numpy as np
from op_test import OpTest
class TestInf(OpTest):
def setUp(self):
self.op_type = "isinf"
self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
x[0] = np.inf
x[-1] = np.inf
self.inputs = {'X': x}
self.outputs = {'Out': np.array(True).astype(self.dtype)}
def init_dtype(self):
pass
def test_output(self):
self.check_output()
class TestFP16Inf(TestInf):
def init_dtype(self):
self.dtype = np.float16
class TestNAN(OpTest):
def setUp(self):
self.op_type = "isnan"
self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
x[0] = np.nan
x[-1] = np.nan
self.inputs = {'X': x}
self.outputs = {'Out': np.array(True).astype(self.dtype)}
def init_dtype(self):
pass
def test_output(self):
self.check_output()
class TestFP16NAN(TestNAN):
def init_dtype(self):
self.dtype = np.float16
class TestIsfinite(OpTest):
def setUp(self):
self.op_type = "isfinite"
self.dtype = np.float32
self.init_dtype()
x = np.random.uniform(0.1, 1, [11, 17]).astype(self.dtype)
x[0] = np.inf
x[-1] = np.nan
# x contains both inf and nan, so isfinite over the whole tensor is False
self.inputs = {'X': x}
self.outputs = {'Out': np.array(False).astype(self.dtype)}
def init_dtype(self):
pass
def test_output(self):
self.check_output()
class TestFP16Isfinite(TestIsfinite):
def init_dtype(self):
self.dtype = np.float16
if __name__ == '__main__':
unittest.main()
...@@ -243,5 +243,87 @@ class TestKeepDimReduceSumMultiAxises(OpTest): ...@@ -243,5 +243,87 @@ class TestKeepDimReduceSumMultiAxises(OpTest):
self.check_grad(['X'], 'Out') self.check_grad(['X'], 'Out')
class TestReduceSumWithDimOne(OpTest):
def setUp(self):
self.op_type = "reduce_sum"
self.inputs = {'X': np.random.random((10, 1, 1)).astype("float64")}
self.attrs = {'dim': [1, 2], 'keep_dim': True}
self.outputs = {
'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
keepdims=True)
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
class TestReduceSumWithNumelOne(OpTest):
def setUp(self):
self.op_type = "reduce_sum"
self.inputs = {'X': np.random.random((1, 1)).astype("float64")}
self.attrs = {'dim': [1], 'keep_dim': False}
self.outputs = {
'Out': self.inputs['X'].sum(axis=tuple(self.attrs['dim']),
keepdims=False)
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
class TestReduceMeanWithDimOne(OpTest):
def setUp(self):
self.op_type = "reduce_mean"
self.inputs = {'X': np.random.random((10, 1, 1)).astype("float64")}
self.attrs = {'dim': [1], 'keep_dim': False}
self.outputs = {
'Out': self.inputs['X'].mean(
axis=tuple(self.attrs['dim']), keepdims=False)
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
class TestReduceMeanWithNumelOne(OpTest):
def setUp(self):
self.op_type = "reduce_mean"
self.inputs = {'X': np.random.random((1, 1)).astype("float64")}
self.attrs = {'dim': [1], 'keep_dim': True}
self.outputs = {
'Out': self.inputs['X'].mean(
axis=tuple(self.attrs['dim']), keepdims=True)
}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
class TestReduceAll(OpTest):
def setUp(self):
self.op_type = "reduce_sum"
self.inputs = {'X': np.random.random((1, 1, 1)).astype("float64")}
self.attrs = {'reduce_all': True, 'keep_dim': False}
self.outputs = {'Out': self.inputs['X'].sum()}
def test_check_output(self):
self.check_output()
def test_check_grad(self):
self.check_grad(['X'], 'Out')
if __name__ == '__main__': if __name__ == '__main__':
unittest.main() unittest.main()