Unverified commit 78916a7a authored by Leo Chen, committed by GitHub

make tensor_util contains no cuda code (#45256)

* make tensor_util contains no cuda code

* refine isfinite

* revert ut

* move isfinite function to its op

* fix test

* fix compile

* std::isnan is not defined for int type on windows

* fix windows compile

* fix fp16

* fix rocm compile

* revert gradient node
Parent 0d05e646
@@ -78,52 +78,31 @@ cc_test(
  data_type_test
  SRCS data_type_test.cc
  DEPS data_type place tensor)
-if(WITH_GPU)
-  if(WIN32)
-    windows_symbolic(tensor_util SRCS tensor_util.cu)
-    nv_library(
-      tensor
-      SRCS .tensor_util.cu
-      DEPS place memory data_type device_context dense_tensor)
-    add_dependencies(tensor tensor_util)
-  else()
-    nv_library(
-      tensor
-      SRCS tensor_util.cu
-      DEPS place memory data_type device_context dense_tensor)
-  endif()
-elseif(WITH_ROCM)
-  hip_library(
-    tensor
-    SRCS tensor_util.cu
-    DEPS place memory data_type device_context dense_tensor)
-else()
-  cc_library(
-    tensor
-    SRCS tensor_util.cc
-    DEPS place memory data_type device_context dense_tensor)
-endif()
-# target_link(tensor profiler)
+cc_library(
+  tensor
+  SRCS tensor_util.cc
+  DEPS place memory data_type device_context dense_tensor)
cc_test(
  tensor_test
  SRCS tensor_test.cc
-  DEPS tensor)
+  DEPS tensor isfinite_op)
if(WITH_GPU)
  nv_test(
    tensor_util_test
    SRCS tensor_util_test.cc tensor_util_test.cu
-    DEPS tensor dlpack_tensor)
+    DEPS tensor dlpack_tensor isfinite_op)
elseif(WITH_ROCM)
  hip_test(
    tensor_util_test
    SRCS tensor_util_test.cc tensor_util_test.cu
-    DEPS tensor dlpack_tensor)
+    DEPS tensor dlpack_tensor isfinite_op)
else()
  cc_test(
    tensor_util_test
    SRCS tensor_util_test.cc
-    DEPS tensor dlpack_tensor)
+    DEPS tensor dlpack_tensor isfinite_op)
endif()
cc_test(
@@ -204,7 +183,7 @@ cc_test(
cc_library(
  var_type_traits
  SRCS var_type_traits.cc
-  DEPS lod_tensor selected_rows_utils framework_proto scope)
+  DEPS selected_rows_utils framework_proto scope)
if(WITH_GPU)
  target_link_libraries(var_type_traits dynload_cuda)
endif()
......
@@ -83,6 +83,13 @@ struct DataTypeTrait<void> {
  _ForEachDataTypeHelper_( \
      callback, ::paddle::platform::complex<double>, COMPLEX128);
+#define _ForEachDataTypeNormal_(callback)                                \
+  _ForEachDataTypeHelper_(callback, float, FP32);                        \
+  _ForEachDataTypeHelper_(callback, double, FP64);                       \
+  _ForEachDataTypeHelper_(callback, int, INT32);                         \
+  _ForEachDataTypeHelper_(callback, int64_t, INT64);                     \
+  _ForEachDataTypeHelper_(callback, ::paddle::platform::float16, FP16);
// For the use of thrust, as index-type elements can be only integers.
#define _ForEachDataTypeTiny_(callback)          \
  _ForEachDataTypeHelper_(callback, int, INT32); \
@@ -148,6 +155,21 @@ inline void VisitDataTypeSmall(proto::VarType::Type type, Visitor visitor) {
#undef VisitDataTypeCallbackSmall
}
+// for normal dtype, int, int64, float, float64, float16
+template <typename Visitor>
+inline void VisitDataTypeNormal(proto::VarType::Type type, Visitor visitor) {
+#define VisitDataTypeCallbackNormal(cpp_type, proto_type) \
+  do {                                                    \
+    if (type == proto_type) {                             \
+      visitor.template apply<cpp_type>();                 \
+      return;                                             \
+    }                                                     \
+  } while (0)
+  _ForEachDataTypeNormal_(VisitDataTypeCallbackNormal);
+#undef VisitDataTypeCallbackNormal
+}
template <typename Visitor>
inline void VisitIntDataType(proto::VarType::Type type, Visitor visitor) {
#define VisitIntDataTypeCallback(cpp_type, proto_type) \
......
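A minimal usage sketch of the new `VisitDataTypeNormal` dispatcher, assuming the usual fluid header paths; the visitor type and the reporting function are hypothetical and only illustrate the pattern. Each `_ForEachDataTypeNormal_` entry expands to one `if (type == proto_type)` branch, so `apply<T>()` is instantiated only for float, double, int, int64_t and float16.

```cpp
#include <iostream>

#include "paddle/fluid/framework/convert_utils.h"  // TransToProtoVarType (assumed path)
#include "paddle/fluid/framework/data_type.h"
#include "paddle/fluid/framework/tensor.h"

// Hypothetical visitor: prints the element width of the tensor's dtype.
struct ElementSizeVisitor {
  template <typename T>
  void apply() const {
    std::cout << "element size: " << sizeof(T) << " bytes\n";
  }
};

// Hypothetical helper showing the dispatch call.
void ReportElementSize(const paddle::framework::Tensor& t) {
  paddle::framework::VisitDataTypeNormal(
      paddle::framework::TransToProtoVarType(t.dtype()), ElementSizeVisitor());
}
```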
@@ -15,6 +15,7 @@ limitations under the License. */
#if defined(PADDLE_WITH_PSCORE)
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/fleet/metrics.h"
+#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
namespace phi {
......
@@ -14,6 +14,7 @@ limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
#include "paddle/fluid/framework/fleet/metrics.h"
+#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
namespace phi {
......
@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/device_worker.h"
+#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/cpu_helper.h"
namespace paddle {
......
@@ -27,6 +27,7 @@ limitations under the License. */
#include "paddle/fluid/framework/transfer_scope_cache.h"
#include "paddle/fluid/framework/unused_var_check.h"
#include "paddle/fluid/framework/var_type.h"
+#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/device/device_wrapper.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/platform/profiler.h"
......
@@ -651,393 +651,6 @@ void TensorCopySync(const Tensor& src,
#endif
}
template <typename Predicate, typename DevCtx>
struct AnyDTypeVisitor {
Predicate predicate_;
const Tensor& tensor_;
const DevCtx& ctx_;
Tensor* out_;
AnyDTypeVisitor(Predicate predicate,
const Tensor& tensor,
const DevCtx& ctx,
Tensor* out)
: predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
template <typename T>
void apply() const {
auto t = EigenVector<T>::Flatten(tensor_);
auto o = EigenScalar<bool>::From(*out_);
// return any of predicate_(t) is true.
o.device(*ctx_.eigen_device()) = predicate_(t).any();
}
};
template <typename Predicate, typename DevCtx>
inline void AnyImpl(Predicate predicate,
const framework::Tensor& tensor,
const DevCtx& ctx,
framework::Tensor* out) {
VisitDataType(
framework::TransToProtoVarType(tensor.dtype()),
AnyDTypeVisitor<Predicate, DevCtx>(predicate, tensor, ctx, out));
}
template <typename Predicate>
class AnyVisitor : public std::unary_function<const Place&, bool> {
private:
const framework::Tensor& tensor_;
Predicate predicate_;
bool GetResultHelper(const framework::Tensor& out,
const platform::Place& place) const {
platform::CPUPlace cpu;
framework::Tensor tmp;
tmp.Resize({1});
tmp.mutable_data<bool>(cpu);
auto ctx = platform::DeviceContextPool::Instance().Get(place);
ctx->Wait();
TensorCopy(out, cpu, *ctx, &tmp);
ctx->Wait();
return GetResult(tmp, cpu);
}
public:
AnyVisitor(const framework::Tensor& tensor, Predicate predicate)
: tensor_(tensor), predicate_(std::move(predicate)) {}
template <typename Place>
bool operator()(const Place& place) const {
framework::Tensor out;
out.Resize({1});
out.mutable_data<bool>(place);
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
AnyImpl(predicate_, tensor_, *ctx, &out);
return this->GetResult(out, place);
}
bool GetResult(const framework::Tensor& out,
const platform::XPUPlace& xpu) const {
return GetResultHelper(out, xpu);
}
bool GetResult(const framework::Tensor& out,
const platform::MLUPlace& mlu) const {
PADDLE_THROW(
platform::errors::Unimplemented("Not supported on place (%s) ", mlu));
return true;
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPlace& gpu) const {
return GetResultHelper(out, gpu);
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPlace& npu) const {
PADDLE_THROW(
platform::errors::Unimplemented("Not supported on place (%s) ", npu));
// return GetResultHelper(out, npu);
}
bool GetResult(const framework::Tensor& out,
const platform::IPUPlace& ipu) const {
PADDLE_THROW(
platform::errors::Unimplemented("Not supported on place (%s) ", ipu));
}
bool GetResult(const framework::Tensor& out,
const platform::NPUPinnedPlace& cpu) const {
return *out.data<bool>();
}
bool GetResult(const framework::Tensor& out,
const platform::CPUPlace& cpu) const {
return *out.data<bool>();
}
bool GetResult(const framework::Tensor& out,
const platform::CUDAPinnedPlace& cpu) const {
return *out.data<bool>();
}
bool GetResult(const framework::Tensor& out,
const platform::CustomPlace& custom_dev) const {
PADDLE_THROW(platform::errors::Unimplemented("Not supported on place (%s) ",
custom_dev));
return false;
}
};
template <typename Predicate>
class AnyOutVisitor : public std::unary_function<const Place&, void> {
private:
const framework::Tensor& tensor_;
mutable framework::Tensor* out_;
Predicate predicate_;
public:
AnyOutVisitor(const framework::Tensor& tensor,
Predicate predicate,
framework::Tensor* out)
: tensor_(tensor), out_(out), predicate_(std::move(predicate)) {}
template <typename Place>
void operator()(const Place& place) const {
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
out_->Resize({1});
out_->mutable_data<bool>(place);
AnyImpl(predicate_, tensor_, *ctx, out_);
}
};
template <typename Predicate>
inline bool Any(const framework::Tensor& tensor, Predicate predicate) {
AnyVisitor<Predicate> visitor(tensor, predicate);
auto place = tensor.place();
return platform::VisitPlace(place, visitor);
}
template <typename Predicate>
inline void Any(const framework::Tensor& tensor,
Predicate predicate,
framework::Tensor* out) {
AnyOutVisitor<Predicate> visitor(tensor, predicate, out);
auto place = tensor.place();
platform::VisitPlace(place, visitor);
}
template <typename Predicate, typename DevCtx>
struct AllDTypeVisitor {
Predicate predicate_;
const Tensor& tensor_;
const DevCtx& ctx_;
Tensor* out_;
AllDTypeVisitor(Predicate predicate,
const Tensor& tensor,
const DevCtx& ctx,
Tensor* out)
: predicate_(predicate), tensor_(tensor), ctx_(ctx), out_(out) {}
template <typename T>
void apply() const {
auto t = EigenVector<T>::Flatten(tensor_);
auto o = EigenVector<bool>::Flatten(*out_);
o.device(*ctx_.eigen_device()) = predicate_(t);
}
};
template <typename Predicate, typename DevCtx>
inline void AllImpl(Predicate predicate,
const framework::Tensor& tensor,
const DevCtx& ctx,
framework::Tensor* out) {
VisitDataType(
framework::TransToProtoVarType(tensor.dtype()),
AllDTypeVisitor<Predicate, DevCtx>(predicate, tensor, ctx, out));
}
template <typename Predicate>
class AllOutVisitor : public std::unary_function<const Place&, void> {
private:
const framework::Tensor& tensor_;
mutable framework::Tensor* out_;
Predicate predicate_;
public:
AllOutVisitor(const framework::Tensor& tensor,
Predicate predicate,
framework::Tensor* out)
: tensor_(tensor), out_(out), predicate_(predicate) {}
template <typename Place>
void operator()(const Place& place) const {
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(place);
out_->Resize(tensor_.dims());
out_->mutable_data<bool>(place);
AllImpl(predicate_, tensor_, *ctx, out_);
}
};
template <typename Predicate>
inline void All(const framework::Tensor& tensor,
Predicate predicate,
framework::Tensor* out) {
AllOutVisitor<Predicate> visitor(tensor, predicate, out);
auto place = tensor.place();
platform::VisitPlace(place, visitor);
}
struct ContainsNANPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isnan()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isnan();
}
};
bool TensorContainsNAN(const framework::Tensor& tensor) {
ContainsNANPredicate predicate;
return Any(tensor, predicate);
}
void TensorContainsNAN(const framework::Tensor& tensor,
framework::Tensor* out) {
ContainsNANPredicate predicate;
Any(tensor, predicate, out);
}
void TensorContainsNANV2(const framework::Tensor& tensor,
framework::Tensor* out) {
ContainsNANPredicate predicate;
All(tensor, predicate, out);
}
struct ContainsInfPredicate {
template <typename T>
auto operator()(const T& eigen_vec) const
-> decltype(std::declval<T>().isinf()) {
// Cast eigen_vector to vector of bool. true if is inf.
return eigen_vec.isinf();
}
};
bool TensorContainsInf(const framework::Tensor& tensor) {
ContainsInfPredicate predicate;
return Any(tensor, predicate);
}
void TensorContainsInf(const framework::Tensor& tensor,
framework::Tensor* out) {
ContainsInfPredicate predicate;
Any(tensor, predicate, out);
}
void TensorContainsInfV2(const framework::Tensor& tensor,
framework::Tensor* out) {
ContainsInfPredicate predicate;
All(tensor, predicate, out);
}
// NOTE(dzhwinter):
// Isfinite need a AllVisitor to loop through all the elements.
// We choose two cuda call instead of one allvisitor. The AllVisitor
// should be implemented if the performance hurts.
bool TensorIsfinite(const framework::Tensor& tensor) {
ContainsInfPredicate pred_inf;
ContainsNANPredicate pred_nan;
return !Any(tensor, pred_inf) && !Any(tensor, pred_nan);
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
template <typename T>
static inline void __global__ BothFalse(const T* cmp, T* out, int element_num) {
CUDA_KERNEL_LOOP(i, element_num) { out[i] = (!cmp[i]) && (!out[i]); }
}
#endif
struct BothFalseVisitor : public std::unary_function<const Place&, void> {
const framework::Tensor& in_;
mutable framework::Tensor* out_;
BothFalseVisitor(const framework::Tensor& in, framework::Tensor* out)
: in_(in), out_(out) {}
template <typename Place>
void operator()(const Place& place) const {
VisitorImpl(place);
}
void VisitorImpl(const platform::XPUPlace& xpu) const {
PADDLE_THROW(platform::errors::Unimplemented("XPUPlace is not supported"));
}
void VisitorImpl(const platform::IPUPlace& ipu) const {
PADDLE_THROW(platform::errors::Unimplemented("IPUPlace is not supported"));
}
void VisitorImpl(const platform::CUDAPlace& gpu) const {
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
auto* ctx = platform::DeviceContextPool::Instance().GetByPlace(gpu);
constexpr int MAX_BLOCK_DIM = 512;
const int MAX_GRID_DIM = ctx->GetMaxPhysicalThreadCount() / MAX_BLOCK_DIM;
int element_num = in_.numel();
int block_size = (element_num >= MAX_BLOCK_DIM)
? MAX_BLOCK_DIM
: (1 << static_cast<int>(std::log2(element_num)));
int grid_size = element_num / block_size;
grid_size = (grid_size >= MAX_GRID_DIM) ? MAX_GRID_DIM : grid_size;
BothFalse<bool><<<grid_size, block_size, 0, ctx->stream()>>>(
in_.data<bool>(), out_->mutable_data<bool>(gpu), element_num);
#endif
}
void VisitorImpl(const platform::NPUPlace& npu) const {
// TODO(zhiqiu)
}
void VisitorImpl(const platform::MLUPlace& mlu) const {
PADDLE_THROW(platform::errors::Unimplemented("MLUPlace is not supported"));
}
void VisitorImpl(const platform::CPUPlace& cpu) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();
bool* out_ptr = out_->data<bool>();
for (int i = 0; i < num; ++i) {
bool lhs = !in_ptr[i];
bool rhs = !out_ptr[i];
out_ptr[i] = lhs && rhs;
}
}
void VisitorImpl(
const platform::CUDAPinnedPlace& cpu /* equals to cpu*/) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();
bool* out_ptr = out_->data<bool>();
for (int i = 0; i < num; ++i) {
bool lhs = !in_ptr[i];
bool rhs = !out_ptr[i];
out_ptr[i] = lhs && rhs;
}
}
void VisitorImpl(
const platform::NPUPinnedPlace& cpu /* equals to cpu*/) const {
int num = in_.numel();
const bool* in_ptr = in_.data<bool>();
bool* out_ptr = out_->data<bool>();
for (int i = 0; i < num; ++i) {
bool lhs = !in_ptr[i];
bool rhs = !out_ptr[i];
out_ptr[i] = lhs && rhs;
}
}
void VisitorImpl(const platform::CustomPlace& custom_dev) const {
PADDLE_THROW(
platform::errors::Unimplemented("CustomPlace is not supported"));
}
};
void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out) {
framework::Tensor tmp;
TensorContainsInf(tensor, &tmp);
TensorContainsNAN(tensor, out);
BothFalseVisitor visitor(tmp, out);
auto place = tensor.place();
platform::VisitPlace(place, visitor);
}
void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out) {
framework::Tensor tmp;
TensorContainsInfV2(tensor, &tmp);
TensorContainsNANV2(tensor, out);
BothFalseVisitor visitor(tmp, out);
auto place = tensor.place();
platform::VisitPlace(place, visitor);
}
void TensorToStream(std::ostream& os,
                    const Tensor& tensor,
                    const platform::DeviceContext& dev_ctx) {
......
@@ -112,16 +112,6 @@ void TensorToVector(const Tensor& src,
template <typename T>
void TesnorToVector(const Tensor& src, std::vector<T>* dst);
-// copy the result bool to cpu
-bool TensorContainsNAN(const framework::Tensor& tensor);
-bool TensorContainsInf(const framework::Tensor& tensor);
-bool TensorIsfinite(const framework::Tensor& tensor);
-
-// store the result bool in gpu tensor, async operation. Faster than above ones.
-void TensorContainsNAN(const framework::Tensor& tensor, framework::Tensor* out);
-void TensorContainsInf(const framework::Tensor& tensor, framework::Tensor* out);
-void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out);
void TensorToStream(std::ostream& os,
                    const Tensor& tensor,
                    const platform::DeviceContext& dev_ctx);
@@ -134,13 +124,6 @@ void TensorFromStream(std::istream& is,
                      const size_t& seek,
                      const std::vector<int64_t>& shape);
-// store the bool result tensor in out tensor
-void TensorContainsNANV2(const framework::Tensor& tensor,
-                         framework::Tensor* out);
-void TensorContainsInfV2(const framework::Tensor& tensor,
-                         framework::Tensor* out);
-void TensorIsfiniteV2(const framework::Tensor& tensor, framework::Tensor* out);
// convert dlpack's DLTensor to tensor
void TensorFromDLPack(const ::DLTensor& dl_tensor, framework::Tensor* dst);
@@ -601,6 +584,24 @@ inline Tensor ReshapeToMatrix(const Tensor& src, int num_col_dims) {
  return res;
}
template <typename T>
inline T GetValue(const framework::Tensor* x) {
T value = static_cast<T>(0);
if (!platform::is_cpu_place(x->place())) {
framework::Tensor cpu_x;
framework::TensorCopy(*x, platform::CPUPlace(), &cpu_x);
#if defined(PADDLE_WITH_ASCEND_CL) || defined(PADDLE_WITH_MLU)
platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance();
const platform::DeviceContext* dev_ctx = pool.Get(x->place());
dev_ctx->Wait();
#endif
value = cpu_x.data<T>()[0];
} else {
value = x->data<T>()[0];
}
return value;
}
}  // namespace framework
}  // namespace paddle
......
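The `GetValue<T>` helper added above copies the tensor to `CPUPlace` first when it lives on another device, then reads element 0. A minimal usage sketch follows; the wrapper function and its argument are hypothetical, added only for illustration.

```cpp
#include "paddle/fluid/framework/tensor_util.h"

// Hypothetical helper: read back a single bool flag that a kernel wrote on
// the device (e.g. the output of TensorIsfinite) without hand-writing the
// device-to-host copy; GetValue performs the copy for non-CPU places.
bool ReadFiniteFlag(const paddle::framework::Tensor& flag) {
  return paddle::framework::GetValue<bool>(&flag);
}
```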
@@ -13,8 +13,8 @@
// limitations under the License.
#include "paddle/fluid/framework/tensor_util.h"
#include <gtest/gtest.h>
+#include "paddle/fluid/operators/isfinite_op.h"
#include <cmath>
......
@@ -14,6 +14,7 @@
#include "paddle/fluid/framework/tensor_util.h"
#include "gtest/gtest.h"
+#include "paddle/fluid/operators/isfinite_op.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/place.h"
......
@@ -122,14 +122,6 @@ namespace ops = paddle::operators;
      paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,      \
      paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>)
-#define REGISTER_OVERFLOW_CPU_KERNEL(op_type, functor)                \
-  REGISTER_OP_CPU_KERNEL(                                             \
-      op_type,                                                        \
-      ops::OverflowKernel<phi::CPUContext, int, ops::functor>,        \
-      ops::OverflowKernel<phi::CPUContext, int64_t, ops::functor>,    \
-      ops::OverflowKernel<phi::CPUContext, float, ops::functor>,      \
-      ops::OverflowKernel<phi::CPUContext, double, ops::functor>);
REGISTER_OP_MAKER(isinf, "isinf(X)");
REGISTER_OP_MAKER(isnan, "isnan(X)");
REGISTER_OP_MAKER(isfinite, "isfinite(X)");
......
@@ -21,14 +21,128 @@
#include "paddle/fluid/framework/tensor_util.h"
#include "paddle/fluid/platform/float16.h"
#include "paddle/fluid/platform/transform.h"
+#include "paddle/phi/kernels/isfinite_kernel.h"
+#include "paddle/phi/kernels/reduce_all_kernel.h"
+#include "paddle/phi/kernels/reduce_any_kernel.h"
namespace phi {
class DenseTensor;
}  // namespace phi
namespace paddle {
-namespace operators {
+namespace framework {
// store the result bool in gpu tensor, async operation. Faster than above ones.
void TensorContainsNAN(const framework::Tensor& tensor, framework::Tensor* out);
void TensorContainsInf(const framework::Tensor& tensor, framework::Tensor* out);
void TensorIsfinite(const framework::Tensor& tensor, framework::Tensor* out);
// copy the result bool to cpu
bool TensorContainsNAN(const framework::Tensor& tensor);
bool TensorContainsInf(const framework::Tensor& tensor);
bool TensorIsfinite(const framework::Tensor& tensor);
#define FiniteVisitor(type, reduce_type, device) \
struct type##Visitor##device { \
type##Visitor##device(const phi::DenseTensor& in, phi::DenseTensor* out) \
: in_(in), out_(out) {} \
template <typename T> \
void apply() const { \
auto place = in_.place(); \
auto* ctx = static_cast<phi::device##Context*>( \
platform::DeviceContextPool::Instance().Get(place)); \
Tensor tmp; \
tmp.Resize(in_.dims()); \
out_->Resize({1}); \
std::vector<int64_t> dims(tmp.dims().size()); \
std::iota(dims.begin(), dims.end(), 0); \
phi::type##Kernel<T, phi::device##Context>(*ctx, in_, &tmp); \
phi::reduce_type##Kernel<bool, phi::device##Context>( \
*ctx, tmp, dims, false, out_); \
} \
const phi::DenseTensor& in_; \
phi::DenseTensor* out_; \
};
FiniteVisitor(Isnan, Any, CPU);
FiniteVisitor(Isinf, Any, CPU);
FiniteVisitor(Isfinite, All, CPU);
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
FiniteVisitor(Isnan, Any, GPU);
FiniteVisitor(Isinf, Any, GPU);
FiniteVisitor(Isfinite, All, GPU);
#endif
// store the result bool in gpu tensor, async operation. Faster than above ones.
inline void TensorContainsNAN(const framework::Tensor& tensor,
framework::Tensor* out) {
auto place = tensor.place();
if (platform::is_cpu_place(tensor.place())) {
VisitDataTypeNormal(TransToProtoVarType(tensor.dtype()),
IsnanVisitorCPU(tensor, out));
return;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
VisitDataTypeNormal(TransToProtoVarType(tensor.dtype()),
IsnanVisitorGPU(tensor, out));
return;
}
#endif
PADDLE_THROW(platform::errors::Unimplemented("Not supported on %s.", place));
}
inline void TensorContainsInf(const framework::Tensor& tensor,
framework::Tensor* out) {
auto place = tensor.place();
if (platform::is_cpu_place(tensor.place())) {
VisitDataTypeNormal(TransToProtoVarType(tensor.dtype()),
IsinfVisitorCPU(tensor, out));
return;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
VisitDataTypeNormal(TransToProtoVarType(tensor.dtype()),
IsinfVisitorGPU(tensor, out));
return;
}
#endif
PADDLE_THROW(platform::errors::Unimplemented("Not supported on %s.", place));
}
inline void TensorIsfinite(const framework::Tensor& tensor,
framework::Tensor* out) {
auto place = tensor.place();
if (platform::is_cpu_place(tensor.place())) {
VisitDataTypeNormal(TransToProtoVarType(tensor.dtype()),
IsfiniteVisitorCPU(tensor, out));
return;
}
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
if (platform::is_gpu_place(place)) {
VisitDataTypeNormal(TransToProtoVarType(tensor.dtype()),
IsfiniteVisitorGPU(tensor, out));
return;
}
#endif
PADDLE_THROW(platform::errors::Unimplemented("Not supported on %s.", place));
}
// copy the result bool to cpu
inline bool TensorContainsNAN(const framework::Tensor& tensor) {
Tensor out;
TensorContainsNAN(tensor, &out);
return GetValue<bool>(&out);
}
inline bool TensorContainsInf(const framework::Tensor& tensor) {
Tensor out;
TensorContainsInf(tensor, &out);
return GetValue<bool>(&out);
}
inline bool TensorIsfinite(const framework::Tensor& tensor) {
Tensor out;
TensorIsfinite(tensor, &out);
return GetValue<bool>(&out);
}
} // namespace framework
namespace operators {
struct InfinityFunctor {
  void operator()(const framework::Tensor& tensor, framework::Tensor* out) {
    framework::TensorContainsInf(tensor, out);
......
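For readability, this is what one instantiation of the `FiniteVisitor` macro above expands to: a manual expansion of `FiniteVisitor(Isnan, Any, CPU)` as it would appear inside `paddle::framework` in isfinite_op.h, not extra code from the commit. The elementwise `phi::IsnanKernel` fills a temporary boolean tensor, then `phi::AnyKernel` reduces it over every axis into the single-element output.

```cpp
// Manual expansion of FiniteVisitor(Isnan, Any, CPU), shown for illustration.
struct IsnanVisitorCPU {
  IsnanVisitorCPU(const phi::DenseTensor& in, phi::DenseTensor* out)
      : in_(in), out_(out) {}
  template <typename T>
  void apply() const {
    auto place = in_.place();
    auto* ctx = static_cast<phi::CPUContext*>(
        platform::DeviceContextPool::Instance().Get(place));
    Tensor tmp;              // holds the elementwise isnan result
    tmp.Resize(in_.dims());
    out_->Resize({1});
    std::vector<int64_t> dims(tmp.dims().size());
    std::iota(dims.begin(), dims.end(), 0);  // reduce over all axes
    phi::IsnanKernel<T, phi::CPUContext>(*ctx, in_, &tmp);
    phi::AnyKernel<bool, phi::CPUContext>(*ctx, tmp, dims, false, out_);
  }
  const phi::DenseTensor& in_;
  phi::DenseTensor* out_;
};
```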
@@ -72,6 +72,7 @@ class MemcpyD2HFunctor {
                  framework::LoDTensor &dst) const {  // NOLINT
    if (dst_place_type_ == 1) {
      framework::TensorCopy(src, platform::CUDAPinnedPlace(), dev_ctx_, &dst);
+      dev_ctx_.Wait();
    } else if (dst_place_type_ == 0) {
      framework::TensorCopy(src, platform::CPUPlace(), dev_ctx_, &dst);
    } else {
......
@@ -1028,6 +1028,10 @@ inline bool isnan(const phi::dtype::float16& a) { return phi::dtype::isnan(a); }
inline bool isinf(const phi::dtype::float16& a) { return phi::dtype::isinf(a); }
+inline bool isfinite(const phi::dtype::float16& a) {
+  return phi::dtype::isfinite(a);
+}
template <>
struct numeric_limits<phi::dtype::float16> {
  static const bool is_specialized = true;
......
@@ -21,6 +21,8 @@
#include "paddle/phi/kernels/empty_kernel.h"
#include "paddle/phi/kernels/funcs/eigen/common.h"
#include "paddle/phi/kernels/impl/amp_kernel_impl.h"
+#include "paddle/phi/kernels/isfinite_kernel.h"
+#include "paddle/phi/kernels/reduce_all_kernel.h"
#include "paddle/fluid/framework/tensor_util.h"
@@ -85,7 +87,13 @@ void CheckFiniteAndUnscaleKernel(const Context& dev_ctx,
    auto* out = outs[i];
    dev_ctx.template Alloc<T>(out);
    if (!(*found_inf_data)) {
-      paddle::framework::TensorIsfinite(*x, &is_finite);
+      DenseTensor tmp;
+      tmp.Resize(x->dims());
+      phi::IsfiniteKernel<T, Context>(dev_ctx, *x, &tmp);
+      std::vector<int64_t> dims(x->dims().size());
+      std::iota(dims.begin(), dims.end(), 0);
+      phi::AllKernel<bool, Context>(dev_ctx, tmp, dims, false, &is_finite);
      *found_inf_data = !(*is_finite_data);
    }
    auto eigen_out = EigenVector<T>::Flatten(*out);
......
@@ -18,19 +18,6 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/isfinite_kernel_impl.h"
-namespace phi {
-
-template <typename T, typename Context, typename Functor>
-inline void IsfiniteKernelImpl(const Context& dev_ctx,
-                               const DenseTensor& x,
-                               DenseTensor* out) {
-  dev_ctx.template Alloc<T>(out);
-  Functor functor;
-  functor(x, out);
-}
-
-}  // namespace phi
PD_REGISTER_KERNEL(isinf,
                   CPU,
                   ALL_LAYOUT,
......
@@ -9,7 +9,7 @@ math_library(fc_functor DEPS blas jit_kernel_helper)
math_library(gpc DEPS phi_enforce)
math_library(gru_compute DEPS activation_functions math_function)
math_library(lstm_compute DEPS activation_functions)
-math_library(math_function DEPS blas dense_tensor tensor)
+math_library(math_function DEPS blas dense_tensor)
math_library(matrix_reduce DEPS dense_tensor)
math_library(matrix_inverse DEPS dense_tensor eigen3 blas)
math_library(pooling DEPS dense_tensor)
......
@@ -14,30 +14,83 @@
#pragma once
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/framework/tensor_util.h"
-#include "paddle/phi/common/scalar.h"
-#include "paddle/phi/core/dense_tensor.h"
-#include "paddle/phi/infermeta/unary.h"
namespace phi {
namespace funcs {
-struct InfinityV2Functor {
-  void operator()(const DenseTensor& tensor, DenseTensor* out) {
-    paddle::framework::TensorContainsInfV2(tensor, out);
-  }
-};
-
-struct NANV2Functor {
-  void operator()(const DenseTensor& tensor, DenseTensor* out) {
-    paddle::framework::TensorContainsNANV2(tensor, out);
-  }
-};
-
-struct IsfiniteV2Functor {
-  void operator()(const DenseTensor& tensor, DenseTensor* out) {
-    paddle::framework::TensorIsfiniteV2(tensor, out);
-  }
-};
+template <typename T, class Enable = void>
+struct IsNanFunctor {
+  HOSTDEVICE bool operator()(const T& a) const {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+    return ::isnan(a);
+#else
+    return std::isnan(a);
+#endif
+  }
+};
+
+template <typename T>
+struct IsNanFunctor<T,
+                    typename std::enable_if<std::is_integral<T>::value>::type> {
+  HOSTDEVICE bool operator()(const T& a) const { return false; }
+};
+
+// isnan is defined in namespace std in float16.h, but
+// on rocm platform, it still got:
+// "error: call to 'isnan' is ambiguous".
+// So use phi::dtype::isnan here.
+template <>
+struct IsNanFunctor<phi::dtype::float16, void> {
+  HOSTDEVICE bool operator()(const phi::dtype::float16& a) const {
+    return phi::dtype::isnan(a);
+  }
+};
+
+template <typename T, class Enable = void>
+struct IsInfFunctor {
+  HOSTDEVICE bool operator()(const T& a) const {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+    return ::isinf(a);
+#else
+    return std::isinf(a);
+#endif
+  }
+};
+
+template <typename T>
+struct IsInfFunctor<T,
+                    typename std::enable_if<std::is_integral<T>::value>::type> {
+  HOSTDEVICE bool operator()(const T& a) const { return false; }
+};
+
+template <>
+struct IsInfFunctor<phi::dtype::float16, void> {
+  HOSTDEVICE bool operator()(const phi::dtype::float16& a) const {
+    return phi::dtype::isinf(a);
+  }
+};
+
+template <typename T, class Enable = void>
+struct IsFiniteFunctor {
+  HOSTDEVICE bool operator()(const T& a) const {
+#if defined(__CUDACC__) || defined(__HIPCC__)
+    return ::isfinite(a);
+#else
+    return std::isfinite(a);
+#endif
+  }
+};
+
+template <typename T>
+struct IsFiniteFunctor<
+    T,
+    typename std::enable_if<std::is_integral<T>::value>::type> {
+  HOSTDEVICE bool operator()(const T& a) const { return true; }
+};
+
+template <>
+struct IsFiniteFunctor<phi::dtype::float16, void> {
+  HOSTDEVICE bool operator()(const phi::dtype::float16& a) const {
+    return phi::dtype::isfinite(a);
+  }
+};
......
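A small host-side sketch of how the functor specializations above resolve; the `main` function and literal values are illustrative only. Floating-point types go through `std::isnan`/`std::isinf`/`std::isfinite` (or the global CUDA/HIP versions on device), integral types short-circuit to constants, which is exactly the Windows `std::isnan(int)` problem called out in the commit message, and `float16` routes to the `phi::dtype` overloads.

```cpp
#include <cstdio>
#include <limits>

#include "paddle/phi/kernels/funcs/isfinite_functor.h"

int main() {
  phi::funcs::IsNanFunctor<float> isnan_float;
  phi::funcs::IsNanFunctor<int> isnan_int;        // integral specialization
  phi::funcs::IsFiniteFunctor<int> isfinite_int;  // integral specialization

  std::printf("%d\n", isnan_float(std::numeric_limits<float>::quiet_NaN()));  // 1
  std::printf("%d\n", isnan_int(42));      // always 0: an int is never NaN
  std::printf("%d\n", isfinite_int(42));   // always 1: an int is always finite
  return 0;
}
```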
@@ -18,19 +18,6 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/isfinite_kernel_impl.h"
-namespace phi {
-
-template <typename T, typename Context, typename Functor>
-inline void IsfiniteKernelImpl(const Context& dev_ctx,
-                               const DenseTensor& x,
-                               DenseTensor* out) {
-  dev_ctx.template Alloc<T>(out);
-  Functor functor;
-  functor(x, out);
-}
-
-}  // namespace phi
PD_REGISTER_KERNEL(isinf,
                   GPU,
                   ALL_LAYOUT,
......
@@ -17,23 +17,24 @@
#include "paddle/phi/kernels/funcs/isfinite_functor.h"
#include "paddle/phi/kernels/isfinite_kernel.h"
+// See Note [ Why still include the fluid headers? ]
+#include "paddle/fluid/platform/transform.h"
namespace phi {
-template <typename T, typename Context, typename Functor>
-inline void IsfiniteKernelImpl(const Context& ctx,
-                               const DenseTensor& x,
-                               DenseTensor* out);
-
#define DEFINE_ISFINITE_KERNEL(isfinite_kernel, functor)                     \
  template <typename T, typename Context>                                    \
  void isfinite_kernel(                                                      \
      const Context& ctx, const DenseTensor& x, DenseTensor* out) {          \
-    IsfiniteKernelImpl<T, Context, functor>(ctx, x, out);                    \
+    auto* out_ptr = ctx.template Alloc<bool>(out);                           \
+    funcs::functor<T> unary_func;                                            \
+    paddle::platform::Transform<Context> trans;                              \
+    trans(ctx, x.data<T>(), x.data<T>() + x.numel(), out_ptr, unary_func);   \
  }
-DEFINE_ISFINITE_KERNEL(IsinfKernel, funcs::InfinityV2Functor)
-DEFINE_ISFINITE_KERNEL(IsnanKernel, funcs::NANV2Functor)
-DEFINE_ISFINITE_KERNEL(IsfiniteKernel, funcs::IsfiniteV2Functor)
+DEFINE_ISFINITE_KERNEL(IsinfKernel, IsInfFunctor)
+DEFINE_ISFINITE_KERNEL(IsnanKernel, IsNanFunctor)
+DEFINE_ISFINITE_KERNEL(IsfiniteKernel, IsFiniteFunctor)
#undef DEFINE_ISFINITE_KERNEL
}  // namespace phi
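Again for readability, a manual expansion of `DEFINE_ISFINITE_KERNEL(IsnanKernel, IsNanFunctor)` from the macro above, as it would appear inside `namespace phi`; it is not additional code from the commit. The kernel allocates a bool output and applies the elementwise functor across the flattened input with `paddle::platform::Transform`, which runs on CPU or GPU depending on `Context`.

```cpp
// Manual expansion of DEFINE_ISFINITE_KERNEL(IsnanKernel, IsNanFunctor),
// shown for illustration.
template <typename T, typename Context>
void IsnanKernel(const Context& ctx, const DenseTensor& x, DenseTensor* out) {
  auto* out_ptr = ctx.template Alloc<bool>(out);  // one bool per input element
  funcs::IsNanFunctor<T> unary_func;
  paddle::platform::Transform<Context> trans;
  trans(ctx, x.data<T>(), x.data<T>() + x.numel(), out_ptr, unary_func);
}
```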
@@ -19,21 +19,17 @@
namespace phi {
-template <typename T, typename Context, typename Functor>
-inline void IsfiniteSRImpl(const Context& ctx,
-                           const SelectedRows& x,
-                           SelectedRows* out);
-
-#define DEFINE_ISFINITE_SR(isfinite_sr, functor)                        \
+#define DEFINE_ISFINITE_SR(isfinite)                                    \
  template <typename T, typename Context>                                \
-  void isfinite_sr(                                                     \
+  void isfinite##SR(                                                    \
      const Context& ctx, const SelectedRows& x, SelectedRows* out) {    \
-    IsfiniteSRImpl<T, Context, functor>(ctx, x, out);                   \
+    ctx.template Alloc<bool>(out);                                      \
+    Isinf##Kernel<T, Context>(ctx, x.value(), out->mutable_value());    \
  }
-DEFINE_ISFINITE_SR(IsinfSR, funcs::InfinityV2Functor)
-DEFINE_ISFINITE_SR(IsnanSR, funcs::NANV2Functor)
-DEFINE_ISFINITE_SR(IsfiniteSR, funcs::IsfiniteV2Functor)
+DEFINE_ISFINITE_SR(Isinf)
+DEFINE_ISFINITE_SR(Isnan)
+DEFINE_ISFINITE_SR(Isfinite)
#undef DEFINE_ISFINITE_SR
}  // namespace phi
@@ -21,18 +21,6 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/selected_rows/impl/isfinite_kernel_impl.h"
-namespace phi {
-
-template <typename T, typename Context, typename Functor>
-inline void IsfiniteSRImpl(const Context& dev_ctx,
-                           const SelectedRows& x,
-                           SelectedRows* out) {
-  dev_ctx.template Alloc<T>(out);
-  Functor functor;
-  functor(x.value(), out->mutable_value());
-}
-
-}  // namespace phi
PD_REGISTER_KERNEL(isinf_sr,
                   CPU,
                   ALL_LAYOUT,
......
@@ -15,6 +15,7 @@ limitations under the License. */
#pragma once
#include "paddle/phi/core/selected_rows.h"
+#include "paddle/phi/kernels/isfinite_kernel.h"
namespace phi {
......