Unverified · Commit cb81befa · Authored by huangjiyi · Committed by GitHub

register fluid kernels to phi [part 6.5] (#52882)

* update

* fix bug

* update

* fix bug
Parent: bc91012f
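Every file below follows the same two-step conversion, so it is worth seeing once in isolation before reading the hunks. The fluid macros REGISTER_OP_CPU_KERNEL / REGISTER_OP_CUDA_KERNEL take one fully instantiated kernel type per dtype; PD_REGISTER_STRUCT_KERNEL instead takes the backend, the layout, the kernel class template, and a plain dtype list, and the kernel's template parameters swap from <DeviceContext, T> to <T, DeviceContext>. A minimal sketch, using a hypothetical my_op / MyOpKernel (only the macro shapes mirror the real diffs below):

// Before: fluid registration, device context first, one instantiation
// per supported dtype.
//
//   template <typename DeviceContext, typename T>
//   class MyOpKernel : public framework::OpKernel<T> { /* ... */ };
//
//   REGISTER_OP_CPU_KERNEL(my_op,
//                          ops::MyOpKernel<phi::CPUContext, float>,
//                          ops::MyOpKernel<phi::CPUContext, double>);

// After: phi struct-kernel registration, dtype first; the macro expands
// one registration per listed dtype, with backend and layout explicit.
//
//   template <typename T, typename DeviceContext>
//   class MyOpKernel : public framework::OpKernel<T> { /* ... */ };
//
//   PD_REGISTER_STRUCT_KERNEL(
//       my_op, CPU, ALL_LAYOUT, ops::MyOpKernel, float, double) {}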
@@ -2483,21 +2483,32 @@ Scope* OperatorWithKernel::PrepareData(
     }
     std::unique_ptr<phi::KernelKey> new_expected_kernel_key = nullptr;
-    if (run_phi_kernel_ && in_def != nullptr &&
-        in_def->backend != phi::Backend::ALL_BACKEND) {
-      auto tensor_backend = phi::TransToPhiBackend(tensor_in->place());
-      if ((in_def->backend != tensor_backend &&
-           !(in_def->backend == phi::Backend::GPUDNN &&
-             tensor_backend == phi::Backend::GPU) &&
-           !(in_def->backend == phi::Backend::KPS &&
-             tensor_backend == phi::Backend::XPU) &&
-           !(in_def->backend == phi::Backend::ONEDNN &&
-             tensor_backend == phi::Backend::CPU)) ||
-          tensor_in->place().GetType() == AllocationType::GPUPINNED) {
-        new_expected_kernel_key =
-            std::make_unique<phi::KernelKey>(in_def->backend,
-                                             expected_kernel_key.layout(),
-                                             expected_kernel_key.dtype());
+    if (run_phi_kernel_) {
+      if (phi_kernel_->GetKernelRegisteredType() ==
+          phi::KernelRegisteredType::STRUCTURE) {
+        if (!backends_are_same_class(kernel_type_for_var.backend(),
+                                     expected_kernel_key.backend())) {
+          new_expected_kernel_key =
+              std::make_unique<phi::KernelKey>(expected_kernel_key.backend(),
+                                               expected_kernel_key.layout(),
+                                               expected_kernel_key.dtype());
+        }
+      } else if (in_def != nullptr &&
+                 in_def->backend != phi::Backend::ALL_BACKEND) {
+        auto tensor_backend = phi::TransToPhiBackend(tensor_in->place());
+        if ((in_def->backend != tensor_backend &&
+             !(in_def->backend == phi::Backend::GPUDNN &&
+               tensor_backend == phi::Backend::GPU) &&
+             !(in_def->backend == phi::Backend::KPS &&
+               tensor_backend == phi::Backend::XPU) &&
+             !(in_def->backend == phi::Backend::ONEDNN &&
+               tensor_backend == phi::Backend::CPU)) ||
+            tensor_in->place().GetType() == AllocationType::GPUPINNED) {
+          new_expected_kernel_key =
+              std::make_unique<phi::KernelKey>(in_def->backend,
+                                               expected_kernel_key.layout(),
+                                               expected_kernel_key.dtype());
+        }
+      }
     }
......
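The hunk above is the runtime half of the change: kernels registered via PD_REGISTER_STRUCT_KERNEL report phi::KernelRegisteredType::STRUCTURE, and for those PrepareData compares backend classes of the whole kernel key via backends_are_same_class instead of consulting per-input backend defs, which struct kernels do not carry. The diff does not show backends_are_same_class itself; the sketch below only infers its behavior from the compatible backend pairs listed in the replaced check, and is an assumption rather than Paddle's actual implementation.

// Hypothetical sketch of backends_are_same_class, inferred from the
// backend pairs the removed per-input check treated as equivalent.
bool backends_are_same_class_sketch(phi::Backend a, phi::Backend b) {
  auto canonical = [](phi::Backend be) {
    switch (be) {
      case phi::Backend::GPUDNN:
        return phi::Backend::GPU;  // GPUDNN kernels consume GPU tensors
      case phi::Backend::ONEDNN:
        return phi::Backend::CPU;  // ONEDNN kernels consume CPU tensors
      case phi::Backend::KPS:
        return phi::Backend::XPU;  // KPS kernels consume XPU tensors
      default:
        return be;
    }
  };
  return canonical(a) == canonical(b);
}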
@@ -73,6 +73,8 @@ class MpAllReduceSumOpGradMaker : public framework::SingleGradOpMaker<T> {
 DECLARE_INPLACE_OP_INFERER(MpAllReduceSumInplaceInferer, {"X", "Out"});
+DEFINE_C_ALLREDUCE_CPU_KERNEL(MpAllReduceSum, kRedSum);
 }  // namespace operators
 }  // namespace paddle
@@ -86,9 +88,12 @@ REGISTER_OPERATOR(mp_allreduce_sum,
                   ops::MpAllReduceSumOpMaker,
                   ops::MpAllReduceSumInplaceInferer);
-REGISTER_OP_CPU_KERNEL(mp_allreduce_sum,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, float>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, double>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, int64_t>,
-                       ops::CAllReduceOpCPUKernel<ops::kRedSum, plat::float16>)
+PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::MpAllReduceSumCPUKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
+                          plat::float16) {}
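Note the brace pair that closes every PD_REGISTER_STRUCT_KERNEL call. By analogy with phi's PD_REGISTER_KERNEL, it is the body of a per-kernel customization hook that the macro expands to; every registration in this commit leaves it empty. A hedged sketch of a non-empty body, assuming the hook exposes the same kernel pointer as PD_REGISTER_KERNEL (some_op / SomeOpKernel are hypothetical, and this exact call is not part of the commit):

// Hypothetical: adjusting registered kernel metadata in the hook body.
PD_REGISTER_STRUCT_KERNEL(some_op, CPU, ALL_LAYOUT, ops::SomeOpKernel, float) {
  kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32);
}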
@@ -15,16 +15,24 @@
 #include "paddle/fluid/framework/op_registry.h"
 #include "paddle/fluid/operators/collective/c_allreduce_op.h"
 namespace paddle {
 namespace operators {
+DEFINE_C_ALLREDUCE_CUDA_KERNEL(MpAllReduceSum, kRedSum)
 }  // namespace operators
 }  // namespace paddle
 namespace ops = paddle::operators;
 namespace plat = paddle::platform;
-REGISTER_OP_CUDA_KERNEL(
-    mp_allreduce_sum,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, float>,
+PD_REGISTER_STRUCT_KERNEL(mp_allreduce_sum,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::MpAllReduceSumCUDAKernel,
+                          float,
+                          double,
+                          int,
+                          int64_t,
 #if NCCL_VERSION_CODE >= 21000
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::bfloat16>,
+                          plat::bfloat16,
 #endif
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, double>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, int64_t>,
-    ops::CAllReduceOpCUDAKernel<ops::kRedSum, plat::float16>)
+                          plat::float16) {
+}
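One detail in the GPU hunk above: the NCCL version guard now excludes a single plat::bfloat16 token from the macro's variadic dtype list instead of a whole CAllReduceOpCUDAKernel instantiation. This works because the preprocessor resolves the #if before PD_REGISTER_STRUCT_KERNEL is expanded, so on older NCCL versions the dtype simply drops out of the list.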
@@ -114,6 +114,5 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(iou_similarity,
-                       ops::IOUSimilarityKernel<phi::CPUContext, float>,
-                       ops::IOUSimilarityKernel<phi::CPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(
+    iou_similarity, CPU, ALL_LAYOUT, ops::IOUSimilarityKernel, float, double) {}
@@ -15,6 +15,5 @@ limitations under the License. */
 #include "paddle/fluid/operators/detection/iou_similarity_op.h"
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(iou_similarity,
-                        ops::IOUSimilarityKernel<phi::GPUContext, float>,
-                        ops::IOUSimilarityKernel<phi::GPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(
+    iou_similarity, GPU, ALL_LAYOUT, ops::IOUSimilarityKernel, float, double) {}
@@ -105,7 +105,7 @@ struct IOUSimilarityFunctor {
 namespace paddle {
 namespace operators {
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class IOUSimilarityKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -160,7 +160,7 @@ void GetMaxScoreIndexWithLocalityAware(
   }
 }
-template <typename T>
+template <typename T, typename DeviceContext>
 class LocalityAwareNMSKernel : public framework::OpKernel<T> {
  public:
   void LocalityAwareNMSFast(phi::DenseTensor* bbox,
@@ -520,6 +520,9 @@ REGISTER_OPERATOR(
     ops::LocalityAwareNMSOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(locality_aware_nms,
-                       ops::LocalityAwareNMSKernel<float>,
-                       ops::LocalityAwareNMSKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(locality_aware_nms,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::LocalityAwareNMSKernel,
+                          float,
+                          double) {}
@@ -49,7 +49,7 @@ inline MiningType GetMiningType(std::string str) {
   }
 }
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class MineHardExamplesKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -403,6 +403,9 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(mine_hard_examples,
-                       ops::MineHardExamplesKernel<phi::CPUContext, float>,
-                       ops::MineHardExamplesKernel<phi::CPUContext, double>);
+PD_REGISTER_STRUCT_KERNEL(mine_hard_examples,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::MineHardExamplesKernel,
+                          float,
+                          double) {}
@@ -143,7 +143,7 @@ void SliceOneClass(const platform::DeviceContext& ctx,
   }
 }
-template <typename T>
+template <typename T, typename DeviceContext>
 class MultiClassNMSKernel : public framework::OpKernel<T> {
  public:
   void NMSFast(const phi::DenseTensor& bbox,
@@ -629,6 +629,9 @@ class MultiClassNMS3OpMaker : public MultiClassNMS2OpMaker {
   }
 };
+template <typename T, typename DeviceContext>
+class MultiClassNMS2Kernel : public MultiClassNMSKernel<T, DeviceContext> {};
 }  // namespace operators
 }  // namespace paddle
@@ -643,18 +646,21 @@ REGISTER_OPERATOR(
     ops::MultiClassNMSOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(multiclass_nms,
-                       ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(
+    multiclass_nms, CPU, ALL_LAYOUT, ops::MultiClassNMSKernel, float, double) {}
 REGISTER_OPERATOR(
     multiclass_nms2,
     ops::MultiClassNMS2Op,
     ops::MultiClassNMS2OpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(multiclass_nms2,
-                       ops::MultiClassNMSKernel<float>,
-                       ops::MultiClassNMSKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(multiclass_nms2,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::MultiClassNMS2Kernel,
+                          float,
+                          double) {}
 REGISTER_OPERATOR(
     multiclass_nms3,
......
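The new MultiClassNMS2Kernel above is an empty subclass of MultiClassNMSKernel. The old fluid macros registered the same kernel class under both multiclass_nms and multiclass_nms2; the alias class presumably exists because the struct-kernel macro derives registration symbols from the kernel class name, so each op name needs a distinct class. That is an inference from the diff, not a documented constraint.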
@@ -270,7 +270,7 @@ __global__ void broadcast_batch_head_number(const T *src,
   }
 }
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext &context) const override {
@@ -423,12 +423,15 @@ class MultiHeadMatMulV2Kernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
+namespace plat = paddle::platform;
 #if defined(PADDLE_WITH_CUDA) && CUDA_VERSION >= 10000
-REGISTER_OP_CUDA_KERNEL(
-    multihead_matmul,
-    ops::MultiHeadMatMulV2Kernel<phi::GPUContext, paddle::platform::float16>,
-    ops::MultiHeadMatMulV2Kernel<phi::GPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(multihead_matmul,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::MultiHeadMatMulV2Kernel,
+                          float,
+                          plat::float16) {}
 #else
-REGISTER_OP_CUDA_KERNEL(multihead_matmul,
-                        ops::MultiHeadMatMulV2Kernel<phi::GPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(
+    multihead_matmul, GPU, ALL_LAYOUT, ops::MultiHeadMatMulV2Kernel, float) {}
 #endif
@@ -240,7 +240,7 @@ void MatchMatrixTensorOpMaker::Make() {
 )DOC");
 }
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -321,7 +321,7 @@ class CPUMatchMatrixTensorOPKernel : public framework::OpKernel<T> {
   }
 };
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class CPUMatchMatrixTensorOPGradKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -458,10 +458,13 @@ REGISTER_OPERATOR(
     ops::MatchMatrixTensorGradOpMaker<paddle::imperative::OpBase>);
 REGISTER_OPERATOR(match_matrix_tensor_grad, ops::MatchMatrixTensorOpGrad);
-REGISTER_OP_CPU_KERNEL(
-    match_matrix_tensor,
-    ops::CPUMatchMatrixTensorOPKernel<phi::CPUContext, float>);
-REGISTER_OP_CPU_KERNEL(
-    match_matrix_tensor_grad,
-    ops::CPUMatchMatrixTensorOPGradKernel<phi::CPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(match_matrix_tensor,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CPUMatchMatrixTensorOPKernel,
+                          float) {}
+PD_REGISTER_STRUCT_KERNEL(match_matrix_tensor_grad,
+                          CPU,
+                          ALL_LAYOUT,
+                          ops::CPUMatchMatrixTensorOPGradKernel,
+                          float) {}
@@ -107,7 +107,6 @@ REGISTER_OPERATOR(
     ops::MeanIoUOpMaker,
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>);
-REGISTER_OP_CPU_KERNEL(mean_iou,
-                       ops::MeanIoUKernel<int>,
-                       ops::MeanIoUKernel<int32_t>,
-                       ops::MeanIoUKernel<int64_t>);
+PD_REGISTER_STRUCT_KERNEL(
+    mean_iou, CPU, ALL_LAYOUT, ops::MeanIoUKernel, int, int64_t) {}
@@ -88,7 +88,7 @@ __global__ void ComputeIoUCUDAKernel(
   }
 }
-template <typename T>
+template <typename T, typename DeviceContext>
 class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -166,7 +166,5 @@ class MeanIoUCUDAOpKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(mean_iou,
-                        ops::MeanIoUCUDAOpKernel<int>,
-                        ops::MeanIoUCUDAOpKernel<int64_t>,
-                        ops::MeanIoUCUDAOpKernel<int32_t>);
+PD_REGISTER_STRUCT_KERNEL(
+    mean_iou, GPU, ALL_LAYOUT, ops::MeanIoUCUDAOpKernel, int, int64_t) {}
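Both mean_iou registrations also shrink their dtype lists: the old fluid lists named int, int32_t, and int64_t, but int and int32_t are the same type on the platforms Paddle builds for, so the old macros effectively listed one instantiation twice; the new lists keep just int and int64_t.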
@@ -27,7 +27,7 @@ template <typename T,
           typename IndexType = Eigen::DenseIndex>
 using EigenTensor = framework::EigenTensor<T, D, MajorType, IndexType>;
-template <typename T>
+template <typename T, typename DeviceContext>
 class MeanIoUKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
......
@@ -155,6 +155,8 @@ REGISTER_OPERATOR(minus,
                   ops::MinusOpMaker,
                   ops::MinusGradDescMaker,
                   ops::MinusGradMaker);
-REGISTER_OP_CPU_KERNEL(minus, ops::MinusKernel<phi::CPUContext, float>);
+PD_REGISTER_STRUCT_KERNEL(minus, CPU, ALL_LAYOUT, ops::MinusKernel, float) {}
-REGISTER_OP_CUDA_KERNEL(minus, ops::MinusKernel<phi::GPUContext, float>);
+#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
+PD_REGISTER_STRUCT_KERNEL(minus, GPU, ALL_LAYOUT, ops::MinusKernel, float) {}
+#endif
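minus is the one op here whose GPU kernel is registered from the same source file as its CPU kernel, which is presumably why the new GPU registration gains an explicit guard: the #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) block keeps the GPU struct-kernel registration out of CPU-only builds, and also covers HIP, which the old CUDA-only macro did not.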
@@ -20,7 +20,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class MinusKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& context) const override {
......
@@ -233,6 +233,6 @@ REGISTER_OPERATOR(
     paddle::framework::EmptyGradOpMaker<paddle::framework::OpDesc>,
     paddle::framework::EmptyGradOpMaker<paddle::imperative::OpBase>,
     ops::LarsMomentumOpVarTypeInference);
-REGISTER_OP_CPU_KERNEL(lars_momentum,
-                       ops::LarsMomentumOpKernel<float>,
-                       ops::LarsMomentumOpKernel<double>);
+PD_REGISTER_STRUCT_KERNEL(
+    lars_momentum, CPU, ALL_LAYOUT, ops::LarsMomentumOpKernel, float, double) {}
@@ -474,7 +474,7 @@ inline void SeparatedLarsMomentumOpCUDAKernel(const phi::GPUContext& cuda_ctx,
                                               is_amp);
 }
-template <typename DeviceContext, typename T>
+template <typename T, typename DeviceContext>
 class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
   using MT = MultiPrecisionType<T>;
@@ -679,8 +679,11 @@ class LarsMomentumOpCUDAKernel : public framework::OpKernel<T> {
 }  // namespace paddle
 namespace ops = paddle::operators;
-REGISTER_OP_CUDA_KERNEL(
-    lars_momentum,
-    ops::LarsMomentumOpCUDAKernel<phi::GPUContext, float>,
-    ops::LarsMomentumOpCUDAKernel<phi::GPUContext, double>,
-    ops::LarsMomentumOpCUDAKernel<phi::GPUContext, paddle::platform::float16>);
+namespace plat = paddle::platform;
+PD_REGISTER_STRUCT_KERNEL(lars_momentum,
+                          GPU,
+                          ALL_LAYOUT,
+                          ops::LarsMomentumOpCUDAKernel,
+                          float,
+                          double,
+                          plat::float16) {}
@@ -19,7 +19,7 @@ limitations under the License. */
 namespace paddle {
 namespace operators {
-template <typename T>
+template <typename T, typename DeviceContext>
 class LarsMomentumOpKernel : public framework::OpKernel<T> {
  public:
   void Compute(const framework::ExecutionContext& ctx) const override {
@@ -10,7 +10,6 @@ register_unity_group(
   lars_momentum_op.cc
   proximal_adagrad_op.cc
   adam_op.cc
-  dgc_momentum_op.cc
   proximal_gd_op.cc
   decayed_adagrad_op.cc
   adadelta_op.cc
......
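The cmake hunk drops dgc_momentum_op.cc from its unity-build group. This matches the pattern elsewhere in the series: files converted to PD_REGISTER_STRUCT_KERNEL tend to be pulled out of unity groups, presumably so the symbols the registration macro generates cannot collide when a unity build concatenates several sources into one translation unit. The dgc_momentum conversion itself is not visible in the excerpt above.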