未验证 提交 782454bd 编写于 作者: Z zyfncg 提交者: GitHub

[PHI] Register custom kernel for all type of custom device (#51262)

* register custom kernel for all type of custom device

* fix bug

* fix GetKernelInputArgDef

* fix amp bug

* fix TransToPhiPlace

* adapt interpreter_util
上级 2847980c
......@@ -665,9 +665,17 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
bool should_skip_input =
no_buffer_ins && no_buffer_ins->count(parameter_name) > 0;
phi::TensorArgDef in_def = input_defs.at(i);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// When the backend of input tensor arg_def is CUSTOM, we need to set it
// to the actual backend by expected_kernel_key.
if (in_def.backend == phi::Backend::CUSTOM) {
in_def.SetBackend(phi::TransToPhiBackend(expected_kernel_key.place_));
}
#endif
apply_data_transform_for_one_parameter(parameter_name,
new_ins[parameter_name],
&input_defs.at(i),
&in_def,
should_skip_input,
&arguments);
}
......
......@@ -1090,7 +1090,9 @@ void FakeInitializeOutputsForFunctionKernel(
if (out_tensor && !out_tensor->initialized()) {
phi::TensorArgDef& tensor_arg_def = output_defs[start_idx + offset];
phi::DataType dtype = tensor_arg_def.dtype;
phi::Place place = phi::TransToPhiPlace(tensor_arg_def.backend);
phi::Place place = tensor_arg_def.backend == phi::Backend::CUSTOM
? dev_ctx.GetPlace()
: phi::TransToPhiPlace(tensor_arg_def.backend);
if (dtype == DataType::UNDEFINED ||
OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count(
......
......@@ -2650,7 +2650,6 @@ Scope* OperatorWithKernel::PrepareData(
input_names.size(),
input_defs.size()));
for (size_t i = 0; i < input_defs.size(); ++i) {
auto& in_def = input_defs.at(i);
std::string input_name = input_names[i];
auto iter = ctx->inputs.find(input_name);
if (iter == ctx->inputs.end()) {
......@@ -2659,6 +2658,15 @@ Scope* OperatorWithKernel::PrepareData(
auto& ins_vector = iter->second;
bool should_skip_input =
no_buffer_ins && no_buffer_ins->count(input_name) > 0;
phi::TensorArgDef in_def = input_defs.at(i);
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// When the backend of input tensor arg_def is CUSTOM, we need to set it
// to the actual backend by expected_kernel_key.
if (in_def.backend == phi::Backend::CUSTOM) {
in_def.SetBackend(expected_kernel_key.backend());
}
#endif
prepare_input_data(input_name, &ins_vector, &in_def, should_skip_input);
}
#ifdef PADDLE_WITH_MKLDNN
......
......@@ -78,14 +78,31 @@ OpSupportedInfos(const std::string& place,
}
}
#ifdef PADDLE_WITH_CUSTOM_DEVICE
auto is_custom_place = [&](const std::string& place) {
return is_target_place.count(place) && place != "CPU" && place != "GPU" &&
place != "XPU";
};
#endif
auto phi_kernels = phi::KernelFactory::Instance().kernels();
for (auto& kernel_pair : phi_kernels) {
auto op_type = phi::TransToFluidOpName(kernel_pair.first);
for (auto& info_pair : kernel_pair.second) {
framework::OpKernelType kernel_type =
framework::TransPhiKernelKeyToOpKernelType(info_pair.first);
if (is_target_place[query_place](kernel_type.place_) &&
kernel_type.data_type_ == dtype && all_ops.count(op_type)) {
if (dtype != framework::TransToProtoVarType(info_pair.first.dtype()) ||
all_ops.count(op_type) == 0) {
continue;
}
#ifdef PADDLE_WITH_CUSTOM_DEVICE
if (info_pair.first.backend() == phi::Backend::CUSTOM) {
if (is_custom_place(query_place)) {
VLOG(4) << op_type << " " << supported_ops.size();
supported_ops.emplace(op_type);
}
continue;
}
#endif
if (is_target_place[query_place](
phi::TransToPhiPlace(info_pair.first.backend(), false))) {
VLOG(4) << op_type << " " << supported_ops.size();
supported_ops.emplace(op_type);
}
......
......@@ -273,13 +273,16 @@ PD_REGISTER_GENERAL_KERNEL(
ALL_DTYPE) {}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// Explicitly instantiate the feed kernel for phi::CustomContext so that the
// symbol referenced by the registration macros below is emitted in this
// translation unit.
namespace paddle {
namespace operators {
template void FeedDenseTensorKernel<phi::CustomContext>(
const phi::CustomContext& dev_ctx,
const phi::ExtendedTensor& x,
int col,
phi::DenseTensor* out);
} // namespace operators
} // namespace paddle
// Register the feed kernels once for the generic `Custom` backend rather than
// once per registered custom device type; the kernel factory is expected to
// fall back to Backend::CUSTOM when a device-specific kernel is absent (see
// the KernelFactory::SelectKernel changes in this same commit).
PD_REGISTER_GENERAL_KERNEL(
feed_dense_tensor,
Custom,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>,
ALL_DTYPE) {}
PD_REGISTER_GENERAL_KERNEL(
feed_strings,
Custom,
ALL_LAYOUT,
paddle::operators::FeedStringsKernel<phi::CustomContext>,
ALL_DTYPE) {}
#endif
......@@ -87,11 +87,6 @@ void RegisterCustomDeviceCommonKernel(const std::string& dev_type) {
LoadCombineOpKernel<paddle::platform::CustomDeviceContext, int8_t>,
paddle::operators::
LoadCombineOpKernel<paddle::platform::CustomDeviceContext, int64_t>);
REGISTER_CUSTOM_DEVICE_GENERAL_KERNEL(
feed_dense_tensor,
device_type,
ALL_LAYOUT,
paddle::operators::FeedDenseTensorKernel<phi::CustomContext>);
#endif
}
......
......@@ -62,6 +62,19 @@ class TransformFlag {
bool trans_layout_ = true;
};
// Resolve the effective TensorArgDef for one kernel input.
//
// Kernels registered under the generic CUSTOM backend declare their inputs
// with Backend::CUSTOM as a placeholder; substitute the concrete backend
// selected for this kernel invocation so data transform targets a real
// place. For any other backend the arg def is returned unchanged.
static inline phi::TensorArgDef GetKernelInputArgDef(
    const phi::TensorArgDef& input_def, phi::Backend kernel_backend) {
  phi::TensorArgDef resolved_def = input_def;
#ifdef PADDLE_WITH_CUSTOM_DEVICE
  // Backend::CUSTOM kernels only exist in custom-device builds, so the
  // substitution is compiled out everywhere else.
  if (resolved_def.backend == phi::Backend::CUSTOM) {
    resolved_def.SetBackend(kernel_backend);
  }
#endif
  return resolved_def;
}
std::shared_ptr<phi::DenseTensor> PrepareData(
const Tensor& input,
const phi::TensorArgDef& target_args_def,
......
......@@ -701,7 +701,7 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
input_tensor_code = (
input_tensor_code
+ f"""
{code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, kernel.InputAt({kernel_param.index(input_name)}), {trans_flag});"""
{code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareData({input_name}, GetKernelInputArgDef(kernel.InputAt({kernel_param.index(input_name)}), kernel_backend), {trans_flag});"""
)
return input_tensor_code
......@@ -722,7 +722,7 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
input_tensor_code = (
input_tensor_code
+ f"""
{code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareDataForSelectedRows({input_name}, kernel.InputAt({kernel_param.index(input_name)}), {trans_flag});
{code_indent} auto {PREFIX_TENSOR_NAME}{input_name} = PrepareDataForSelectedRows({input_name}, GetKernelInputArgDef(kernel.InputAt({kernel_param.index(input_name)}), kernel_backend), {trans_flag});
"""
)
return input_tensor_code
......@@ -753,7 +753,7 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
input_tensor_code = (
input_tensor_code
+ f"""
{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_vec = PrepareData({input_name}, kernel.InputAt({kernel_param.index(input_name)}), {trans_flag});
{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_vec = PrepareData({input_name}, GetKernelInputArgDef(kernel.InputAt({kernel_param.index(input_name)}), kernel_backend), {trans_flag});
{code_indent} paddle::optional<std::vector<const phi::DenseTensor*>> {PREFIX_TENSOR_NAME}{input_name};
{code_indent} if ({PREFIX_TENSOR_NAME}{input_name}_vec){{
{code_indent} {PREFIX_TENSOR_NAME}{input_name} = paddle::optional<std::vector<const phi::DenseTensor*>>({PREFIX_TENSOR_NAME}{input_name}_vec->size());
......@@ -791,7 +791,7 @@ PADDLE_API {self.get_return_type(inplace_flag=True)} {api_func_name}({self.get_d
input_tensor_code = (
input_tensor_code
+ f"""
{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_vec = PrepareData({input_name}, kernel.InputAt({kernel_param.index(input_name)}), {trans_flag});
{code_indent} auto {PREFIX_TENSOR_NAME}{input_name}_vec = PrepareData({input_name}, GetKernelInputArgDef(kernel.InputAt({kernel_param.index(input_name)}), kernel_backend), {trans_flag});
{code_indent} std::vector<const phi::DenseTensor*> {PREFIX_TENSOR_NAME}{input_name}({PREFIX_TENSOR_NAME}{input_name}_vec->size());
{code_indent} for (size_t i = 0; i < {PREFIX_TENSOR_NAME}{input_name}.size(); ++i) {{
{code_indent} {PREFIX_TENSOR_NAME}{input_name}[i] = &{PREFIX_TENSOR_NAME}{input_name}_vec->at(i);
......
......@@ -134,6 +134,9 @@ inline std::ostream& operator<<(std::ostream& os, Backend backend) {
case Backend::IPU:
os << "IPU";
break;
case Backend::CUSTOM:
os << "CUSTOM";
break;
default: {
size_t device_type_id_ = static_cast<size_t>(backend) -
static_cast<size_t>(Backend::NUM_BACKENDS);
......@@ -181,6 +184,8 @@ inline Backend StringToBackend(const char* backend_cstr) {
#endif
} else if (s == std::string("IPU")) {
return Backend::IPU;
} else if (s == std::string("Custom")) {
return Backend::CUSTOM;
} else {
return static_cast<Backend>(static_cast<size_t>(Backend::NUM_BACKENDS) +
phi::CustomRegisteredDeviceMap::Instance()
......
......@@ -99,6 +99,8 @@ phi::Place TransToPhiPlace(const Backend& backend, bool set_device_id) {
return phi::CustomPlace(
device_type,
set_device_id ? phi::DeviceManager::GetDevice(device_type) : 0);
} else if (backend == Backend::CUSTOM) {
return phi::CustomPlace();
}
#endif
PADDLE_THROW(phi::errors::Unimplemented(
......
......@@ -93,6 +93,14 @@ const Kernel& KernelFactory::SelectKernel(const std::string& kernel_name,
kernel_key.backend(), phi::DataLayout::ALL_LAYOUT, kernel_key.dtype());
kernel_iter = iter->second.find(any_layout_kernel_key);
}
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
if (kernel_iter == iter->second.end() &&
kernel_key.backend() > phi::Backend::NUM_BACKENDS) {
kernel_iter = iter->second.find({phi::Backend::CUSTOM,
phi::DataLayout::ALL_LAYOUT,
kernel_key.dtype()});
}
#endif
if (kernel_iter == iter->second.end()) {
return empty_kernel;
......@@ -220,6 +228,12 @@ KernelResult KernelFactory::SelectKernelOrThrowError(
!phi::backends::xpu::is_xpu_support_op(TransToFluidOpName(kernel_name),
kernel_key.dtype())
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
if (kernel_iter == iter->second.end() &&
kernel_key.backend() > phi::Backend::NUM_BACKENDS) {
kernel_iter = iter->second.find({phi::Backend::CUSTOM,
phi::DataLayout::ALL_LAYOUT,
kernel_key.dtype()});
}
if (FLAGS_enable_api_kernel_fallback &&
(kernel_iter == iter->second.end() ||
phi::backends::custom_device::is_in_custom_black_list(
......
......@@ -61,15 +61,15 @@ struct KernelArgsParseFunctor<Return_ (*)(Args_...)> {
|| arg_type == std::type_index(typeid(const OneDNNContext&))
#endif
#if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP)
|| arg_type == std::type_index(typeid(const GPUContext&))) {
|| arg_type == std::type_index(typeid(const GPUContext&))
#elif defined(PADDLE_WITH_XPU) && !defined(PADDLE_WITH_XPU_KP)
|| arg_type == std::type_index(typeid(const XPUContext&))) {
|| arg_type == std::type_index(typeid(const XPUContext&))
#elif defined(PADDLE_WITH_XPU) && defined(PADDLE_WITH_XPU_KP)
|| arg_type == std::type_index(typeid(const KPSContext&))) {
#elif defined(PADDLE_WITH_CUSTOM_DEVICE)
|| arg_type == std::type_index(typeid(const KPSContext&))
#endif
#if defined(PADDLE_WITH_CUSTOM_DEVICE)
|| arg_type == std::type_index(typeid(const CustomContext&))) {
#else
) {
#endif
// do nothing, skip context arg now
......
......@@ -142,3 +142,34 @@ PD_REGISTER_KERNEL(empty_like,
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#endif
#ifdef PADDLE_WITH_CUSTOM_DEVICE
// Register empty/empty_like once for the generic `Custom` backend so a single
// registration covers every custom device type; device resolution happens at
// dispatch time (Backend::CUSTOM fallback in the kernel factory).
// NOTE(review): dtype list intentionally omits bfloat16/complex — presumably
// matching what custom runtimes support; confirm against other backends.
PD_REGISTER_KERNEL(empty,
Custom,
ALL_LAYOUT,
phi::EmptyKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool,
phi::dtype::float16) {}
PD_REGISTER_KERNEL(empty_like,
Custom,
ALL_LAYOUT,
phi::EmptyLikeKernel,
float,
double,
int8_t,
uint8_t,
int16_t,
int,
int64_t,
bool,
phi::dtype::float16) {
// empty_like accepts its input on any backend: only metadata (dims/dtype) is
// read, never the input's device memory.
kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND);
}
#endif
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册