Unverified commit daac3829, authored by Sonder, committed by GitHub

Remove need_move_to_phi (#56371)

* remove flag

* open static build flag

* add searchsorted to list

* add register info for fused layernorm

* fix fused_layernorm_kernel output register info

* fix stft register info

* add include

* fix register info

* add skip fake init for fused_layernorm:residual_out

* fix error

* add distributed_fused_lamb_init to StaticBuildBlackList

* set static_build flag to false
Parent 0c3e4cf6
@@ -46,19 +46,11 @@ std::set<std::string> StaticBuildBlackList = {
"cinn_instruction_run" /*: to handle subgraph infermeta*/,
"cinn_launch" /*: to handle subgraph infermeta*/,
"run_program" /*: to handle scope output*/,
"sparse_sparse_coo_tensor" /*: to handle sparse output*/};
// TODO(lizhiyu): This operator list is only for pipeline strategy temporarily.
std::set<std::string> SkipCheckForPipelineTempList = {
"c_broadcast",
"c_allreduce_sum",
"c_allgather",
"layer_norm",
"recv_v2",
"reshape2_grad",
"c_identity",
"c_reduce_sum",
};
"sparse_sparse_coo_tensor" /*: to handle sparse output*/,
"shuffle_batch",
"shuffle_batch_grad",
"distributed_fused_lamb_init"};
namespace paddle {
namespace framework {
namespace interpreter {
@@ -70,15 +62,10 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) {
// use_mkldnn = (kernelCode >> 4) & 1
// has_fluid_kernel = (kernelCode >> 3) & 1
// has_structed_kernel = (kernelCode >> 2) & 1
// need_move_to_phi = (kernelCode >> 1) & 1
using KernelCode = int8_t;
std::set<std::pair<std::string, KernelCode>> invalid_ops;
for (auto& op : block.AllOps()) {
auto op_type = op->Type();
if (SkipCheckForPipelineTempList.find(op_type) !=
SkipCheckForPipelineTempList.end()) {
continue;
}
const framework::OpInfo& info = OpInfoMap::Instance().Get(op_type);
auto op_base =
info.Creator()(op_type, op->Inputs(), op->Outputs(), op->GetAttrMap());
@@ -97,17 +84,16 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) {
bool has_fluid_kernel = OperatorWithKernel::AllOpKernels().count(op_type);
bool has_structured_kernel =
phi::KernelFactory::Instance().HasStructuredKernel(op_type);
bool need_move_to_phi = (has_fluid_kernel || has_structured_kernel);
KernelCode kernel_code =
(in_black_list << 7) + (is_operator_base << 6) + (is_custom_op << 5) +
(use_mkldnn << 4) + (has_fluid_kernel << 3) +
(has_structured_kernel << 2) + (need_move_to_phi << 1);
KernelCode kernel_code = (in_black_list << 7) + (is_operator_base << 6) +
(is_custom_op << 5) + (use_mkldnn << 4) +
(has_fluid_kernel << 3) +
(has_structured_kernel << 2);
if (!OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) {
if (in_black_list ||
(is_operator_base &&
!OperatorBasesHandledInStaticBuild.count(op_type)) ||
is_custom_op || use_mkldnn || need_move_to_phi) {
is_custom_op || use_mkldnn) {
invalid_ops.insert(std::make_pair(op_type, kernel_code));
}
}
@@ -122,8 +108,7 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) {
<< ", is_custom_op = " << (item.second >> 5 & 1)
<< ", use_mkldnn = " << (item.second >> 4 & 1)
<< ", has_fluid_kernel = " << (item.second >> 3 & 1)
<< ", has_structed_kerenl = " << (item.second >> 2 & 1)
<< ", need_move_to_phi = " << (item.second >> 1 & 1) << "]\n";
<< ", has_structed_kerenl = " << (item.second >> 2 & 1) << "]\n";
}
VLOG(1) << ss.str();
}
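With the need_move_to_phi bit gone, KernelCode now packs six flags into bits 7..2 of an int8_t. The following standalone sketch (not part of the patch; the flag values are made up for illustration) shows how the new packing and the VLOG-style decoding above line up:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  using KernelCode = int8_t;
  // Illustrative flag values; in BlockCanBeStaticBuilt they are computed per op.
  bool in_black_list = false, is_operator_base = true, is_custom_op = false,
       use_mkldnn = false, has_fluid_kernel = true, has_structured_kernel = false;

  // Same packing as the new code above: bits 1 and 0 are now unused.
  KernelCode kernel_code = (in_black_list << 7) + (is_operator_base << 6) +
                           (is_custom_op << 5) + (use_mkldnn << 4) +
                           (has_fluid_kernel << 3) + (has_structured_kernel << 2);

  // Same decoding as the VLOG block above.
  std::cout << "in_black_list = " << (kernel_code >> 7 & 1)
            << ", is_operator_base = " << (kernel_code >> 6 & 1)
            << ", is_custom_op = " << (kernel_code >> 5 & 1)
            << ", use_mkldnn = " << (kernel_code >> 4 & 1)
            << ", has_fluid_kernel = " << (kernel_code >> 3 & 1)
            << ", has_structured_kernel = " << (kernel_code >> 2 & 1) << "\n";
  // Prints: in_black_list = 0, is_operator_base = 1, ..., has_structured_kernel = 0
}
```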
@@ -168,6 +153,27 @@ bool TensorShouldBeFakeInitialized(const OperatorBase& op,
return false;
}
if (op_type == "fused_bias_residual_layernorm" &&
parameter_name == "residual_out") {
if (op.HasInputs("residual")) {
bool is_residual_empty = op.Input("residual") == kEmptyVarName;
bool is_norm_weight_empty = op.Input("norm_weight") == kEmptyVarName;
bool is_norm_bias_empty = op.Input("norm_bias") == kEmptyVarName;
if (!is_residual_empty) {
if (is_norm_weight_empty && is_norm_bias_empty) {
VLOG(2) << "Skip fake initialization for: " << parameter_name;
return false;
}
} else {
VLOG(2) << "Skip fake initialization for: " << parameter_name;
return false;
}
} else {
VLOG(2) << "Skip fake initialization for: " << parameter_name;
return false;
}
}
if (op_type == "fake_quantize_range_abs_max") {
if (op.Attr<bool>("is_test") &&
(parameter_name == "OutScale" || parameter_name == "OutScales")) {
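The new branch above skips fake initialization of residual_out unless a residual input is actually wired in and at least one of norm_weight / norm_bias is present. A minimal standalone sketch of that predicate (names are illustrative, not Paddle APIs):

```cpp
#include <iostream>

// True when residual_out should go through the usual fake-initialization path,
// i.e. a residual input exists and the op also performs the layer norm
// (at least one of norm_weight / norm_bias is provided).
bool ResidualOutNeedsFakeInit(bool has_residual, bool has_norm_weight,
                              bool has_norm_bias) {
  return has_residual && (has_norm_weight || has_norm_bias);
}

int main() {
  std::cout << ResidualOutNeedsFakeInit(true, false, false) << "\n";   // 0: skipped
  std::cout << ResidualOutNeedsFakeInit(false, false, false) << "\n";  // 0: skipped
  std::cout << ResidualOutNeedsFakeInit(true, true, false) << "\n";    // 1: fake-init
}
```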
@@ -375,6 +381,15 @@ phi::DataType GetInputDType(const RuntimeContext& runtime_ctx,
return in_tensor->dtype();
}
bool InputExisted(const RuntimeContext& runtime_ctx,
const std::string& parameter_name) {
auto it = runtime_ctx.inputs.find(parameter_name);
if (it == runtime_ctx.inputs.end() || it->second.empty()) {
return false;
}
return true;
}
phi::DataType InferDTypeFromAttr(const framework::OperatorBase& op,
const RuntimeContext& runtime_ctx,
const std::string& attr_name) {
@@ -497,6 +512,20 @@ void FakeInitializeOutputsForFunctionKernel(
} else {
dtype = DataType::INT64;
}
} else if (op_type == "fused_bias_residual_layernorm") {
auto in_dtype = GetInputDType(runtime_ctx, "x");
float quant_scale = op.Attr<float>("quant_scale");
if (InputExisted(runtime_ctx, "residual") &&
!InputExisted(runtime_ctx, "norm_weight") &&
!InputExisted(runtime_ctx, "norm_bias")) {
dtype = in_dtype;
} else {
if (quant_scale > 0.0f) {
dtype = DataType::INT8;
} else {
dtype = in_dtype;
}
}
} else {
VLOG(4) << "Get dtype result from InferMeta";
RuntimeInferShapeContext infer_shape_ctx(op, runtime_ctx);
......
@@ -1646,10 +1646,14 @@ void FusedLayerNormInferMeta(const MetaTensor& x,
auto out_dims = phi::make_ddim(x_dims_vec);
out->set_dims(out_dims);
if (quant_scale <= 0.0f) {
if (residual_out && !norm_weight && !norm_bias) {
out->set_dtype(x.dtype());
} else {
out->set_dtype(phi::DataType::INT8);
if (quant_scale <= 0.0f) {
out->set_dtype(x.dtype());
} else {
out->set_dtype(phi::DataType::INT8);
}
}
out->set_layout(x.layout());
......
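Both the runtime branch in FakeInitializeOutputsForFunctionKernel and the FusedLayerNormInferMeta change above implement the same output-dtype rule. A standalone sketch of that rule (types and names here are illustrative, not the phi API):

```cpp
#include <iostream>

enum class DType { FLOAT16, INT8 };

// Output dtype of fused_bias_residual_layernorm as described above:
// the residual-only path (no norm_weight / norm_bias) keeps the input dtype,
// otherwise a positive quant_scale selects INT8.
DType FusedLayerNormOutDType(DType x_dtype, bool has_residual,
                             bool has_norm_weight, bool has_norm_bias,
                             float quant_scale) {
  if (has_residual && !has_norm_weight && !has_norm_bias) {
    return x_dtype;
  }
  return quant_scale > 0.0f ? DType::INT8 : x_dtype;
}

int main() {
  // Residual-only path: output stays fp16 even though a quant scale is set.
  std::cout << (FusedLayerNormOutDType(DType::FLOAT16, true, false, false, 0.5f) ==
                DType::FLOAT16)
            << "\n";  // 1
  // Full layer norm with quantization: output becomes int8.
  std::cout << (FusedLayerNormOutDType(DType::FLOAT16, true, true, true, 0.5f) ==
                DType::INT8)
            << "\n";  // 1
}
```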
@@ -18,11 +18,5 @@
#include "paddle/phi/kernels/impl/stft_kernel_impl.h"
PD_REGISTER_KERNEL(stft, CPU, ALL_LAYOUT, phi::StftKernel, float, double) {
if (kernel_key.dtype() == phi::DataType::FLOAT16 &&
kernel_key.dtype() == phi::DataType::FLOAT32 &&
kernel_key.dtype() == phi::DataType::FLOAT64) {
kernel->OutputAt(0).SetDataType(phi::DataType::COMPLEX64);
} else {
kernel->OutputAt(0).SetDataType(phi::DataType::COMPLEX128);
}
kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
}
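The rewritten CPU registration above replaces the broken dtype branch (it tested a single dtype against FLOAT16, FLOAT32, and FLOAT64 simultaneously with &&, so the condition could never hold) with phi::dtype::ToComplex, which maps a real dtype to its complex counterpart. A standalone sketch of that mapping (a stand-in for ToComplex, which is not reproduced here):

```cpp
#include <iostream>
#include <string>

// Stand-in for phi::dtype::ToComplex: stft keeps the input precision,
// so float32 -> complex64 and float64 -> complex128.
std::string ToComplexName(const std::string& real_dtype) {
  if (real_dtype == "float32") return "complex64";
  if (real_dtype == "float64") return "complex128";
  return "undefined";  // stft is only registered for float and double
}

int main() {
  std::cout << ToComplexName("float32") << "\n";  // complex64
  std::cout << ToComplexName("float64") << "\n";  // complex128
}
```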
@@ -1074,14 +1074,22 @@ PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
phi::fusion::FusedLayerNormKernel,
float,
phi::dtype::float16,
phi::dtype::bfloat16) {}
phi::dtype::bfloat16) {
kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
}
#else
PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
GPU,
ALL_LAYOUT,
phi::fusion::FusedLayerNormKernel,
float,
phi::dtype::float16) {}
phi::dtype::float16) {
kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
}
#endif // CUDNN_VERSION_MIN
#else
PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
@@ -1089,5 +1097,9 @@ PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
ALL_LAYOUT,
phi::fusion::FusedLayerNormKernel,
float,
phi::dtype::float16) {}
phi::dtype::float16) {
kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
}
#endif // PADDLE_WITH_HIP
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
PD_REGISTER_KERNEL(c_identity,
GPU,
ALL_LAYOUT,
@@ -28,3 +29,14 @@ PD_REGISTER_KERNEL(c_identity,
int64_t,
phi::dtype::bfloat16,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(c_identity,
GPU,
ALL_LAYOUT,
phi::CIdentityKernel,
float,
double,
int,
int64_t,
phi::dtype::float16) {}
#endif
@@ -17,4 +17,6 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/stft_kernel_impl.h"
PD_REGISTER_KERNEL(stft, GPU, ALL_LAYOUT, phi::StftKernel, float, double) {}
PD_REGISTER_KERNEL(stft, GPU, ALL_LAYOUT, phi::StftKernel, float, double) {
kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
}
@@ -16,6 +16,7 @@
#include <vector>
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
......