Unverified commit daac3829, authored by Sonder, committed by GitHub

Remove need_move_to_phi (#56371)

* remove flag

* open static build flag

* add searchsorted to list

* add register info for fused layernorm

* fix fused_layernorm_kernel output register info

* fix stft register info

* add include

* fix register info

* add skip fake init for fused_layernorm:residual_out

* fix error

* add distributed_fused_lamb_init to StaticBuildBlackList

* set static_build flag to false
Parent 0c3e4cf6
@@ -46,19 +46,11 @@ std::set<std::string> StaticBuildBlackList = {
"cinn_instruction_run" /*: to handle subgraph infermeta*/,
"cinn_launch" /*: to handle subgraph infermeta*/,
"run_program" /*: to handle scope output*/,
"sparse_sparse_coo_tensor" /*: to handle sparse output*/};
// TODO(lizhiyu): This operator list is only for pipeline strategy temporarily.
std::set<std::string> SkipCheckForPipelineTempList = {
"c_broadcast",
"c_allreduce_sum",
"c_allgather",
"layer_norm",
"recv_v2",
"reshape2_grad",
"c_identity",
"c_reduce_sum",
};
"sparse_sparse_coo_tensor" /*: to handle sparse output*/,
"shuffle_batch",
"shuffle_batch_grad",
"distributed_fused_lamb_init"};
namespace paddle {
namespace framework {
namespace interpreter {
@@ -70,15 +62,10 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) {
// use_mkldnn = (kernelCode >> 4) & 1
// has_fluid_kernel = (kernelCode >> 3) & 1
// has_structed_kernel = (kernelCode >> 2) & 1
// need_move_to_phi = (kernelCode >> 1) & 1
using KernelCode = int8_t;
std::set<std::pair<std::string, KernelCode>> invalid_ops;
for (auto& op : block.AllOps()) {
auto op_type = op->Type();
if (SkipCheckForPipelineTempList.find(op_type) !=
SkipCheckForPipelineTempList.end()) {
continue;
}
const framework::OpInfo& info = OpInfoMap::Instance().Get(op_type);
auto op_base =
info.Creator()(op_type, op->Inputs(), op->Outputs(), op->GetAttrMap());
@@ -97,17 +84,16 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) {
bool has_fluid_kernel = OperatorWithKernel::AllOpKernels().count(op_type);
bool has_structured_kernel =
phi::KernelFactory::Instance().HasStructuredKernel(op_type);
bool need_move_to_phi = (has_fluid_kernel || has_structured_kernel);
KernelCode kernel_code =
(in_black_list << 7) + (is_operator_base << 6) + (is_custom_op << 5) +
(use_mkldnn << 4) + (has_fluid_kernel << 3) +
(has_structured_kernel << 2) + (need_move_to_phi << 1);
KernelCode kernel_code = (in_black_list << 7) + (is_operator_base << 6) +
(is_custom_op << 5) + (use_mkldnn << 4) +
(has_fluid_kernel << 3) +
(has_structured_kernel << 2);
if (!OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) {
if (in_black_list ||
(is_operator_base &&
!OperatorBasesHandledInStaticBuild.count(op_type)) ||
is_custom_op || use_mkldnn || need_move_to_phi) {
is_custom_op || use_mkldnn) {
invalid_ops.insert(std::make_pair(op_type, kernel_code));
}
}
@@ -122,8 +108,7 @@ bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) {
<< ", is_custom_op = " << (item.second >> 5 & 1)
<< ", use_mkldnn = " << (item.second >> 4 & 1)
<< ", has_fluid_kernel = " << (item.second >> 3 & 1)
<< ", has_structed_kerenl = " << (item.second >> 2 & 1)
<< ", need_move_to_phi = " << (item.second >> 1 & 1) << "]\n";
<< ", has_structed_kerenl = " << (item.second >> 2 & 1) << "]\n";
}
VLOG(1) << ss.str();
}
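With the need_move_to_phi bit gone, KernelCode now packs six flags into bits 7..2 of an int8_t. The following standalone sketch (not part of the patch; the flag values are made up for illustration) shows how the new packing and the VLOG-style decoding above line up:

```cpp
#include <cstdint>
#include <iostream>

int main() {
  using KernelCode = int8_t;
  // Illustrative flag values; in BlockCanBeStaticBuilt they are computed per op.
  bool in_black_list = false, is_operator_base = true, is_custom_op = false,
       use_mkldnn = false, has_fluid_kernel = true, has_structured_kernel = false;

  // Same packing as the new code above: bits 1 and 0 are now unused.
  KernelCode kernel_code = (in_black_list << 7) + (is_operator_base << 6) +
                           (is_custom_op << 5) + (use_mkldnn << 4) +
                           (has_fluid_kernel << 3) + (has_structured_kernel << 2);

  // Same decoding as the VLOG block above.
  std::cout << "in_black_list = " << (kernel_code >> 7 & 1)
            << ", is_operator_base = " << (kernel_code >> 6 & 1)
            << ", is_custom_op = " << (kernel_code >> 5 & 1)
            << ", use_mkldnn = " << (kernel_code >> 4 & 1)
            << ", has_fluid_kernel = " << (kernel_code >> 3 & 1)
            << ", has_structured_kernel = " << (kernel_code >> 2 & 1) << "\n";
  // Prints: in_black_list = 0, is_operator_base = 1, ..., has_structured_kernel = 0
}
```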
@@ -168,6 +153,27 @@ bool TensorShouldBeFakeInitialized(const OperatorBase& op,
return false;
}
if (op_type == "fused_bias_residual_layernorm" &&
parameter_name == "residual_out") {
if (op.HasInputs("residual")) {
bool is_residual_empty = op.Input("residual") == kEmptyVarName;
bool is_norm_weight_empty = op.Input("norm_weight") == kEmptyVarName;
bool is_norm_bias_empty = op.Input("norm_bias") == kEmptyVarName;
if (!is_residual_empty) {
if (is_norm_weight_empty && is_norm_bias_empty) {
VLOG(2) << "Skip fake initialization for: " << parameter_name;
return false;
}
} else {
VLOG(2) << "Skip fake initialization for: " << parameter_name;
return false;
}
} else {
VLOG(2) << "Skip fake initialization for: " << parameter_name;
return false;
}
}
if (op_type == "fake_quantize_range_abs_max") {
if (op.Attr<bool>("is_test") &&
(parameter_name == "OutScale" || parameter_name == "OutScales")) {
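The new branch above skips fake initialization of residual_out unless a residual input is actually wired in and at least one of norm_weight / norm_bias is present. A minimal standalone sketch of that predicate (names are illustrative, not Paddle APIs):

```cpp
#include <iostream>

// True when residual_out should go through the usual fake-initialization path,
// i.e. a residual input exists and the op also performs the layer norm
// (at least one of norm_weight / norm_bias is provided).
bool ResidualOutNeedsFakeInit(bool has_residual, bool has_norm_weight,
                              bool has_norm_bias) {
  return has_residual && (has_norm_weight || has_norm_bias);
}

int main() {
  std::cout << ResidualOutNeedsFakeInit(true, false, false) << "\n";   // 0: skipped
  std::cout << ResidualOutNeedsFakeInit(false, false, false) << "\n";  // 0: skipped
  std::cout << ResidualOutNeedsFakeInit(true, true, false) << "\n";    // 1: fake-init
}
```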
@@ -375,6 +381,15 @@ phi::DataType GetInputDType(const RuntimeContext& runtime_ctx,
return in_tensor->dtype();
}
bool InputExisted(const RuntimeContext& runtime_ctx,
const std::string& parameter_name) {
auto it = runtime_ctx.inputs.find(parameter_name);
if (it == runtime_ctx.inputs.end() || it->second.empty()) {
return false;
}
return true;
}
phi::DataType InferDTypeFromAttr(const framework::OperatorBase& op,
const RuntimeContext& runtime_ctx,
const std::string& attr_name) {
@@ -497,6 +512,20 @@ void FakeInitializeOutputsForFunctionKernel(
} else {
dtype = DataType::INT64;
}
} else if (op_type == "fused_bias_residual_layernorm") {
auto in_dtype = GetInputDType(runtime_ctx, "x");
float quant_scale = op.Attr<float>("quant_scale");
if (InputExisted(runtime_ctx, "residual") &&
!InputExisted(runtime_ctx, "norm_weight") &&
!InputExisted(runtime_ctx, "norm_bias")) {
dtype = in_dtype;
} else {
if (quant_scale > 0.0f) {
dtype = DataType::INT8;
} else {
dtype = in_dtype;
}
}
} else {
VLOG(4) << "Get dtype result from InferMeta";
RuntimeInferShapeContext infer_shape_ctx(op, runtime_ctx);
......
@@ -1646,10 +1646,14 @@ void FusedLayerNormInferMeta(const MetaTensor& x,
auto out_dims = phi::make_ddim(x_dims_vec);
out->set_dims(out_dims);
if (quant_scale <= 0.0f) {
if (residual_out && !norm_weight && !norm_bias) {
out->set_dtype(x.dtype());
} else {
out->set_dtype(phi::DataType::INT8);
if (quant_scale <= 0.0f) {
out->set_dtype(x.dtype());
} else {
out->set_dtype(phi::DataType::INT8);
}
}
out->set_layout(x.layout());
......
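Both the runtime branch in FakeInitializeOutputsForFunctionKernel and the FusedLayerNormInferMeta change above implement the same output-dtype rule. A standalone sketch of that rule (types and names here are illustrative, not the phi API):

```cpp
#include <iostream>

enum class DType { FLOAT16, INT8 };

// Output dtype of fused_bias_residual_layernorm as described above:
// the residual-only path (no norm_weight / norm_bias) keeps the input dtype,
// otherwise a positive quant_scale selects INT8.
DType FusedLayerNormOutDType(DType x_dtype, bool has_residual,
                             bool has_norm_weight, bool has_norm_bias,
                             float quant_scale) {
  if (has_residual && !has_norm_weight && !has_norm_bias) {
    return x_dtype;
  }
  return quant_scale > 0.0f ? DType::INT8 : x_dtype;
}

int main() {
  // Residual-only path: output stays fp16 even though a quant scale is set.
  std::cout << (FusedLayerNormOutDType(DType::FLOAT16, true, false, false, 0.5f) ==
                DType::FLOAT16)
            << "\n";  // 1
  // Full layer norm with quantization: output becomes int8.
  std::cout << (FusedLayerNormOutDType(DType::FLOAT16, true, true, true, 0.5f) ==
                DType::INT8)
            << "\n";  // 1
}
```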
@@ -18,11 +18,5 @@
#include "paddle/phi/kernels/impl/stft_kernel_impl.h"
PD_REGISTER_KERNEL(stft, CPU, ALL_LAYOUT, phi::StftKernel, float, double) {
if (kernel_key.dtype() == phi::DataType::FLOAT16 &&
kernel_key.dtype() == phi::DataType::FLOAT32 &&
kernel_key.dtype() == phi::DataType::FLOAT64) {
kernel->OutputAt(0).SetDataType(phi::DataType::COMPLEX64);
} else {
kernel->OutputAt(0).SetDataType(phi::DataType::COMPLEX128);
}
kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
}
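The rewritten CPU registration above replaces the broken dtype branch (it tested a single dtype against FLOAT16, FLOAT32, and FLOAT64 simultaneously with &&, so the condition could never hold) with phi::dtype::ToComplex, which maps a real dtype to its complex counterpart. A standalone sketch of that mapping (a stand-in for ToComplex, which is not reproduced here):

```cpp
#include <iostream>
#include <string>

// Stand-in for phi::dtype::ToComplex: stft keeps the input precision,
// so float32 -> complex64 and float64 -> complex128.
std::string ToComplexName(const std::string& real_dtype) {
  if (real_dtype == "float32") return "complex64";
  if (real_dtype == "float64") return "complex128";
  return "undefined";  // stft is only registered for float and double
}

int main() {
  std::cout << ToComplexName("float32") << "\n";  // complex64
  std::cout << ToComplexName("float64") << "\n";  // complex128
}
```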
@@ -1074,14 +1074,22 @@ PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
phi::fusion::FusedLayerNormKernel,
float,
phi::dtype::float16,
phi::dtype::bfloat16) {}
phi::dtype::bfloat16) {
kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
}
#else
PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
GPU,
ALL_LAYOUT,
phi::fusion::FusedLayerNormKernel,
float,
phi::dtype::float16) {}
phi::dtype::float16) {
kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
}
#endif // CUDNN_VERSION_MIN
#else
PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
@@ -1089,5 +1097,9 @@ PD_REGISTER_KERNEL(fused_bias_residual_layernorm,
ALL_LAYOUT,
phi::fusion::FusedLayerNormKernel,
float,
phi::dtype::float16) {}
phi::dtype::float16) {
kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED);
kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32);
kernel->OutputAt(3).SetDataType(phi::DataType::FLOAT32);
}
#endif // PADDLE_WITH_HIP
@@ -18,6 +18,7 @@ limitations under the License. */
#include "paddle/phi/backends/gpu/gpu_context.h"
#include "paddle/phi/core/kernel_registry.h"
#if NCCL_VERSION_CODE >= 21000 && CUDA_VERSION >= 11000
PD_REGISTER_KERNEL(c_identity,
GPU,
ALL_LAYOUT,
@@ -28,3 +29,14 @@ PD_REGISTER_KERNEL(c_identity,
int64_t,
phi::dtype::bfloat16,
phi::dtype::float16) {}
#else
PD_REGISTER_KERNEL(c_identity,
GPU,
ALL_LAYOUT,
phi::CIdentityKernel,
float,
double,
int,
int64_t,
phi::dtype::float16) {}
#endif
@@ -17,4 +17,6 @@
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/impl/stft_kernel_impl.h"
PD_REGISTER_KERNEL(stft, GPU, ALL_LAYOUT, phi::StftKernel, float, double) {}
PD_REGISTER_KERNEL(stft, GPU, ALL_LAYOUT, phi::StftKernel, float, double) {
kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype()));
}
@@ -16,6 +16,7 @@
#include <vector>
#include "paddle/phi/common/complex.h"
#include "paddle/phi/common/type_traits.h"
#include "paddle/phi/core/dense_tensor.h"
#include "paddle/phi/kernels/cpu/elementwise.h"
#include "paddle/phi/kernels/funcs/elementwise_base.h"
......