From f265a31324493e5cf426909f109cfd62f922060a Mon Sep 17 00:00:00 2001 From: zhupengyang Date: Tue, 28 Feb 2023 14:19:31 +0800 Subject: [PATCH] [XPU] support convert fp16 model (#50790) --- paddle/fluid/framework/ir/CMakeLists.txt | 2 +- .../framework/ir/auto_mixed_precision_pass.cc | 80 +++++++---- .../framework/ir/auto_mixed_precision_pass.h | 4 +- .../framework/ir/xpu/fc_xpu_fuse_pass.cc | 6 + .../ir/xpu/multi_encoder_xpu_fuse_pass.cc | 136 ++++++++++++++---- paddle/fluid/framework/ir/xpu/quant_utils.cc | 109 ++++++++++---- paddle/fluid/framework/ir/xpu/quant_utils.h | 7 +- .../passes/convert_to_mixed_precision.cc | 46 +++--- .../fluid/inference/api/analysis_predictor.cc | 7 +- .../inference/api/paddle_pass_builder.cc | 2 +- paddle/phi/backends/xpu/xpu2_op_list.cc | 6 +- paddle/phi/common/backend.h | 2 + .../phi/kernels/fusion/xpu/fc_xpu_kernel.cc | 51 ++++--- .../fusion/xpu/multi_encoder_xpu_kernel.cc | 50 ++++--- .../test_xpu_convert_mixed_precision.py | 56 ++++++++ 15 files changed, 426 insertions(+), 138 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/ir/inference/test_xpu_convert_mixed_precision.py diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt index d09b6bd3734..536b0b2f21b 100755 --- a/paddle/fluid/framework/ir/CMakeLists.txt +++ b/paddle/fluid/framework/ir/CMakeLists.txt @@ -215,7 +215,7 @@ if(WITH_XPU) cc_library( xpu_quant_utils SRCS xpu/quant_utils.cc - DEPS pass) + DEPS pass phi) cc_library( xpu_pass_utils SRCS xpu/pass_utils.cc diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc index fd2c8a024c2..061e2432eed 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.cc @@ -47,6 +47,23 @@ bool PhiKernelSupportPrecision( return phi::KernelFactory::Instance().HasKernel(op_type, kernel_key); } +static phi::Backend ConvertPlaceToBackend(const phi::Place& place) { + switch (place.GetType()) { + case phi::AllocationType::CPU: + return phi::Backend::CPU; + case phi::AllocationType::GPU: + return phi::Backend::GPU; + case phi::AllocationType::XPU: + return phi::Backend::XPU; + case phi::AllocationType::NPU: + return phi::Backend::NPU; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Cannot convert place(%d).", static_cast(place.GetType()))); + } + return phi::Backend::UNDEFINED; +} + bool KernelSupportPrecision( const std::string& op_type, phi::Backend backend, @@ -65,7 +82,7 @@ bool KernelSupportPrecision( auto it = all_kernels.find(op_type); if (it != all_kernels.end()) { for (const auto& kern_pair : it->second) { - if (platform::is_gpu_place(kern_pair.first.place_) && + if (ConvertPlaceToBackend(kern_pair.first.place_) == backend && kern_pair.first.data_type_ == framework::TransToProtoVarType(precision)) { support = true; @@ -150,20 +167,8 @@ bool OpSupportPrecision(const std::string& op_type, phi::Backend backend, phi::DataType precision, const std::unordered_set& black_list) { - bool support = false; - if (black_list.count(op_type) == 0) { - // Actual custom backend will be added after the NUM_BACKENDS. - // We use this feature to determine whether backend is custom device. 
- if (backend == phi::Backend::GPU || - static_cast(backend) > - static_cast(phi::Backend::NUM_BACKENDS)) { - support = KernelSupportPrecision(op_type, backend, precision); - } else { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "Now, only support backend of GPU and Custom Device .")); - } - } - return support; + return black_list.count(op_type) == 0 && + KernelSupportPrecision(op_type, backend, precision); } // The set of ops that support fp16 calculation and are considered @@ -192,15 +197,13 @@ void AutoMixedPrecisionPass::SetDefaultBlacklist() const { } void AutoMixedPrecisionPass::Init(Graph* graph) const { - bool enable_gpu_mixed = Get("enable_gpu_mixed"); - bool enable_custom_device_mixed = false; - if (Has("enable_custom_device_mixed")) { - enable_custom_device_mixed = Get("enable_custom_device_mixed"); - } - if (enable_gpu_mixed) { + if (Has("enable_gpu_mixed") && Get("enable_gpu_mixed")) { backend_ = phi::Backend::GPU; - } else if (enable_custom_device_mixed) { -// transform Backend::CUSTOM to actual backend. + } else if (Has("enable_xpu_mixed") && Get("enable_xpu_mixed")) { + backend_ = phi::Backend::XPU; + } else if (Has("enable_custom_device_mixed") && + Get("enable_custom_device_mixed")) { + // transform Backend::CUSTOM to actual backend. // Here, we only consider one custom backend. #ifdef PADDLE_WITH_CUSTOM_DEVICE auto device_type = phi::DeviceManager::GetAllCustomDeviceTypes()[0]; @@ -214,7 +217,7 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const { "Cannot enable custom_device_mixed.")); #endif } - skip_pass_ = !enable_gpu_mixed && !enable_custom_device_mixed; + skip_pass_ = backend_ == phi::Backend::UNDEFINED; low_precision_ = static_cast(Get("mixed_precision_mode")); @@ -225,7 +228,6 @@ void AutoMixedPrecisionPass::Init(Graph* graph) const { VLOG(4) << " - " << name; } - keep_io_types_ = true; if (Has("keep_io_types")) { keep_io_types_ = Get("keep_io_types"); } @@ -607,6 +609,20 @@ bool AutoMixedPrecisionPass::InputVarsNotConvert( return true; } } + + if (backend_ == phi::Backend::XPU) { + if (GetOpOriginalType(op_desc->Type()) == "layer_norm") { + auto vecs = op_desc->Input("Bias"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Input("Scale"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + } + } + return false; } @@ -632,6 +648,20 @@ bool AutoMixedPrecisionPass::OutputVarsNotConvert( return true; } } + + if (backend_ == phi::Backend::XPU) { + if (GetOpOriginalType(op_desc->Type()) == "layer_norm") { + auto vecs = op_desc->Output("Mean"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + vecs = op_desc->Output("Variance"); + if (std::find(vecs.begin(), vecs.end(), var_name) != vecs.end()) { + return true; + } + } + } + return false; } diff --git a/paddle/fluid/framework/ir/auto_mixed_precision_pass.h b/paddle/fluid/framework/ir/auto_mixed_precision_pass.h index 578d47282b7..c930c39a55f 100644 --- a/paddle/fluid/framework/ir/auto_mixed_precision_pass.h +++ b/paddle/fluid/framework/ir/auto_mixed_precision_pass.h @@ -68,11 +68,11 @@ class AutoMixedPrecisionPass : public FusePassBase { private: mutable bool skip_pass_{false}; - mutable bool keep_io_types_{false}; + mutable bool keep_io_types_{true}; // float16 or bfloat16 now mutable phi::DataType low_precision_{phi::DataType::FLOAT16}; - mutable phi::Backend backend_{phi::Backend::GPU}; + mutable phi::Backend backend_{phi::Backend::UNDEFINED}; mutable 
std::unordered_set black_list_; diff --git a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc index 54efd1ed897..0def7069e67 100644 --- a/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/fc_xpu_fuse_pass.cc @@ -245,6 +245,12 @@ void FcXPUFusePass::ApplyImpl(ir::Graph* graph, QuantWeight(mul_w_tensor, mul_w_max_tensor, !transpose_w); } + if (bias != nullptr) { + auto* bias_tensor = + scope->Var(bias->Name())->GetMutable(); + CastToFp32(bias_tensor); + } + std::string fc_out_name; if (act_out) { fc_out_name = act_out->Name(); diff --git a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc index 35d3b150e84..1c31db9810b 100644 --- a/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc +++ b/paddle/fluid/framework/ir/xpu/multi_encoder_xpu_fuse_pass.cc @@ -31,6 +31,7 @@ #include "paddle/fluid/framework/ir/xpu/quant_utils.h" #include "paddle/fluid/framework/op_version_registry.h" #include "paddle/fluid/platform/enforce.h" +#include "paddle/phi/kernels/concat_kernel.h" namespace phi { class DenseTensor; @@ -617,6 +618,9 @@ class MultiEncoderXPUFusePass : public FusePassBase { bool ApplyMultiEncoderXPUFuse(ir::Graph* graph) const; + // Mask must be fp32 even if model is fp16 + int CastMask(ir::Graph* graph) const; + // 1. Transpose q_w, k_w, v_w // 2. Concat q_w, k_w, v_w // 3. Generate qkv_w_max tensor @@ -674,8 +678,11 @@ void MultiEncoderXPUFusePass::ApplyImpl(ir::Graph* graph) const { } } } + int cast_mask_counts = CastMask(graph); + AddStatis(single_encoder_fused_counts); AddStatis(multi_encoder_fused_counts); + AddStatis(cast_mask_counts); } void MultiEncoderXPUFusePass::PrepareQKVWeight( @@ -685,29 +692,28 @@ void MultiEncoderXPUFusePass::PrepareQKVWeight( phi::DenseTensor* qkv_w, phi::DenseTensor* qkv_w_max) const { // Transpose - phi::DenseTensor q_w_trans; - phi::DenseTensor k_w_trans; - phi::DenseTensor v_w_trans; - Transpose2D(q_w, &q_w_trans); - Transpose2D(k_w, &k_w_trans); - Transpose2D(v_w, &v_w_trans); + phi::DenseTensor q_w_t; + phi::DenseTensor k_w_t; + phi::DenseTensor v_w_t; + Assign(q_w, &q_w_t); + Assign(k_w, &k_w_t); + Assign(v_w, &v_w_t); + Transpose2D(&q_w_t); + Transpose2D(&k_w_t); + Transpose2D(&v_w_t); // Concat - auto q_w_trans_dims = q_w_trans.dims(); - auto k_w_trans_dims = k_w_trans.dims(); - auto v_w_trans_dims = v_w_trans.dims(); - qkv_w->Resize(DDim({q_w_trans_dims[0] + k_w_trans_dims[0] + v_w_trans_dims[0], - q_w_trans_dims[1]})); + qkv_w->Resize(DDim( + {q_w_t.dims()[0] + k_w_t.dims()[0] + v_w_t.dims()[0], q_w_t.dims()[1]})); qkv_w->set_type(q_w.type()); auto* dev_ctx = static_cast( platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - int size = q_w.numel(); - auto* qkv_w_data = dev_ctx->Alloc(qkv_w); - memcpy(qkv_w_data, q_w_trans.data(), size * sizeof(float)); - qkv_w_data += size; - memcpy(qkv_w_data, k_w_trans.data(), size * sizeof(float)); - qkv_w_data += size; - memcpy(qkv_w_data, v_w_trans.data(), size * sizeof(float)); + std::vector in_tensors{&q_w_t, &k_w_t, &v_w_t}; + if (q_w.type() == phi::DataType::FLOAT16) { + phi::ConcatKernel(*dev_ctx, in_tensors, 0, qkv_w); + } else { + phi::ConcatKernel(*dev_ctx, in_tensors, 0, qkv_w); + } // Quant to int16 QuantWeight(qkv_w, qkv_w_max, false); @@ -846,6 +852,9 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse( auto* block = q_matmul->Op()->Block(); auto* scope = param_scope(); + bool enable_fp16 = + 
scope->FindVar(q_matmul_w->Name())->Get().dtype() == + phi::DataType::FLOAT16; // Prepare q,k,v weight std::string q_w_name = q_matmul_w->Name(); @@ -905,12 +914,32 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse( auto* qkv_add_bias = graph->CreateVarNode(&qkv_add_bias_desc); auto* qkv_add_bias_var = block->Var(qkv_add_bias_name); qkv_add_bias_var->SetPersistable(true); + auto* q_add_bias_tensor = + scope->FindVar(q_add_bias_name)->GetMutable(); + auto* k_add_bias_tensor = + scope->FindVar(k_add_bias_name)->GetMutable(); + auto* v_add_bias_tensor = + scope->FindVar(v_add_bias_name)->GetMutable(); + CastToFp32(q_add_bias_tensor); + CastToFp32(k_add_bias_tensor); + CastToFp32(v_add_bias_tensor); ConcatQKVBias( - scope->FindVar(q_add_bias_name)->Get(), - scope->FindVar(k_add_bias_name)->Get(), - scope->FindVar(v_add_bias_name)->Get(), + *q_add_bias_tensor, + *k_add_bias_tensor, + *v_add_bias_tensor, scope->Var(qkv_add_bias_name)->GetMutable()); + // Prepare qkv_add_0_bias, qkv_add_2_bias, qkv_add_3_bias + auto qkv_add_0_bias_name = qkv_add_0_bias->Name(); + CastToFp32( + scope->FindVar(qkv_add_0_bias_name)->GetMutable()); + auto qkv_add_2_bias_name = qkv_add_2_bias->Name(); + CastToFp32( + scope->FindVar(qkv_add_2_bias_name)->GetMutable()); + auto qkv_add_3_bias_name = qkv_add_3_bias->Name(); + CastToFp32( + scope->FindVar(qkv_add_3_bias_name)->GetMutable()); + // Generate single_encoder_xpu op framework::OpDesc op_desc(block); op_desc.SetType("single_encoder_xpu"); @@ -927,9 +956,9 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse( qkv_matmul_3_w_max_name}); op_desc.SetInput("fc_bias", {qkv_add_bias_name, - qkv_add_0_bias->Name(), - qkv_add_2_bias->Name(), - qkv_add_3_bias->Name()}); + qkv_add_0_bias_name, + qkv_add_2_bias_name, + qkv_add_3_bias_name}); if (norm_before) { op_desc.SetInput("ln_scale", {ln_0_scale->Name(), ln_1_scale->Name()}); op_desc.SetInput("ln_bias", {ln_0_bias->Name(), ln_1_bias->Name()}); @@ -953,6 +982,7 @@ int MultiEncoderXPUFusePass::ApplySingleEncoderXPUFuse( static_cast(qkv_matmul_2_w_shape[1] / qkv_matmul_2_w_shape[0])); op_desc.SetAttr("act_type", ConvertActivationType(act_type)); op_desc.SetAttr("relative_type", static_cast(0)); + op_desc.SetAttr("enable_fp16", enable_fp16); if (norm_before) { op_desc.SetOutput("out", {qkv_add_4_out->Name()}); } else { @@ -1186,6 +1216,9 @@ bool MultiEncoderXPUFusePass::ApplyMultiEncoderXPUFuse(ir::Graph* graph) const { PADDLE_GET_CONST(int, single_encoders[0]->Op()->GetAttr(attr_name))); } op_desc.SetAttr("slice_idx", static_cast(-1)); + op_desc.SetAttr( + "enable_fp16", + PADDLE_GET_CONST(bool, single_encoders[0]->Op()->GetAttr("enable_fp16"))); op_desc.SetOutput("out", {out_name}); op_desc.SetOutput("x_fp16", {x_fp16_name}); op_desc.SetOutput("out_fp16", {out_fp16_name}); @@ -1213,6 +1246,61 @@ bool MultiEncoderXPUFusePass::ApplyMultiEncoderXPUFuse(ir::Graph* graph) const { return true; } +int MultiEncoderXPUFusePass::CastMask(ir::Graph* graph) const { + int cast_counts = 0; + auto nodes = graph->Nodes(); + for (auto node : nodes) { + if (node->IsVar()) continue; + auto op_desc = node->Op(); + if (node->IsVar() || // + op_desc->Type() != "multi_encoder_xpu" || + !op_desc->GetAttrIfExists("enable_fp16") || + op_desc->Inputs().count("mask") == 0) + continue; + + auto* block = op_desc->Block(); + auto* scope = param_scope(); + + // Find mask node + std::string mask_name = op_desc->Inputs().at("mask")[0]; + Node* mask = nullptr; + for (auto* in_node : node->inputs) { + if (in_node->Var()->Name() == mask_name) { 
+ mask = in_node; + break; + } + } + + // Create new_mask node/var/tensor + std::string new_mask_name = mask_name + "_fp32"; + VarDesc new_mask_desc(new_mask_name); + auto* new_mask = graph->CreateVarNode(&new_mask_desc); + block->Var(new_mask_name); + scope->Var(new_mask_name)->GetMutable(); + + // Create cast op + framework::OpDesc cast_op_desc(block); + cast_op_desc.SetType("cast"); + cast_op_desc.SetInput("X", {mask_name}); + cast_op_desc.SetAttr("in_dtype", + static_cast(framework::proto::VarType::FP16)); + cast_op_desc.SetAttr("out_dtype", + static_cast(framework::proto::VarType::FP32)); + cast_op_desc.SetOutput("Out", {new_mask_name}); + auto* cast = graph->CreateOpNode(&cast_op_desc); + IR_NODE_LINK_TO(mask, cast); + IR_NODE_LINK_TO(cast, new_mask); + + // Update encoder + op_desc->SetInput("mask", {new_mask_name}); + IR_NODE_LINK_TO(new_mask, node); + IR_NODE_UNLINK(node, mask); + + cast_counts++; + } + return cast_counts; +} + } // namespace ir } // namespace framework } // namespace paddle diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.cc b/paddle/fluid/framework/ir/xpu/quant_utils.cc index b1aaace6952..de365f71c63 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.cc +++ b/paddle/fluid/framework/ir/xpu/quant_utils.cc @@ -16,33 +16,92 @@ #include #include "paddle/fluid/platform/device_context.h" #include "paddle/phi/core/enforce.h" -#include "paddle/phi/kernels/funcs/math_function.h" +#include "paddle/phi/kernels/assign_kernel.h" +#include "paddle/phi/kernels/cast_kernel.h" +#include "paddle/phi/kernels/transpose_kernel.h" namespace paddle { namespace framework { namespace ir { -template -void Transpose2D(const phi::DenseTensor& in, phi::DenseTensor* out) { - auto in_dims = in.dims(); +void Assign(const phi::DenseTensor& in, phi::DenseTensor* out) { + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + out->Resize(in.dims()); + out->set_type(in.dtype()); + out->set_layout(in.layout()); + phi::AssignKernel(*cpu_ctx, in, out); +} + +void Transpose2D(phi::DenseTensor* in, phi::DenseTensor* out) { + auto in_dims = in->dims(); PADDLE_ENFORCE_EQ( in_dims.size(), 2, platform::errors::InvalidArgument( "In dims rank should be 2, but received in dims size is [%d].", in_dims.size())); - out->Resize({in_dims[1], in_dims[0]}); - out->set_type(in.type()); - auto* dev_ctx = static_cast( + + phi::DenseTensor trans_tensor; + phi::DenseTensor* out_ptr = out == nullptr ? 
&trans_tensor : out; + out_ptr->Resize({in_dims[1], in_dims[0]}); + out_ptr->set_type(in->type()); + out_ptr->set_layout(in->layout()); + + auto* cpu_ctx = static_cast( platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - dev_ctx->Alloc(out); std::vector axis{1, 0}; - phi::funcs::Transpose trans2d; - trans2d(*dev_ctx, in, out, axis); + switch (in->dtype()) { + case phi::DataType::FLOAT16: + phi::TransposeKernel(*cpu_ctx, *in, axis, out_ptr); + break; + case phi::DataType::FLOAT32: + phi::TransposeKernel(*cpu_ctx, *in, axis, out_ptr); + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support fp16 and fp32, but received dtype is %s.", + phi::DataTypeToString(in->dtype()))); + break; + } + + if (out == nullptr) { + Assign(*out_ptr, in); + } } -template void Transpose2D(const phi::DenseTensor& in, - phi::DenseTensor* out); +void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out) { + auto* cpu_ctx = static_cast( + platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); + + phi::DenseTensor fp32_tensor; + phi::DenseTensor* out_ptr = out == nullptr ? &fp32_tensor : out; + out_ptr->Resize(in->dims()); + out_ptr->set_type(phi::DataType::FLOAT32); + out_ptr->set_layout(in->layout()); + + switch (in->dtype()) { + case phi::DataType::FLOAT16: + phi::CastKernel(*cpu_ctx, *in, phi::DataType::FLOAT32, out_ptr); + break; + case phi::DataType::FLOAT32: + if (out == nullptr) { + return; + } else { + phi::AssignKernel(*cpu_ctx, *in, out_ptr); + } + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "Only support fp16 and fp32, but received dtype is %s.", + phi::DataTypeToString(in->dtype()))); + break; + } + + if (out == nullptr) { + Assign(*out_ptr, in); + } +} static float FindMaxAbs(const float* data, int len) { float max_f = 0.0f; @@ -151,14 +210,15 @@ template void QuantWeight(phi::DenseTensor* weight, phi::DenseTensor* weight_max, bool transpose) { + // Convert fp16 to fp32 + phi::DenseTensor weight_fp32; + CastToFp32(weight, &weight_fp32); + // Transpose - auto* weight_data = weight->data(); - phi::DenseTensor weight_trans; if (transpose) { - Transpose2D(*weight, &weight_trans); - weight_data = weight_trans.data(); - weight->Resize(weight_trans.dims()); + Transpose2D(&weight_fp32); } + // Find max paddle::platform::DeviceContextPool& pool = paddle::platform::DeviceContextPool::Instance(); @@ -171,21 +231,22 @@ void QuantWeight(phi::DenseTensor* weight, } phi::XPUContext* xpu_ctx = static_cast(pool.Get(place)); int max_ptr_size = xpu_ctx->x_context()->max_ptr_size(); - int size = weight->numel(); + int size = weight_fp32.numel(); + auto* weight_data = weight_fp32.data(); float max_val = FindMaxAbs(weight_data, size); std::vector max_vec(max_ptr_size, max_val); - weight_max->set_type(paddle::experimental::CppTypeToDataType::Type()); + weight_max->set_type(phi::DataType::FLOAT32); weight_max->Resize({max_ptr_size}); - auto* dev_ctx = static_cast( + auto* cpu_ctx = static_cast( platform::DeviceContextPool::Instance().Get(phi::CPUPlace())); - memcpy(dev_ctx->Alloc(weight_max), + memcpy(cpu_ctx->Alloc(weight_max), max_vec.data(), max_ptr_size * sizeof(float)); + // Quant - std::vector quant_data(size); - QuantFP32ToIntX(weight_data, quant_data.data(), max_val, size); weight->set_type(paddle::experimental::CppTypeToDataType::Type()); - memcpy(dev_ctx->Alloc(weight), quant_data.data(), size * sizeof(T)); + weight->Resize(weight_fp32.dims()); + QuantFP32ToIntX(weight_data, cpu_ctx->Alloc(weight), max_val, size); } template void 
QuantWeight(phi::DenseTensor* weight, diff --git a/paddle/fluid/framework/ir/xpu/quant_utils.h b/paddle/fluid/framework/ir/xpu/quant_utils.h index 4a059479aa0..57519a58432 100644 --- a/paddle/fluid/framework/ir/xpu/quant_utils.h +++ b/paddle/fluid/framework/ir/xpu/quant_utils.h @@ -19,8 +19,11 @@ namespace paddle { namespace framework { namespace ir { -template -void Transpose2D(const phi::DenseTensor& in, phi::DenseTensor* out); +void Assign(const phi::DenseTensor& in, phi::DenseTensor* out); + +void Transpose2D(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); + +void CastToFp32(phi::DenseTensor* in, phi::DenseTensor* out = nullptr); // 1. Quant weight from fp32 to int16/int31 // 2. Weight data is in-place update. diff --git a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc index e703b7d5bf6..2589a20eb28 100644 --- a/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc +++ b/paddle/fluid/inference/analysis/passes/convert_to_mixed_precision.cc @@ -41,18 +41,31 @@ ConvertToMixedPrecisionPass::ConvertToMixedPrecisionPass( backend_(backend), keep_io_types_(keep_io_types), black_list_(black_list) { - if (mixed_precision_ != phi::DataType::FLOAT16 && - mixed_precision_ != phi::DataType::BFLOAT16) { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "mixed_precision currently not supported dtype %d, we now only " - "support fp16 and bf16.", - static_cast(mixed_precision_))); - } - if (backend_ != phi::Backend::GPU && backend_ != phi::Backend::CUSTOM) { - PADDLE_THROW(paddle::platform::errors::InvalidArgument( - "mixed_precision currently not supported place %d, we now only " - "support gpu and custom device .", - static_cast(backend_))); + switch (backend_) { + case phi::Backend::GPU: + PADDLE_ENFORCE(mixed_precision_ == phi::DataType::FLOAT16 || + mixed_precision_ == phi::DataType::BFLOAT16, + platform::errors::InvalidArgument( + "mixed_precision of %s currently only supported fp16 " + "and bf16, not support %s.", + experimental::BackendToString(backend_), + phi::DataTypeToString(mixed_precision_))); + break; + case phi::Backend::XPU: + case phi::Backend::CUSTOM: + PADDLE_ENFORCE(mixed_precision_ == phi::DataType::FLOAT16, + platform::errors::InvalidArgument( + "mixed_precision of %s currently only supported fp16 " + "and bf16, not support %s.", + experimental::BackendToString(backend_), + phi::DataTypeToString(mixed_precision_))); + break; + default: + PADDLE_THROW(platform::errors::InvalidArgument( + "mixed_precision currently not supported place GPU or XPU or CUSTOM, " + "not support %s.", + experimental::BackendToString(backend_))); + break; } } @@ -70,17 +83,16 @@ void ConvertToMixedPrecisionPass::Run() { framework::ir::AutoMixedPrecisionPass pass; pass.Set("mixed_precision_mode", new int{static_cast(mixed_precision_)}); - pass.Set("mixed_black_list", - new std::unordered_set{black_list_}); if (backend_ == phi::Backend::GPU) { pass.Set("enable_gpu_mixed", new bool{true}); - pass.Set("enable_custom_device_mixed", new bool{false}); + } else if (backend_ == phi::Backend::XPU) { + pass.Set("enable_xpu_mixed", new bool{true}); } else if (backend_ == phi::Backend::CUSTOM) { - pass.Set("enable_gpu_mixed", new bool{false}); pass.Set("enable_custom_device_mixed", new bool{true}); } + pass.Set("mixed_black_list", + new std::unordered_set{black_list_}); pass.Set("keep_io_types", new bool{keep_io_types_}); - pass.Apply(main_graph_.get()); SaveMixedModel(); diff --git 
a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc index 7b12567a33f..72b5d9d316d 100644 --- a/paddle/fluid/inference/api/analysis_predictor.cc +++ b/paddle/fluid/inference/api/analysis_predictor.cc @@ -1302,18 +1302,23 @@ void AnalysisPredictor::PrepareArgument() { << ", we will use a new PassStrategy. Note that only the GPU " "backend is supported for now."; if (!config_.use_cinn_compiler_) { - pass_builder->ClearPasses(); const auto &deleted_passes = pass_builder->GetAllDeletedPasses(); if (config_.tensorrt_engine_enabled()) { + pass_builder->ClearPasses(); for (const auto &pass : kTrtLowerPrecisionPasses) { if (deleted_passes.count(pass)) continue; pass_builder->AppendPass(pass); } } else if (config_.use_gpu()) { + pass_builder->ClearPasses(); for (const auto &pass : kGpuLowerPrecisionPasses) { if (deleted_passes.count(pass)) continue; pass_builder->AppendPass(pass); } + } else if (config_.use_xpu()) { + // All passes support fp16. Not reset pass_builder. + } else { + pass_builder->ClearPasses(); } } } diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc index 9b5c98764d7..7680309744f 100644 --- a/paddle/fluid/inference/api/paddle_pass_builder.cc +++ b/paddle/fluid/inference/api/paddle_pass_builder.cc @@ -519,9 +519,9 @@ XpuPassStrategy::XpuPassStrategy() : PassStrategy({}) { "delete_dropout_op_pass", "identity_scale_op_clean_pass", "generate_sequence_xpu_fuse_pass", + "embedding_with_eltwise_add_xpu_fuse_pass", "multi_encoder_xpu_fuse_pass", "multi_encoder_xpu_slice_fuse_pass", - "embedding_with_eltwise_add_xpu_fuse_pass", "fc_xpu_fuse_pass", "link_xpu_op_max_pass", }); diff --git a/paddle/phi/backends/xpu/xpu2_op_list.cc b/paddle/phi/backends/xpu/xpu2_op_list.cc index 862e893dd86..ee2dbdb9d40 100644 --- a/paddle/phi/backends/xpu/xpu2_op_list.cc +++ b/paddle/phi/backends/xpu/xpu2_op_list.cc @@ -253,7 +253,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::BOOL, phi::DataType::FLOAT16, phi::DataType::FLOAT32})}, - {"fc_xpu", XPUKernelSet({phi::DataType::FLOAT32})}, + {"fc_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"fill", XPUKernelSet({phi::DataType::INT64, phi::DataType::INT32, @@ -461,7 +462,8 @@ XPUOpMap& get_kl2_ops() { phi::DataType::FLOAT16, phi::DataType::INT32, phi::DataType::INT64})}, - {"multi_encoder_xpu", XPUKernelSet({phi::DataType::FLOAT32})}, + {"multi_encoder_xpu", + XPUKernelSet({phi::DataType::FLOAT32, phi::DataType::FLOAT16})}, {"nearest_interp_v2", XPUKernelSet({phi::DataType::FLOAT32})}, {"nearest_interp_v2_grad", XPUKernelSet({phi::DataType::FLOAT32})}, {"not_equal", diff --git a/paddle/phi/common/backend.h b/paddle/phi/common/backend.h index 4e7ec83c427..de3502e5e54 100644 --- a/paddle/phi/common/backend.h +++ b/paddle/phi/common/backend.h @@ -210,6 +210,8 @@ inline std::string BackendToString(const Backend& backend) { return "KPS"; case Backend::IPU: return "IPU"; + case Backend::CUSTOM: + return "CUSTOM"; default: { size_t device_type_id_ = static_cast(backend) - static_cast(Backend::NUM_BACKENDS); diff --git a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc index f0f784f324b..8dbdbff7475 100644 --- a/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/fc_xpu_kernel.cc @@ -33,44 +33,53 @@ void FcXPUKernel(const Context& ctx, float act_alpha, DenseTensor* out, DenseTensor* out_max) { + using XPUType = typename XPUTypeTrait::Type; auto in_mat_dims = 
flatten_to_2d(x.dims(), in_num_col_dims); int m = in_mat_dims[0]; int k = in_mat_dims[1]; int n = w.dims()[0]; + auto* x_data = reinterpret_cast(x.data()); const float* x_max_data = x_max.get_ptr() == nullptr ? nullptr : x_max.get_ptr()->data(); const float* bias_data = bias.get_ptr() == nullptr ? nullptr : bias.get_ptr()->data(); + auto* out_data = reinterpret_cast(ctx.template Alloc(out)); xpu::Activation_t act(static_cast(act_type)); if (act_type == 5) { act.leaky_alpha = act_alpha; } else if (act_type == 15) { act.hard_sigmoid_slope = act_alpha; } - int r = xpu::fc_fusion( // TX, TW. TY, TGEMM - ctx.x_context(), // ctx - x.data(), // x - w.data(), // w - ctx.template Alloc(out), // y - m, // m - n, // n - k, // k - transpose_x, // x_trans - true, // w_trans - x_max_data, // x_maxptr - w_max.data(), // w_maxptr - ctx.template Alloc(out_max), // y_maxptr - transpose_x ? m : k, // ldx - k, // ldw - n, // ldy - alpha, // alpha - beta, // beta - bias_data, // bias - act); + int r = + xpu::fc_fusion( // TX, TW. TY, TGEMM + ctx.x_context(), // ctx + x_data, // x + w.data(), // w + out_data, // y + m, // m + n, // n + k, // k + transpose_x, // x_trans + true, // w_trans + x_max_data, // x_maxptr + w_max.data(), // w_maxptr + ctx.template Alloc(out_max), // y_maxptr + transpose_x ? m : k, // ldx + k, // ldw + n, // ldy + alpha, // alpha + beta, // beta + bias_data, // bias + act); PADDLE_ENFORCE_XDNN_SUCCESS(r, "fc_xpu"); } } // namespace fusion } // namespace phi -PD_REGISTER_KERNEL(fc_xpu, XPU, ALL_LAYOUT, phi::fusion::FcXPUKernel, float) {} +PD_REGISTER_KERNEL(fc_xpu, + XPU, + ALL_LAYOUT, + phi::fusion::FcXPUKernel, + float, + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc b/paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc index 463b463c958..78adca9f86b 100644 --- a/paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc +++ b/paddle/phi/kernels/fusion/xpu/multi_encoder_xpu_kernel.cc @@ -40,18 +40,26 @@ void MultiEncoderXPUKernel(const Context& ctx, DenseTensor* out, DenseTensor* x_fp16, DenseTensor* out_fp16) { - using float16 = typename XPUTypeTrait::Type; - // XPU2 only support fp16 input/output. - float16* x_fp16_data = reinterpret_cast( - ctx.template Alloc(x_fp16)); - int r_cast_x = xpu::cast_v2( - ctx.x_context(), x.data(), x_fp16_data, x.numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r_cast_x, - "multi_encoder_xpu(cast x from fp32 to fp16)"); - - float16* out_fp16_data = reinterpret_cast( - ctx.template Alloc(out_fp16)); + auto x_dtype = x.dtype(); + const float16* x_fp16_data = nullptr; + float16* out_fp16_data = nullptr; + if (x_dtype == phi::DataType::FLOAT32) { + auto* x_fp16_data_t = reinterpret_cast( + ctx.template Alloc(x_fp16)); + int r_cast_x = xpu::cast_v2( + ctx.x_context(), x.data(), x_fp16_data_t, x.numel()); + PADDLE_ENFORCE_XDNN_SUCCESS(r_cast_x, + "multi_encoder_xpu(cast x from fp32 to fp16)"); + x_fp16_data = x_fp16_data_t; + out_fp16_data = reinterpret_cast( + ctx.template Alloc(out_fp16)); + } else { + x_fp16_data = + reinterpret_cast(x.data()); + out_fp16_data = reinterpret_cast( + ctx.template Alloc(out)); + } // q,k,v weight are fused. // Each encoder's weight should be: w0, null, null, w3, w4, w5 @@ -78,8 +86,8 @@ void MultiEncoderXPUKernel(const Context& ctx, ln_scale_data.push_back(ln_scale[i]->data()); ln_bias_data.push_back(ln_bias[i]->data()); } - const T* mask_data = - mask.get_ptr() == nullptr ? nullptr : mask.get_ptr()->data(); + const float* mask_data = + mask.get_ptr() == nullptr ? 
nullptr : mask.get_ptr()->data(); xpu::Activation_t qkv_act(static_cast(act_type)); int batch = x.dims()[0]; @@ -152,10 +160,15 @@ void MultiEncoderXPUKernel(const Context& ctx, PADDLE_ENFORCE_XDNN_SUCCESS(r, "multi_encoder_xpu"); } - int r_cast_out = xpu::cast_v2( - ctx.x_context(), out_fp16_data, ctx.template Alloc(out), out->numel()); - PADDLE_ENFORCE_XDNN_SUCCESS(r_cast_out, - "multi_encoder_xpu(cast out from fp16 to fp32)"); + if (x_dtype == phi::DataType::FLOAT32) { + int r_cast_out = + xpu::cast_v2(ctx.x_context(), + out_fp16_data, + ctx.template Alloc(out), + out->numel()); + PADDLE_ENFORCE_XDNN_SUCCESS( + r_cast_out, "multi_encoder_xpu(cast out from fp16 to fp32)"); + } } } // namespace fusion @@ -165,4 +178,5 @@ PD_REGISTER_KERNEL(multi_encoder_xpu, XPU, ALL_LAYOUT, phi::fusion::MultiEncoderXPUKernel, - float) {} + float, + phi::dtype::float16) {} diff --git a/python/paddle/fluid/tests/unittests/ir/inference/test_xpu_convert_mixed_precision.py b/python/paddle/fluid/tests/unittests/ir/inference/test_xpu_convert_mixed_precision.py new file mode 100644 index 00000000000..f09d00440ac --- /dev/null +++ b/python/paddle/fluid/tests/unittests/ir/inference/test_xpu_convert_mixed_precision.py @@ -0,0 +1,56 @@ +# Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +import os +import tempfile +import unittest + +import paddle +from paddle.inference import ( + PlaceType, + PrecisionType, + convert_to_mixed_precision, +) +from paddle.jit import to_static +from paddle.static import InputSpec +from paddle.vision.models import resnet50 + + +class ConvertMixedPrecison(unittest.TestCase): + def test(self): + self.temp_dir = tempfile.TemporaryDirectory() + model = resnet50(True) + net = to_static( + model, input_spec=[InputSpec(shape=[None, 3, 224, 224], name='x')] + ) + paddle.jit.save( + net, os.path.join(self.temp_dir.name, 'resnet50/inference') + ) + convert_to_mixed_precision( + os.path.join(self.temp_dir.name, 'resnet50/inference.pdmodel'), + os.path.join(self.temp_dir.name, 'resnet50/inference.pdiparams'), + os.path.join( + self.temp_dir.name, 'mixed_precision/inference.pdmodel' + ), + os.path.join( + self.temp_dir.name, 'mixed_precision/inference.pdiparams' + ), + backend=PlaceType.XPU, + mixed_precision=PrecisionType.Half, + ) + self.temp_dir.cleanup() + + +if __name__ == "__main__": + unittest.main() -- GitLab
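Usage note (not part of the patch): the XPU conversion path added above is driven exactly as the bundled unit test does it. A minimal sketch follows, assuming an already-exported fp32 inference model; the file paths are placeholders, and per the check added in convert_to_mixed_precision.cc only fp16 (PrecisionType.Half) is accepted for the XPU backend.

# Minimal sketch: convert an exported fp32 inference model to an XPU
# mixed-precision (fp16) model, mirroring the unit test added in this patch.
# The input/output paths below are placeholders, not part of the patch.
from paddle.inference import PlaceType, PrecisionType, convert_to_mixed_precision

convert_to_mixed_precision(
    './model/inference.pdmodel',         # source fp32 model
    './model/inference.pdiparams',       # source fp32 params
    './model_fp16/inference.pdmodel',    # converted mixed-precision model
    './model_fp16/inference.pdiparams',  # converted mixed-precision params
    backend=PlaceType.XPU,               # XPU backend newly accepted by this patch
    mixed_precision=PrecisionType.Half,  # only fp16 is allowed for XPU here
)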