diff --git a/paddle/fluid/eager/amp_auto_cast.h b/paddle/fluid/eager/amp_auto_cast.h
new file mode 100644
index 0000000000000000000000000000000000000000..6d5758adbe526144d155b8f7dbd6a3e1356ae4e2
--- /dev/null
+++ b/paddle/fluid/eager/amp_auto_cast.h
@@ -0,0 +1,97 @@
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
+#include "paddle/fluid/framework/convert_utils.h"
+
+namespace egr {
+
+static inline bool NeedCast(const paddle::experimental::Tensor& tensor,
+                            const paddle::experimental::DataType& dst_dtype) {
+  auto place = tensor.inner_place();
+  auto data_type = tensor.dtype();
+  if (paddle::platform::is_gpu_place(place) ||
+      paddle::platform::is_cuda_pinned_place(place) ||
+      paddle::platform::is_xpu_place(place) ||
+      paddle::platform::is_mlu_place(place) ||
+      paddle::platform::is_npu_place(place) ||
+      paddle::platform::is_npu_pinned_place(place)) {
+    // CudaPinnedPlace is added for varbase created by dataloader
+    if ((data_type == paddle::experimental::DataType::FLOAT32 ||
+         data_type == paddle::experimental::DataType::FLOAT16 ||
+         data_type == paddle::experimental::DataType::BFLOAT16) &&
+        (data_type != dst_dtype)) {
+      return true;
+    }
+  }
+  return false;
+}
+
+inline std::vector<paddle::experimental::Tensor> AmpAutoCasts(
+    const std::string& inputs_name,
+    const std::vector<paddle::experimental::Tensor>& inputs,
+    const paddle::experimental::DataType& dst_dtype, std::string op_name) {
+  VLOG(6) << "AMP AmpAutoCasts:"
+          << " inputs(" << inputs_name << ") dst_dtype("
+          << paddle::framework::DataType2String(dst_dtype) << ").";
+  std::vector<paddle::experimental::Tensor> inputs_casted;
+  for (auto& input : inputs) {
+    if (NeedCast(input, dst_dtype)) {
+      paddle::framework::AttributeMap cast_attrs = {
+          {"in_dtype", paddle::framework::TransToProtoVarType(input.dtype())},
+          {"out_dtype", paddle::framework::TransToProtoVarType(dst_dtype)}};
+      inputs_casted.emplace_back(
+          std::move(cast_dygraph_function(input, cast_attrs)));
+    } else {
+      inputs_casted.emplace_back(input);
+    }
+  }
+  return inputs_casted;
+}
+
+inline paddle::experimental::Tensor AmpAutoCast(
+    const std::string& input_name, const paddle::experimental::Tensor& input,
+    const paddle::experimental::DataType& dst_dtype, std::string op_name) {
+  VLOG(6) << "AMP AmpAutoCast:"
+          << " input(" << input_name << ") dst_dtype("
+          << paddle::framework::DataType2String(dst_dtype) << ").";
+  if (dst_dtype == paddle::experimental::DataType::FLOAT16) {
+    if (op_name == "run_program") {
+      return input;
+    }
+    if ((op_name == "batch_norm" || op_name == "layer_norm" ||
+         op_name == "sync_batch_norm") &&
+        input_name != "X") {
+      return input;
+    }
+    if ((op_name == "fused_attention" || op_name == "fused_feedforward")) {
+      if (input_name == "LnScale" || input_name == "LnBias" ||
+          input_name == "Ln2Scale" || input_name == "Ln2Bias" ||
+          input_name == "Ln1Scale" || input_name == "Ln1Bias") {
+        return input;
+      }
+    }
+  }
+  if (NeedCast(input, dst_dtype)) {
+    paddle::framework::AttributeMap cast_attrs = {
+        {"in_dtype", paddle::framework::TransToProtoVarType(input.dtype())},
+        {"out_dtype", paddle::framework::TransToProtoVarType(dst_dtype)}};
+    return cast_dygraph_function(input, cast_attrs);
+  }
+  return input;
+}
+
+} // namespace egr
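Illustrative sketch only, not part of the patch: roughly the shape of the AMP block that eager_generator.cc is expected to emit into an old-style (fluid) generated forward function using the helpers above. The op name "matmul_v2", the argument names, and the exact generated signature are assumptions for this example.

    paddle::experimental::Tensor matmul_v2_dygraph_function(
        const paddle::experimental::Tensor& X,
        const paddle::experimental::Tensor& Y,
        const paddle::framework::AttributeMap& attr_map) {
      if (egr::Controller::Instance().GetAMPLevel() !=
          paddle::imperative::AmpLevel::O0) {
        // Collect every tensor input, pick the destination dtype, then cast.
        std::vector<std::vector<paddle::experimental::Tensor>>
            amp_tensors_vector = {{X}, {Y}};
        auto amp_dst_dtype =
            egr::GetAmpDestDtype("matmul_v2", amp_tensors_vector);
        auto NEW_X = egr::AmpAutoCast("X", X, amp_dst_dtype, "matmul_v2");
        auto NEW_Y = egr::AmpAutoCast("Y", Y, amp_dst_dtype, "matmul_v2");
        {
          // Re-enter with AMP switched off so the casts are not applied twice.
          paddle::imperative::AutoCastGuard guard(
              egr::Controller::Instance().GetCurrentTracer(),
              paddle::imperative::AmpLevel::O0);
          return matmul_v2_dygraph_function(NEW_X, NEW_Y, attr_map);
        }
      }
      // ... original non-AMP forward path ...
    }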
diff --git a/paddle/fluid/eager/amp_utils.h b/paddle/fluid/eager/amp_utils.h
index 229af41a38ad0aadba663c9fbe40634a7fd25466..95313bde02a2048bb52ee33c5ad95df7df4a41e3 100644
--- a/paddle/fluid/eager/amp_utils.h
+++ b/paddle/fluid/eager/amp_utils.h
@@ -13,30 +13,27 @@
 // limitations under the License.
 #pragma once
 
-#include <memory>
 #include <string>
-#include "paddle/fluid/eager/api/generated/fluid_generated/dygraph_forward_api.h"
 #include "paddle/fluid/eager/api/utils/global_utils.h"
-#include "paddle/fluid/framework/convert_utils.h"
 #include "paddle/fluid/imperative/amp_auto_cast.h"
 
 namespace egr {
 
 static inline paddle::experimental::DataType GetPromoteType(
-    const std::string& api_name,
+    const std::string& op_name,
     const std::vector<std::vector<paddle::experimental::Tensor>>&
         amp_tensors_vector,
     const paddle::experimental::DataType& amp_dtype) {
   auto dst_type = amp_dtype;
   if (egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype() ==
       "float16") {
-    if (api_name == "batch_norm" || api_name == "layer_norm" ||
-        api_name == "sync_batch_norm") {
+    if (op_name == "batch_norm" || op_name == "layer_norm" ||
+        op_name == "sync_batch_norm") {
       if (amp_tensors_vector[0][0].dtype() ==
           paddle::experimental::DataType::FLOAT32) {
         dst_type = paddle::experimental::DataType::FLOAT32;
       }
-    } else if (api_name == "fused_attention") {
+    } else if (op_name == "fused_attention") {
       for (size_t i = 0; i < amp_tensors_vector.size(); i++) {
         if (i != 3 || i != 4 || i != 9 || i != 10) {
           if (amp_tensors_vector[i][0].dtype() ==
@@ -46,7 +43,7 @@ static inline paddle::experimental::DataType GetPromoteType(
         }
       }
     }
-  } else if (api_name == "fused_feedforward") {
+  } else if (op_name == "fused_feedforward") {
     for (size_t i = 0; i < amp_tensors_vector.size(); i++) {
       if (i != 7 || i != 8 || i != 9 || i != 10) {
         if (amp_tensors_vector[i][0].dtype() ==
@@ -78,7 +75,7 @@ static inline paddle::experimental::DataType GetPromoteType(
   }
   // NOTE(juncai): moving_average_abs_max_scale only consider the dtype of
   // input(X)
-  if (api_name == "moving_average_abs_max_scale") {
+  if (op_name == "moving_average_abs_max_scale") {
     if (amp_tensors_vector[0][0].dtype() ==
         paddle::experimental::DataType::FLOAT16) {
       dst_type = paddle::experimental::DataType::FLOAT16;
@@ -87,33 +84,33 @@
   return dst_type;
 }
 
-paddle::experimental::DataType GetAmpDestDtype(
-    const std::string& api_name,
+inline paddle::experimental::DataType GetAmpDestDtype(
+    const std::string& op_name,
     const std::vector<std::vector<paddle::experimental::Tensor>>&
         amp_tensors_vector) {
   auto amp_dtype =
       egr::Controller::Instance().GetCurrentTracer()->GetAmpDtype();
   auto amp_level = egr::Controller::Instance().GetAMPLevel();
   VLOG(6) << "AMP GetAmpDestDtype:"
-          << " op(" << api_name << ") amp_dtype(" << amp_dtype << ") amp_level("
+          << " op(" << op_name << ") amp_dtype(" << amp_dtype << ") amp_level("
          << static_cast<int>(amp_level) << ").";
   if (amp_dtype == "float16") {
     if (amp_level == paddle::imperative::AmpLevel::O1) {
       if (paddle::imperative::AmpOperators::Instance()
               .GetMutableAllowOps()
-              ->count(api_name)) {
+              ->count(op_name)) {
         return paddle::experimental::DataType::FLOAT16;
       } else if (paddle::imperative::AmpOperators::Instance()
                      .GetMutableBlockOps()
-                     ->count(api_name)) {
+                     ->count(op_name)) {
         return paddle::experimental::DataType::FLOAT32;
       } else {
-        auto dst_type = GetPromoteType(api_name, amp_tensors_vector,
+        auto dst_type = GetPromoteType(op_name, amp_tensors_vector,
                                        paddle::experimental::DataType::FLOAT16);
         if (dst_type == paddle::experimental::DataType::FLOAT16 &&
             paddle::imperative::AmpOperators::Instance()
                 .GetMutableUnsupportedFp16Ops()
-                ->count(api_name)) {
+                ->count(op_name)) {
           dst_type = paddle::experimental::DataType::FLOAT32;
         }
         return dst_type;
@@ -122,10 +119,10 @@ paddle::experimental::DataType GetAmpDestDtype(
       auto dst_type = paddle::experimental::DataType::FLOAT16;
       if (paddle::imperative::AmpOperators::Instance()
               .GetMutableUnsupportedFp16Ops()
-              ->count(api_name) ||
+              ->count(op_name) ||
           paddle::imperative::AmpOperators::Instance()
               .GetMutableBlockOps()
-              ->count(api_name)) {
+              ->count(op_name)) {
         dst_type = paddle::experimental::DataType::FLOAT32;
       }
       return dst_type;
@@ -134,20 +131,20 @@ paddle::experimental::DataType GetAmpDestDtype(
     if (amp_level == paddle::imperative::AmpLevel::O1) {
       if (paddle::imperative::AmpOperators::Instance()
               .GetMutableAllowOps()
-              ->count(api_name)) {
+              ->count(op_name)) {
         return paddle::experimental::DataType::BFLOAT16;
       } else if (paddle::imperative::AmpOperators::Instance()
                      .GetMutableBlockOps()
-                     ->count(api_name)) {
+                     ->count(op_name)) {
         return paddle::experimental::DataType::FLOAT32;
       } else {
         auto dst_type =
-            GetPromoteType(api_name, amp_tensors_vector,
+            GetPromoteType(op_name, amp_tensors_vector,
                            paddle::experimental::DataType::BFLOAT16);
         if (dst_type == paddle::experimental::DataType::BFLOAT16 &&
             paddle::imperative::AmpOperators::Instance()
                 .GetMutableUnsupportedBf16Ops()
-                ->count(api_name)) {
+                ->count(op_name)) {
           dst_type = paddle::experimental::DataType::FLOAT32;
         }
         return dst_type;
@@ -156,10 +153,10 @@ paddle::experimental::DataType GetAmpDestDtype(
       auto dst_type = paddle::experimental::DataType::BFLOAT16;
       if (paddle::imperative::AmpOperators::Instance()
              .GetMutableUnsupportedBf16Ops()
-              ->count(api_name) ||
+              ->count(op_name) ||
           paddle::imperative::AmpOperators::Instance()
              .GetMutableBlockOps()
-              ->count(api_name)) {
+              ->count(op_name)) {
         dst_type = paddle::experimental::DataType::FLOAT32;
       }
       return dst_type;
@@ -168,78 +165,4 @@ paddle::experimental::DataType GetAmpDestDtype(
   return paddle::experimental::DataType::FLOAT32;
 }
 
-static inline bool NeedCast(const paddle::experimental::Tensor& tensor,
-                            const paddle::experimental::DataType& dst_dtype) {
-  auto place = tensor.inner_place();
-  auto data_type = tensor.dtype();
-  if (paddle::platform::is_gpu_place(place) ||
-      paddle::platform::is_cuda_pinned_place(place) ||
-      paddle::platform::is_xpu_place(place) ||
-      paddle::platform::is_mlu_place(place) ||
-      paddle::platform::is_npu_place(place) ||
-      paddle::platform::is_npu_pinned_place(place)) {
-    // CudaPinndePlace is added for varbase created by dataloader
-    if ((data_type == paddle::experimental::DataType::FLOAT32 ||
-         data_type == paddle::experimental::DataType::FLOAT16 ||
-         data_type == paddle::experimental::DataType::BFLOAT16) &&
-        (data_type != dst_dtype)) {
-      return true;
-    }
-  }
-  return false;
-}
-
-std::vector<paddle::experimental::Tensor> AmpAutoCasts(
-    const std::string& inputs_name,
-    const std::vector<paddle::experimental::Tensor>& inputs,
-    const paddle::experimental::DataType& dst_dtype, std::string api_name) {
-  VLOG(6) << "AMP AmpAutoCasts:"
-          << " inputs(" << inputs_name << ") dst_dtype("
-          << paddle::framework::DataType2String(dst_dtype) << ").";
-  std::vector<paddle::experimental::Tensor> inputs_casted;
-  for (auto& input : inputs) {
-    if (NeedCast(input, dst_dtype)) {
-      paddle::framework::AttributeMap cast_attrs = {
-          {"in_dtype", paddle::framework::TransToProtoVarType(input.dtype())},
-          {"out_dtype", paddle::framework::TransToProtoVarType(dst_dtype)}};
-      inputs_casted.emplace_back(
-          std::move(cast_dygraph_function(input, cast_attrs)));
-    } else {
-      inputs_casted.emplace_back(input);
-    }
-  }
-  return inputs_casted;
-}
-
-paddle::experimental::Tensor AmpAutoCast(
-    const std::string& input_name, const paddle::experimental::Tensor& input,
-    const paddle::experimental::DataType& dst_dtype, std::string api_name) {
-  VLOG(6) << "AMP AmpAutoCasts:"
-          << " input(" << input_name << ") dst_dtype("
-          << paddle::framework::DataType2String(dst_dtype) << ").";
-  if (dst_dtype == paddle::experimental::DataType::FLOAT16) {
-    if (api_name == "run_program") {
-      return input;
-    }
-    if ((api_name == "batch_norm" || api_name == "layer_norm" ||
-         api_name == "sync_batch_norm") &&
-        input_name != "X") {
-      return input;
-    }
-    if ((api_name == "fused_attention" || api_name == "fused_feedforward")) {
-      if (input_name == "LnScale" || input_name == "LnBias" ||
-          input_name == "Ln2Scale" || input_name == "Ln2Bias" ||
-          input_name == "Ln1Scale" || input_name == "Ln1Bias") {
-        return input;
-      }
-    }
-  }
-  if (NeedCast(input, dst_dtype)) {
-    paddle::framework::AttributeMap cast_attrs = {
-        {"in_dtype", paddle::framework::TransToProtoVarType(input.dtype())},
-        {"out_dtype", paddle::framework::TransToProtoVarType(dst_dtype)}};
-    return cast_dygraph_function(input, cast_attrs);
-  }
-  return input;
-}
 } // namespace egr
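A small usage sketch, not part of the patch: how generated code is expected to consult GetAmpDestDtype under AMP O1. The op name "elementwise_add" is assumed here to sit in neither the allow list nor the block list, so the result comes from GetPromoteType: any FLOAT32 input keeps the op in FLOAT32, otherwise FLOAT16 is returned.

    #include "paddle/fluid/eager/amp_utils.h"

    static paddle::experimental::DataType QueryAddDstDtype(
        const paddle::experimental::Tensor& x,
        const paddle::experimental::Tensor& y) {
      // One inner vector per input slot, mirroring what the code generators build.
      std::vector<std::vector<paddle::experimental::Tensor>> amp_tensors_vector =
          {{x}, {y}};
      return egr::GetAmpDestDtype("elementwise_add", amp_tensors_vector);
    }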
diff --git a/paddle/fluid/eager/auto_code_generator/eager_generator.cc b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
index 0be4be6c963ed8a71e5271c1e837aa9f4fbaad78..4018c0c0918de0ea91a080762b49a9453de9d01b 100644
--- a/paddle/fluid/eager/auto_code_generator/eager_generator.cc
+++ b/paddle/fluid/eager/auto_code_generator/eager_generator.cc
@@ -2587,6 +2587,7 @@ static void GenerateForwardDygraphFile(const std::string& forward_cc_path,
       "\"paddle/fluid/eager/api/generated/fluid_generated/nodes/nodes.h\"\n"
       "#include \"paddle/fluid/eager/api/utils/global_utils.h\"\n"
       "#include \"paddle/fluid/eager/amp_utils.h\"\n"
+      "#include \"paddle/fluid/eager/amp_auto_cast.h\"\n"
      "#include \"paddle/fluid/platform/profiler/event_tracing.h\"\n\n";
   std::string forward_cc_include_str =
       paddle::string::Sprintf(FORWARD_INCLUDE_TEMPLATE);
diff --git a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
index 01ef711063c8677bc078cebfb1e249ff3e230aab..33c01c83c47837cc441d4b1a3cdfaed26f3da2bb 100644
--- a/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
+++ b/paddle/fluid/eager/auto_code_generator/final_state_generator/eager_gen.py
@@ -163,7 +163,7 @@ FORWARD_FUNCTION_TEMPLATE = \
 """
 {} {}({}) {{
   {}
-
+  {}
   {}
 
   // Returns
@@ -249,6 +249,8 @@ FORWARD_CC_FILE_TEMPLATE = \
 #include "paddle/phi/api/include/sparse_api.h"
 #include "paddle/fluid/eager/api/utils/global_utils.h"
 #include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/fluid/eager/amp_utils.h"
+#include "paddle/fluid/eager/eager_amp_auto_cast.h"
 
 {}
 {}
@@ -304,6 +306,23 @@ BUMP_INPLACE_VERSION_TEMPLATE = \
 """
 
 
+AMP_LOGIC_TEMPLATE = \
+"""
+  if (egr::Controller::Instance().GetAMPLevel() != paddle::imperative::AmpLevel::O0) {{
+    VLOG(5) << "Check and Prepare For AMP";
+    {}
+    std::vector<std::vector<paddle::experimental::Tensor>> amp_tensors_vector = {};
+    {}
+    {}
+    {}
+    {{
+      paddle::imperative::AutoCastGuard guard(egr::Controller::Instance().GetCurrentTracer(), paddle::imperative::AmpLevel::O0);
+      {}
+    }}
+  }}
+"""
+
+
 #######################
 ## Generator Helpers ##
 #######################
@@ -769,26 +788,51 @@ class DygraphSingleFunctionGenerator(FunctionGeneratorBase):
         inputs_args_definition_list = ["" for i in range(num_inputs)]
         inputs_args_declaration_list = ["" for i in range(num_inputs)]
         inputs_call_list = ["" for i in range(num_inputs)]
+        amp_inputs_call_list = ["" for i in range(num_inputs)]
+        amp_tensors_vector_list = []
+        amp_tensors_vector_optional_list = []
+        amp_autocast_list = []
+        amp_autocast_optional_list = []
         for name, (ttype, pos) in forward_inputs_position_map.items():
             inputs_call_list[pos] = f"{name}"
+            amp_inputs_call_list[pos] = f"NEW_{name}"
             is_optional = (name in optional_inputs)
             if IsPlainTensorType(ttype):
                 if is_optional:
                     arg_str = f"const paddle::optional<paddle::experimental::Tensor> {name}"
+                    amp_tensors_vector_optional_list.append(
+                        f"if ({name}.is_initialized()) amp_tensors_vector.push_back({{{name}.get()}});\n"
+                    )
+                    amp_autocast_optional_list.append(
+                        f"auto NEW_{name} = {name}.is_initialized() ? egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name) : {name};\n"
+                    )
                 else:
                     if inplace_map and name in inplace_map.keys():
                         arg_str = f"paddle::experimental::Tensor& {name}"
+                        amp_tensors_vector_list.append(f"{{{name}}}")
+                        amp_autocast_list.append(
+                            f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n"
+                        )
                    else:
                         arg_str = f"const paddle::experimental::Tensor& {name}"
+                        amp_tensors_vector_list.append(f"{{{name}}}")
+                        amp_autocast_list.append(
+                            f"auto NEW_{name} = egr::EagerAmpAutoCast(\"{name}\", {name}, amp_dst_dtype, op_name);\n"
+                        )
             else:
                 assert IsVectorTensorType(ttype)
                 arg_str = f"const std::vector<paddle::experimental::Tensor>& {name}"
+                amp_tensors_vector_list.append(f"{name}")
+                amp_autocast_list.append(
+                    f"auto NEW_{name} = egr::EagerAmpAutoCasts(\"{name}\", {name}, amp_dst_dtype, op_name);\n"
+                )
 
             inputs_args_definition_list[pos] = arg_str
             inputs_args_declaration_list[pos] = arg_str
 
         for name, atype, default_val, pos in forward_attrs_list:
             inputs_call_list[pos] = name
+            amp_inputs_call_list[pos] = name
             if default_val is not None:
                 inputs_args_declaration_list[
                     pos] = f"{atype} {name} = {default_val}"
@@ -843,9 +887,28 @@ class DygraphSingleFunctionGenerator(FunctionGeneratorBase):
         dygraph_event_str = f"paddle::platform::RecordEvent dygraph_entrance_record_event(\"{forward_api_name} dygraph\", paddle::platform::TracerEventType::Operator, 1);"
         forward_function_name = GetDygraphForwardFunctionName(forward_api_name)
 
+        # Forward amp logic
+        kernel_trans2_op_name_str = f"auto op_name = phi::TransToFluidOpName(\"{forward_api_name}\");"
+        amp_tensors_vector_list_str = "{ " + ",".join(
+            amp_tensors_vector_list) + " }"
+        amp_tensors_vector_optional_list_str = "".join(
+            amp_tensors_vector_optional_list)
+        amp_get_dst_dtype_str = f"auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector);\n"
+        amp_autocast_list_str = " ".join(
+            amp_autocast_list) + " ".join(amp_autocast_optional_list)
+        amp_inputs_call_args_str = ", ".join(amp_inputs_call_list)
+        amp_call_str = f"return {forward_function_name}({amp_inputs_call_args_str});"
+        if is_inplaced or (forward_api_name == "cast"):
+            amp_logic_str = ""
+        else:
+            amp_logic_str = AMP_LOGIC_TEMPLATE.format(
+                kernel_trans2_op_name_str, amp_tensors_vector_list_str,
+                amp_tensors_vector_optional_list_str, amp_get_dst_dtype_str,
+                amp_autocast_list_str, amp_call_str)
+
         self.forward_definition_str += FORWARD_FUNCTION_TEMPLATE.format(
             returns_type_str, forward_function_name, inputs_args_definition_str,
-            dygraph_event_str, node_creation_str, returns_str)
+            dygraph_event_str, amp_logic_str, node_creation_str, returns_str)
 
         self.forward_declaration_str += f"{returns_type_str} {forward_function_name}({inputs_args_declaration_str});\n"
 
         logging.info(
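For reference, a sketch (not part of the patch) of the C++ that AMP_LOGIC_TEMPLATE expands to for a hypothetical final-state api "matmul" with tensor inputs x, y and attributes transpose_x, transpose_y; inplace apis and "cast" itself skip this block entirely. The api and variable names are illustrative only.

    // Generated at the top of the final-state matmul forward function:
    if (egr::Controller::Instance().GetAMPLevel() != paddle::imperative::AmpLevel::O0) {
      VLOG(5) << "Check and Prepare For AMP";
      auto op_name = phi::TransToFluidOpName("matmul");
      std::vector<std::vector<paddle::experimental::Tensor>> amp_tensors_vector = { {x},{y} };
      auto amp_dst_dtype = egr::GetAmpDestDtype(op_name, amp_tensors_vector);
      auto NEW_x = egr::EagerAmpAutoCast("x", x, amp_dst_dtype, op_name);
      auto NEW_y = egr::EagerAmpAutoCast("y", y, amp_dst_dtype, op_name);
      {
        paddle::imperative::AutoCastGuard guard(egr::Controller::Instance().GetCurrentTracer(), paddle::imperative::AmpLevel::O0);
        return matmul_final_state_dygraph_function(NEW_x, NEW_y, transpose_x, transpose_y);
      }
    }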
== "Ln1Scale" || input_name == "Ln1Bias") { + return input; + } + } + } + if (NeedCast(input, dst_dtype)) { + return cast_final_state_dygraph_function(input, dst_dtype); + } + return input; +} + +} // namespace egr