diff --git a/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt b/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt index bbd2e93184315922a4f75c06d7eeeaf8fd7f15a8..3885c29c6a909d23ea292e6f9ce1ca7091fdeb78 100644 --- a/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt +++ b/paddle/fluid/framework/new_executor/interpreter/CMakeLists.txt @@ -1,9 +1,12 @@ set(INTERPRETER_SRCS data_transfer.cc dependency_builder.cc execution_config.cc - interpreter_util.cc stream_analyzer.cc) + interpreter_util.cc static_build.cc stream_analyzer.cc) set(INTERPRETER_DEPS + buffered_reader device_context + global_utils op_registry + phi_tensor_utils scope framework_proto data_feed_proto diff --git a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc index 7024a57bb84b12e5210bfe694eee75a734850449..1b208b1967018a93f4b7649a836124f83da81295 100644 --- a/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc +++ b/paddle/fluid/framework/new_executor/interpreter/data_transfer.cc @@ -17,6 +17,7 @@ #include "paddle/fluid/framework/convert_utils.h" #include "paddle/fluid/framework/data_transform.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/static_build.h" #include "paddle/phi/core/kernel_context.h" #include "paddle/phi/core/kernel_factory.h" @@ -37,7 +38,7 @@ bool DataTranferHelper::apply(const phi::KernelKey& kernel_type_for_var, std::vector* op_func_nodes, bool use_local_scope, bool is_fetch_v2, - bool skip_run) { + bool static_build) { bool is_transferred = false; auto* src_var_name = &var_name; @@ -52,7 +53,7 @@ bool DataTranferHelper::apply(const phi::KernelKey& kernel_type_for_var, is_fetch_v2); if (op) { RunAndConstructOpFuncNode( - op, *src_var_name, *new_var_name, op_func_nodes, skip_run); + op, *src_var_name, *new_var_name, op_func_nodes, static_build); } // update src_var_name src_var_name = new_var_name; @@ -70,7 +71,7 @@ bool DataTranferHelper::apply(const phi::KernelKey& kernel_type_for_var, scope_); if (op) { RunAndConstructOpFuncNode( - op, *src_var_name, *new_var_name, op_func_nodes, skip_run); + op, *src_var_name, *new_var_name, op_func_nodes, static_build); } // update src_var_name src_var_name = new_var_name; @@ -87,7 +88,7 @@ bool DataTranferHelper::apply(const phi::KernelKey& kernel_type_for_var, *src_var_name, new_var_name, src_place, dst_place, var_scope_, scope_); if (op) { RunAndConstructOpFuncNode( - op, *src_var_name, *new_var_name, op_func_nodes, skip_run); + op, *src_var_name, *new_var_name, op_func_nodes, static_build); } is_transferred = true; } @@ -98,7 +99,7 @@ void DataTranferHelper::RunAndConstructShareNode( const std::string& src_var_name, const std::string& dst_var_name, std::vector* op_func_nodes, - bool skip_run) { + bool static_build) { VariableNameMap in_name_map = {{"X", {src_var_name}}}; VariableNameMap out_name_map = {{"Out", {dst_var_name}}}; AttributeMap attr_map; @@ -112,7 +113,7 @@ void DataTranferHelper::RunAndConstructShareNode( "Insert %s with %s -> %s.", op_type, src_var_name, dst_var_name); RunAndConstructOpFuncNode( - op, src_var_name, dst_var_name, op_func_nodes, skip_run); + op, src_var_name, dst_var_name, op_func_nodes, static_build); } void DataTranferHelper::RunAndConstructOpFuncNode( @@ -120,15 +121,18 @@ void DataTranferHelper::RunAndConstructOpFuncNode( const std::string& var_name, const std::string& new_var_name, std::vector* 
new_op_func_nodes, - bool skip_run) { + bool static_build) { auto& op_type = op->Type(); // 1. Construct RuntimeContext RuntimeContext runtime_context({}, {}); runtime_context.inputs["X"] = {scope_->FindVar(var_name)}; runtime_context.outputs["Out"] = {scope_->Var(new_var_name)}; - RuntimeInferShapeContext infer_shape_ctx(*op, runtime_context); - op.get()->Info().infer_shape_(&infer_shape_ctx); + + if (!static_build) { + RuntimeInferShapeContext infer_shape_ctx(*op, runtime_context); + op->Info().infer_shape_(&infer_shape_ctx); + } // 2. choose kernel @@ -203,8 +207,9 @@ void DataTranferHelper::RunAndConstructOpFuncNode( } else { new_op_func_node.phi_kernel_ = op_with_kernel->PhiKernel(); - if (skip_run) { + if (static_build) { FakeInitializeOutputsForFunctionKernel( + *op, *(new_op_func_node.phi_kernel_), *(op_with_kernel->PhiKernelSignature()), runtime_context, @@ -449,7 +454,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, OpFuncNode* op_func_node, std::vector* new_op_func_nodes, bool use_local_scope, - bool skip_run) { + bool static_build) { Scope* local_scope = use_local_scope ? var_scope->GetMutableLocalScope() : var_scope->GetMutableScope(); @@ -546,7 +551,11 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, op_base->Type() == "fetch_v2"); if (op) { data_transfer_helper.RunAndConstructOpFuncNode( - op, var_name, new_var_name, new_op_func_nodes, skip_run); + op, + var_name, + new_var_name, + new_op_func_nodes, + static_build); } is_transferred = true; } else { @@ -611,7 +620,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key, new_op_func_nodes, use_local_scope, op_base->Type() == "fetch_v2", - skip_run); + static_build); } if (is_transferred) { @@ -741,7 +750,7 @@ void HandleComplexGradToRealGrad(const OpFuncNode& op_func_node, VariableScope* var_scope, std::vector* op_func_nodes, framework::Scope* local_scope, - bool skip_run) { + bool static_build) { DataTranferHelper data_transfer_helper(place, var_scope, local_scope); for (auto& var_name_item : out_names) { std::vector& vars = out_vars->at(var_name_item.first); @@ -817,9 +826,9 @@ void HandleComplexGradToRealGrad(const OpFuncNode& op_func_node, auto op = TransferDtype( var_name, &new_var_name, src_type, dst_type, var_scope, local_scope); data_transfer_helper.RunAndConstructOpFuncNode( - op, var_name, new_var_name, op_func_nodes, skip_run); + op, var_name, new_var_name, op_func_nodes, static_build); data_transfer_helper.RunAndConstructShareNode( - new_var_name, var_name, op_func_nodes, skip_run); + new_var_name, var_name, op_func_nodes, static_build); } } } diff --git a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc index babac78146f406a7911656dbefc35549313b3a31..6709ad8978b9d5fc061e38e219613a471add9a39 100644 --- a/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc +++ b/paddle/fluid/framework/new_executor/interpreter/dependency_builder.cc @@ -61,10 +61,9 @@ const std::string StringizeDownstreamMap( const std::map>& DependencyBuilder::Build( const std::vector& instructions) { - PADDLE_ENFORCE_EQ( - is_build_, - false, - phi::errors::AlreadyExists("The op dependency has been built")); + if (is_build_) { + return op_downstream_map_; + } instructions_ = &instructions; op_num_ = instructions_->size(); diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc index 
29626988132f929be40dec4b9ea0c4ebe876138b..0a93659cf9c16e49b8601f1810e7120f496c327d 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/executor_gc_helper.h" #include "paddle/fluid/framework/new_executor/interpreter/data_transfer.h" #include "paddle/fluid/framework/new_executor/interpreter/execution_config.h" +#include "paddle/fluid/framework/new_executor/interpreter/static_build.h" #include "paddle/fluid/memory/stats.h" #include "paddle/fluid/operators/controlflow/conditional_block_op_helper.h" #include "paddle/fluid/operators/controlflow/recurrent_op_helper.h" @@ -48,34 +49,6 @@ namespace interpreter { using VariableIdMap = std::map>; -// These Op needs set output dtype when register phi kernel, but they didn't -static std::set OpsNeedSetOutputDtypeWhenRegisterPhiKernel = { - "abs", - "adam", - "adamw", - "any_raw", - "eig_grad", - "eigh", - "lamb", - "layer_norm", - "layer_norm_grad", - "less_equal", - "less_than", - "merged_adam", - "sync_batch_norm_grad", - "unique", - "unique_consecutive_flattened_tensor", - "unique_raw"}; - -// These Ops can use InferMeta to infer the output dtype -static std::set OpsWithAvailablePhiInferMeta = { - "abs", "adam", "adamw", "layer_norm", "layer_norm_grad", "merged_adam"}; - -// Cannot static analysis these Ops' output dtype or backend because their -// kernels have not moved to PHI yet. -static std::set OpsWithFluidKernelNeedMoveToPhi = { - "fused_batch_norm_act", "fused_batch_norm_act_grad"}; - // NOTE(Ruibiao): SingleStreamGuard make some multi-strem op (i.e., // c_allreduce_sum) run in single stream. It is dedicated to BuildOpFuncList // which run kernel without stream synchronization. 
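Note: the static_build.h include added above reflects the two-phase pattern this patch builds on: BuildOpFuncList first constructs every OpFuncNode while only fake-allocating kernel outputs (no kernel executes), and the real kernels run afterwards against outputs whose place, dtype and layout were fixed up front. A minimal standalone sketch of that pattern follows; Op, Output, FakeAlloc and RealRun are illustrative names, not Paddle APIs.

#include <iostream>
#include <string>
#include <vector>

struct Output { std::string dtype = "undefined"; bool allocated = false; };
struct Op { std::string name; Output out; };

// Phase 1: decide output metadata and reserve storage without running kernels.
void FakeAlloc(Op& op) {
  op.out.dtype = "float32";   // dtype decided statically (e.g. via InferMeta)
  op.out.allocated = true;    // zero-size "fake" allocation
}

// Phase 2: execute kernels; outputs already carry the right metadata.
void RealRun(const Op& op) {
  std::cout << op.name << " runs with preallocated " << op.out.dtype << " output\n";
}

int main() {
  std::vector<Op> block = {{"matmul"}, {"relu"}};
  for (auto& op : block) FakeAlloc(op);  // static build pass
  for (auto& op : block) RealRun(op);    // execution pass
}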
@@ -145,48 +118,6 @@ void AsyncWorkQueue::AddTask(const OpFuncType& op_func_type, queue_group_->AddTask(op_func_type == OpFuncType::kGpuAsync, std::move(fn)); } -bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) { - // has_fluid_kernel = (kernelCode >> 3) & 1 - // has_structed_kernel = (kernelCode >> 2) & 1 - // need_move_to_phi = (kernelCode >> 1) & 1 - // need_set_dtype = KernelCode & 1 - using KernelCode = int8_t; - std::set> invalid_ops; - for (auto& op : block.AllOps()) { - auto op_type = op->Type(); - bool has_fluid_kernel = OperatorWithKernel::AllOpKernels().count(op_type); - bool has_structured_kernel = - phi::KernelFactory::Instance().HasStructuredKernel(op_type); - bool need_move_to_phi = (has_fluid_kernel || has_structured_kernel) && - OpsWithFluidKernelNeedMoveToPhi.count(op_type); - bool need_set_dtype = - !has_fluid_kernel && !has_structured_kernel && - OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count(op_type) && - !OpsWithAvailablePhiInferMeta.count(op_type); - - KernelCode kernel_code = (has_fluid_kernel << 3) + - (has_structured_kernel << 2) + - (need_move_to_phi << 1) + need_set_dtype; - if (need_move_to_phi || need_set_dtype) { - invalid_ops.insert(std::make_pair(op_type, kernel_code)); - } - } - - if (!invalid_ops.empty()) { - std::stringstream ss; - ss << "The following OPs are unable to static build:\n"; - for (auto& item : invalid_ops) { - ss << item.first << " [has_fluid_kernel = " << (item.second >> 3 & 1) - << ", has_structed_kerenl = " << (item.second >> 2 & 1) - << ", need_move_to_phi = " << (item.second >> 1 & 1) - << ", need_set_dtype = " << (item.second & 1) << "]\n"; - } - VLOG(0) << ss.str(); - } - - return invalid_ops.empty(); -} - bool IsCommunicationOp(const std::string& op_name) { const std::set special_comm_op_set = { "send", @@ -492,17 +423,25 @@ void ApplyDeviceGuard(const OperatorBase* op_base, } void HandleOperatorBase(const platform::Place& place, - const VariableScope* var_scope, - std::shared_ptr op_base, + std::shared_ptr op, OpFuncNode* op_func_node, - Scope* local_scope) { + Scope* scope, + bool static_build) { platform::DeviceContextPool& pool = platform::DeviceContextPool::Instance(); auto* dev_ctx = pool.Get(place); // input, output is prepared. set the other attributes. - op_func_node->operator_base_ = op_base; + op_func_node->operator_base_ = op; op_func_node->type_ = AnalyseOpFuncType(*op_func_node, place); op_func_node->kernel_func_ = nullptr; - op_base->Run(*local_scope, place); // Run without data transformer. + if (static_build) { + if (OperatorBasesMustRunInStaticBuild.count(op->Type())) { + op->Run(*scope, place); + } + FakeInitializeOutputsForOperatorBase(*op, place, scope); + } else { + op->Run(*scope, place); // Run without data transformer. + } + op_func_node->dev_ctx_ = dev_ctx; } @@ -636,7 +575,7 @@ void BuildOpFuncList(const platform::Place& place, VLOG(4) << "HandleOperatorBase"; // op is not a operatorwithkernel, so direcly run OperatorBase::Run() HandleOperatorBase( - place, var_scope, ops[i], &op_func_node, local_scope); + place, ops[i], &op_func_node, local_scope, static_build); vec_func_list->emplace_back(op_func_node); } else { VLOG(4) << "OP is not null"; @@ -754,15 +693,18 @@ void BuildOpFuncList(const platform::Place& place, use_local_scope, static_build); VLOG(4) << "apply data transform done. "; - // step 4. infershape, see OperatorWithKernel::RunImpl in operator.cc - // for why. 
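Note: the BlockCanBeStaticBuilt removed here (and re-added in static_build.cc with more checks) reports why an op blocks static build by packing boolean flags into a small integer, the KernelCode. A compilable sketch of that encode/decode idea with just two of the flags; the full patch uses an eight-bit layout.

#include <cstdint>
#include <iostream>

int main() {
  bool has_fluid_kernel = true;
  bool need_set_dtype = false;

  // pack: bit 1 = has_fluid_kernel, bit 0 = need_set_dtype
  std::int8_t kernel_code = (has_fluid_kernel << 1) + need_set_dtype;

  // unpack for the diagnostic message, mirroring "(item.second >> k) & 1"
  std::cout << "has_fluid_kernel = " << ((kernel_code >> 1) & 1)
            << ", need_set_dtype = " << (kernel_code & 1) << "\n";
}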
- if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) && - op->Attr(kAllKernelsMustComputeRuntimeShape))) { + + // step 4. infershape + if (!static_build) { VLOG(4) << "infer shape"; - RuntimeInferShapeContext infer_shape_ctx(*op, runtime_context); - // TODO(Aurelius84): In case of control flow ops, they are NOT - // inheritted from OperatorWithKernel. - op_with_kernel->Info().infer_shape_(&infer_shape_ctx); + // see kAllKernelsMustComputeRuntimeShape in operator.h for why + if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) && + op->Attr(kAllKernelsMustComputeRuntimeShape))) { + RuntimeInferShapeContext infer_shape_ctx(*op, runtime_context); + // TODO(Aurelius84): In case of control flow ops, they are NOT + // inheritted from OperatorWithKernel. + op_with_kernel->Info().infer_shape_(&infer_shape_ctx); + } } // step 5. run kernel @@ -772,6 +714,7 @@ void BuildOpFuncList(const platform::Place& place, VLOG(6) << op_type << " run function kernel"; if (static_build) { FakeInitializeOutputsForFunctionKernel( + *op, *(op_func_node.phi_kernel_), *(op_with_kernel->PhiKernelSignature()), runtime_context, @@ -826,7 +769,27 @@ void BuildOpFuncList(const platform::Place& place, auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar( local_scope->FindVar(var_scope->GetNameById(p.second))); - original_tensor->ShareDataWith(*transformed_tensor); + + // avoid overwriting valid data + if (static_build && original_tensor->initialized()) { + const phi::Place& target_place = transformed_tensor->place(); + platform::DeviceContext* dev_ctx_for_copy; + if (target_place.GetType() != AllocationType::CPU) { + dev_ctx_for_copy = pool.Get(target_place); + } else { + dev_ctx_for_copy = pool.Get(original_tensor->place()); + } + + phi::Copy(*dev_ctx_for_copy, + *original_tensor, + target_place, + /*blocking=*/true, + original_tensor); + original_tensor->set_type(transformed_tensor->dtype()); + original_tensor->set_layout(transformed_tensor->layout()); + } else { + original_tensor->ShareDataWith(*transformed_tensor); + } VLOG(4) << "Transfer inplace variable back form " << var_scope->GetNameById(p.first) << " to " << var_scope->GetNameById(p.second); @@ -866,32 +829,35 @@ void BuildOpFuncList(const platform::Place& place, VLOG(4) << "End run " << place << " " << op_func_node.operator_base_->DebugStringEx(local_scope); - // gc--------------------------------------------- - auto iter = unused_var_map.find(op); - if (iter == unused_var_map.end()) { - interpreter::LogDeviceMemoryStats(place); - continue; - } - - auto& delete_vars = iter->second; - std::deque>* garbages = - new std::deque>(); - - for (auto& var_name : delete_vars) { - auto* var = local_scope->FindVar(var_name); - if (var == nullptr || skip_gc_vars.find(var_name) != skip_gc_vars.end()) { + if (!static_build) { + // gc--------------------------------------------- + auto iter = unused_var_map.find(op); + if (iter == unused_var_map.end()) { + interpreter::LogDeviceMemoryStats(place); continue; } - VLOG(6) << "Erase variable " << var_name; - if (var->IsType()) { - garbages->emplace_back( - var->GetMutable()->MoveMemoryHolder()); + auto& delete_vars = iter->second; + std::deque>* garbages = + new std::deque>(); + + for (auto& var_name : delete_vars) { + auto* var = local_scope->FindVar(var_name); + if (var == nullptr || + skip_gc_vars.find(var_name) != skip_gc_vars.end()) { + continue; + } + + VLOG(6) << "Erase variable " << var_name; + if (var->IsType()) { + garbages->emplace_back( + var->GetMutable()->MoveMemoryHolder()); + } } - } - delete 
garbages; // free mem + delete garbages; // free mem - interpreter::LogDeviceMemoryStats(place); + interpreter::LogDeviceMemoryStats(place); + } } } @@ -942,160 +908,6 @@ void BuildVariableScope(const framework::BlockDesc& block, } } -phi::TensorBase* GetTensorFormVar(framework::Variable* var) { - if (var) { - if (var->template IsType()) { - return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (var->template IsType()) { - return var->template GetMutable(); - } else if (!var->IsInitialized()) { - // The following is for RAW type of var - return var->template GetMutable(); - } else { - PADDLE_THROW(platform::errors::Unimplemented( - "Unsupported `%s` type when get tensor.", - framework::ToTypeName(var->Type()))); - } - } else { - VLOG(4) << "Var is nullptr"; - return nullptr; - } -} - -void FakeInitializeTensor(const platform::DeviceContext& dev_ctx, - const phi::DataType& dtype, - const phi::Place& place, - phi::TensorBase* tensor) { - PADDLE_ENFORCE_NOT_NULL( - tensor, - phi::errors::InvalidArgument( - "The tensor to fake intialize should not be null.")); - if (place == phi::CPUPlace()) { - dev_ctx.HostAlloc(tensor, - dtype, - /*requested_size=*/0, - /*fake_alloc=*/true); - } else { - PADDLE_ENFORCE_EQ( - place, - dev_ctx.GetPlace(), - phi::errors::Unavailable("The place %s for fack alloc is not equal to " - "the place %s of DeviceContext.", - place, - dev_ctx.GetPlace())); - dev_ctx.Alloc(tensor, - dtype, - /*requested_size=*/0, - /*pinned=*/false, - /*fake_alloc=*/true); - } -} - -void FakeInitializeOutputsForFunctionKernel( - const phi::Kernel& phi_kernel, - const phi::KernelSignature& kernel_sig, - const RuntimeContext& ctx, - const platform::DeviceContext& dev_ctx) { - std::string op_name = std::string(kernel_sig.name); - if (OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count(op_name)) { - PADDLE_ENFORCE_GT( - OpsWithAvailablePhiInferMeta.count(op_name), - 0, - phi::errors::Unavailable( - "Cannot static build for op %s because it did not set output dtype " - "in phi kernel register. Please set its output dtype and remove it " - "from OpsNeedSetOutputDtypeWhenRegisterPhiKernel set, or add it to " - " OpsWithAvailablePhiInferMeta set if its InferMeta is available.", - op_name)); - } - - auto output_names = kernel_sig.output_names; - auto output_defs = phi_kernel.args_def().output_defs(); - PADDLE_ENFORCE_EQ(output_names.size(), - output_defs.size(), - platform::errors::InvalidArgument( - "The size of outputs_args names (%d) must be equal to " - "the size of kernel output_defs (%d).", - output_names.size(), - output_defs.size())); - - size_t start_idx = 0; - for (size_t i = 0; i < output_names.size(); ++i) { - auto it = ctx.outputs.find(output_names[i]); - - // Deal with the case that some outputs are not found or be NULL when run - // the kernel. For example : the outputs of matmul_grad are dx and dy, - // sometimes dx or dy may be NULL. 
- if (it == ctx.outputs.end() || it->second.empty()) { - VLOG(4) << "Output " << output_names[i] << " not found"; - ++start_idx; - continue; - } - - auto& outs_vector = it->second; - for (size_t offset = 0; offset < outs_vector.size(); ++offset) { - phi::TensorBase* out_tensor = GetTensorFormVar(outs_vector[offset]); - if (out_tensor && !out_tensor->initialized()) { - phi::TensorArgDef& tensor_arg_def = output_defs[start_idx + offset]; - phi::DataType dtype = tensor_arg_def.dtype; - phi::Place place = tensor_arg_def.backend == phi::Backend::CUSTOM - ? dev_ctx.GetPlace() - : phi::TransToPhiPlace(tensor_arg_def.backend); - - if (dtype == DataType::UNDEFINED || - OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count( - std::string(kernel_sig.name))) { - VLOG(4) << "Get dtype result from InferMeta"; - dtype = out_tensor->dtype(); // dtype from InferMeta - } - - VLOG(4) << output_names[i] << " fake alloc with type " << dtype - << " on place " << place << " " << out_tensor; - - FakeInitializeTensor(dev_ctx, dtype, place, out_tensor); - } - } - start_idx += outs_vector.size(); - } -} - -void FakeInitializeOutputsForStructureKernel( - const framework::OpKernelType& op_kernel_type, - ExecutionContext* execution_context) { - const std::string& op_type = execution_context->Type(); - if (op_type == "fetch_v2") { - return; - } - - const VariableNameMap& outputs = execution_context->GetOp().Outputs(); - for (auto& item : outputs) { - const std::string& parameter_name = item.first; - auto multi_output_var = execution_context->MultiOutputVar(parameter_name); - for (Variable* var : multi_output_var) { - phi::TensorBase* out_tensor = GetTensorFormVar(var); - if (out_tensor && !out_tensor->initialized()) { - phi::DataType dtype = - phi::TransToPhiDataType(op_kernel_type.data_type_); - phi::Place place = execution_context->GetPlace(); - - VLOG(4) << parameter_name << " fake alloc with type " << dtype - << " on place " << place << " " << out_tensor; - - FakeInitializeTensor( - execution_context->device_context(), dtype, place, out_tensor); - } - } - } -} - void LogDeviceMemoryStats(const platform::Place& place) { if (FLAGS_new_executor_log_memory_stats && platform::is_gpu_place(place)) { VLOG(0) << "memory_allocated: " diff --git a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h index 5a74ab662b88899bd8a63f243a35d297a48d6970..f31dd7f789d3743152ddfc6c505ff458df5affd5 100644 --- a/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h +++ b/paddle/fluid/framework/new_executor/interpreter/interpreter_util.h @@ -65,8 +65,6 @@ class AsyncWorkQueue { std::unique_ptr queue_group_; }; -bool BlockCanBeStaticBuilt(const framework::BlockDesc& block); - bool IsCommunicationOp(const std::string& op_name); bool IsCommunicationOp(const Instruction& instr); @@ -99,16 +97,6 @@ void BuildVariableScope(const framework::BlockDesc& block, const ExecutionConfig& execution_config, VariableScope* var_scope); -void FakeInitializeOutputsForFunctionKernel( - const phi::Kernel& phi_kernel, - const phi::KernelSignature& kernel_sig, - const RuntimeContext& ctx, - const platform::DeviceContext& dev_ctx); - -void FakeInitializeOutputsForStructureKernel( - const framework::OpKernelType& op_kernel_type, - ExecutionContext* execution_context); - void LogDeviceMemoryStats(const platform::Place& place); void SetDeviceCommContext(framework::OperatorBase* operator_base, diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.cc 
b/paddle/fluid/framework/new_executor/interpreter/static_build.cc new file mode 100644 index 0000000000000000000000000000000000000000..fc63666ba720b649ec40da5b76529508388e38b4 --- /dev/null +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.cc @@ -0,0 +1,533 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. + +#include "paddle/fluid/framework/new_executor/interpreter/static_build.h" + +#include "paddle/fluid/eager/api/utils/global_utils.h" +#include "paddle/fluid/framework/reader.h" +#include "paddle/fluid/operators/reader/buffered_reader.h" + +// These Ops is OperatorBase, but we have been handle them in static build +std::set OperatorBasesHandledInStaticBuild = {"read"}; + +std::set OperatorBasesMustRunInStaticBuild = { + "create_double_buffer_reader", "create_py_reader"}; + +std::set OpsCanSkipedFakeAllocInStaticBuild = { + "create_double_buffer_reader", "create_py_reader", "fetch_v2"}; + +// These Op needs set output dtype when register phi kernel, but they didn't +std::set OpsNeedSetOutputDtypeWhenRegisterPhiKernel = { + "eig_grad", + "eigh", + "lamb", + "sync_batch_norm_grad", + "update_loss_scaling", + "unique", + "unique_consecutive_flattened_tensor", + "unique_raw"}; + +// Cannot static analysis these Ops' output dtype or backend because their +// kernels have not moved to PHI yet. 
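Note: these sets drive the rebuilt BlockCanBeStaticBuilt below: an op only prevents static build when it is not in the skip list and falls into an unsupported category (blacklisted, an OperatorBase not explicitly handled, a custom or MKLDNN op, a kernel still in fluid, or a kernel that never sets its output dtype). A simplified standalone version of that decision, with abridged set contents and the hypothetical name OpBlocksStaticBuild:

#include <set>
#include <string>

bool OpBlocksStaticBuild(const std::string& op_type) {
  static const std::set<std::string> can_skip = {"fetch_v2", "create_py_reader"};
  static const std::set<std::string> black_list = {"run_program", "batch_norm"};
  static const std::set<std::string> need_set_dtype = {"unique", "lamb"};

  if (can_skip.count(op_type)) return false;   // never blocks static build
  return black_list.count(op_type) > 0 ||      // unsupported outright
         need_set_dtype.count(op_type) > 0;    // kernel lacks an output dtype
}

int main() { return OpBlocksStaticBuild("run_program") ? 0 : 1; }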
+std::set OpsWithFluidKernelNeedMoveToPhi = { + "cudnn_lstm", + "dequantize", + "distributed_fused_lamb", + "fused_attention", + "fused_attention_grad", + "fused_batch_norm_act", + "fused_batch_norm_act_grad", + "fusion_group", + "pow2_decay_with_linear_warmup", + "sequence_mask", + "sequence_pool", + "stft"}; + +std::set StaticBuildBlackList = { + "batch_norm" /*: to handle reserve_space output*/, + "cinn_instruction_run" /*: to handle subgraph infermeta*/, + "cinn_launch" /*: to handle subgraph infermeta*/, + "run_program" /*: to handle scope output*/, + "sparse_sparse_coo_tensor" /*: to handle sparse output*/}; + +namespace paddle { +namespace framework { +namespace interpreter { + +bool BlockCanBeStaticBuilt(const framework::BlockDesc& block) { + // in_black_list = (kernelCode >> 7) & 1 + // is_operator_base = (kernelCode >> 6) & 1 + // is_custom_op = (kernelCode >> 5) & 1 + // use_mkldnn = (kernelCode >> 4) & 1 + // has_fluid_kernel = (kernelCode >> 3) & 1 + // has_structed_kernel = (kernelCode >> 2) & 1 + // need_move_to_phi = (kernelCode >> 1) & 1 + // need_set_dtype = KernelCode & 1 + using KernelCode = int8_t; + std::set> invalid_ops; + for (auto& op : block.AllOps()) { + auto op_type = op->Type(); + const framework::OpInfo& info = OpInfoMap::Instance().Get(op_type); + auto op_base = + info.Creator()(op_type, op->Inputs(), op->Outputs(), op->GetAttrMap()); + + bool in_black_list = StaticBuildBlackList.count(op_type); + bool is_operator_base = + (dynamic_cast(op_base) == nullptr); + bool is_custom_op = + egr::Controller::Instance().GetOpMetaInfoMap().count(op_type); + bool use_mkldnn = false; + if (op->HasAttr("use_mkldnn")) { + Attribute attr = op->GetAttr("use_mkldnn"); + use_mkldnn = attr.index() == 1 ? PADDLE_GET_CONST(int, attr) + : PADDLE_GET_CONST(bool, attr); + } + bool has_fluid_kernel = OperatorWithKernel::AllOpKernels().count(op_type); + bool has_structured_kernel = + phi::KernelFactory::Instance().HasStructuredKernel(op_type); + bool need_move_to_phi = (has_fluid_kernel || has_structured_kernel) && + OpsWithFluidKernelNeedMoveToPhi.count(op_type); + bool need_set_dtype = + OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count(op_type); + + KernelCode kernel_code = + (in_black_list << 7) + (is_operator_base << 6) + (is_custom_op << 5) + + (use_mkldnn << 4) + (has_fluid_kernel << 3) + + (has_structured_kernel << 2) + (need_move_to_phi << 1) + need_set_dtype; + if (!OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) { + if (in_black_list || + (is_operator_base && + !OperatorBasesHandledInStaticBuild.count(op_type)) || + is_custom_op || use_mkldnn || need_move_to_phi || need_set_dtype) { + invalid_ops.insert(std::make_pair(op_type, kernel_code)); + } + } + } + + if (!invalid_ops.empty()) { + std::stringstream ss; + ss << "The following OPs are unable to static build:\n"; + for (auto& item : invalid_ops) { + ss << item.first << " [in_black_list = " << (item.second >> 7 & 1) + << ", is_operator_base = " << (item.second >> 6 & 1) + << ", is_custom_op = " << (item.second >> 5 & 1) + << ", use_mkldnn = " << (item.second >> 4 & 1) + << ", has_fluid_kernel = " << (item.second >> 3 & 1) + << ", has_structed_kerenl = " << (item.second >> 2 & 1) + << ", need_move_to_phi = " << (item.second >> 1 & 1) + << ", need_set_dtype = " << (item.second & 1) << "]\n"; + } + VLOG(1) << ss.str(); + } + + return invalid_ops.empty(); +} + +inline bool IsExtendedTensor(const phi::TensorBase& tensor) { + return framework::RawTensor::classof(&tensor) || + framework::Strings::classof(&tensor) || + 
framework::Vocab::classof(&tensor); +} + +bool TensorShouldBeFakeInitialized(const OperatorBase& op, + const std::string& parameter_name, + const phi::TensorBase* tensor) { + const std::string& op_type = op.Type(); + if (OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) { + return false; + } + + if (op_type == "adam" || op_type == "adamw" || op_type == "merged_adam") { + if (op.Attr("use_global_beta_pow") && + (parameter_name == "Beta1PowOut" || parameter_name == "Beta2PowOut")) { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + } + + if (op_type == "coalesce_tensor" && parameter_name == "Output") { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + + if (op_type == "dgc" && parameter_name == "k") { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + + if (op_type == "fake_quantize_range_abs_max") { + if (op.Attr("is_test") && + (parameter_name == "OutScale" || parameter_name == "OutScales")) { + VLOG(2) << "Skip fake initialization for: " << parameter_name; + return false; + } + } + + if (op_type == "segment_pool" && parameter_name == "SummedIds") { + return op.Attr("pooltype") == "MEAN" && + dynamic_cast(&op) + ->kernel_type() + ->place_ != phi::CPUPlace(); + } + + return tensor && !IsExtendedTensor(*tensor); +} + +phi::TensorBase* GetTensorFormVar(framework::Variable* var) { + if (var) { + if (var->template IsType()) { + return var->template GetMutable(); + } else if (var->template IsType()) { + return var->template GetMutable(); + } else if (var->template IsType()) { + return var->template GetMutable(); + } else if (var->template IsType()) { + return var->template GetMutable(); + } else if (var->template IsType()) { + return var->template GetMutable(); + } else if (var->template IsType()) { + return var->template GetMutable(); + } else if (!var->IsInitialized()) { + // The following is for RAW type of var + return var->template GetMutable(); + } else { + PADDLE_THROW(platform::errors::Unimplemented( + "Unsupported `%s` type when get tensor.", + framework::ToTypeName(var->Type()))); + } + } else { + VLOG(4) << "Var is nullptr"; + return nullptr; + } +} + +template +void FakeInitializeTensor(const platform::DeviceContext& dev_ctx, + const phi::Place& place, + const phi::DataType& dtype, + const phi::DataLayout& layout, + TensorType* tensor) { + PADDLE_ENFORCE_NE(place.GetType(), + phi::AllocationType::UNDEFINED, + phi::errors::InvalidArgument( + "The place %s to fake intialize is not valid.", place)); + PADDLE_ENFORCE_NE(dtype, + phi::DataType::UNDEFINED, + phi::errors::InvalidArgument( + "The dtype %s to fake intialize is not valid.", dtype)); + PADDLE_ENFORCE_NE( + layout, + phi::DataLayout::UNDEFINED, + phi::errors::InvalidArgument( + "The layout %s to fake intialize is not valid.", layout)); + PADDLE_ENFORCE_NOT_NULL( + tensor, + phi::errors::InvalidArgument( + "The tensor to fake intialize should not be null.")); + + if (tensor->initialized() && place == tensor->place() && + dtype == tensor->dtype() && tensor->layout() == layout) { + return; + } + + // set place + if (tensor->initialized()) { // avoid overwriting valid data + platform::DeviceContext* dev_ctx_for_copy; + if (place.GetType() != AllocationType::CPU) { + dev_ctx_for_copy = platform::DeviceContextPool::Instance().Get(place); + } else { + dev_ctx_for_copy = + platform::DeviceContextPool::Instance().Get(tensor->place()); + } + phi::Copy(*dev_ctx_for_copy, *tensor, place, /*blocking=*/true, tensor); + } else { + if 
(place == phi::CPUPlace()) { + dev_ctx.HostAlloc(tensor, + dtype, + /*requested_size=*/0, + /*fake_alloc=*/true); + } else { + PADDLE_ENFORCE_EQ(place, + dev_ctx.GetPlace(), + phi::errors::Unavailable( + "The place %s for fack alloc is not equal to " + "the place %s of DeviceContext.", + place, + dev_ctx.GetPlace())); + dev_ctx.Alloc(tensor, + dtype, + /*requested_size=*/0, + /*pinned=*/false, + /*fake_alloc=*/true); + } + } + + // set dtype and layout + tensor->set_type(dtype); + tensor->set_layout(layout); + + VLOG(4) << "Tensor " << tensor << " fake alloc with type = " << dtype + << ", place = " << place << ", layout = " << layout; +} + +void FakeInitializeTensorBase(const platform::DeviceContext& dev_ctx, + const phi::Place& place, + const phi::DataType& dtype, + const phi::DataLayout& layout, + phi::TensorBase* tensor) { + if (phi::DenseTensor::classof(tensor)) { + FakeInitializeTensor( + dev_ctx, place, dtype, layout, dynamic_cast(tensor)); + } else if (phi::SelectedRows::classof(tensor)) { + FakeInitializeTensor(dev_ctx, + place, + dtype, + layout, + dynamic_cast(tensor)); + } else if (phi::SparseCooTensor::classof(tensor)) { + FakeInitializeTensor(dev_ctx, + place, + dtype, + layout, + dynamic_cast(tensor)); + } else if (phi::SparseCsrTensor::classof(tensor)) { + FakeInitializeTensor(dev_ctx, + place, + dtype, + layout, + dynamic_cast(tensor)); + } else if (phi::TensorArray::classof(tensor)) { + FakeInitializeTensor( + dev_ctx, place, dtype, layout, dynamic_cast(tensor)); + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported `%s` type when fake initialize tensor.", + tensor->type_info().name())); + } +} + +void FakeInitializeOutputsForOperatorBase(const OperatorBase& op, + const phi::Place& place, + Scope* scope) { + const std::string& op_type = op.Type(); + if (OpsCanSkipedFakeAllocInStaticBuild.count(op_type)) { + return; + } + + phi::DeviceContext* dev_ctx = + platform::DeviceContextPool::Instance().Get(place); + + if (op_type == "read") { + const std::string& reader_name = op.Input("Reader"); + framework::ReaderHolder* reader = + GET_DATA_SAFELY(scope->FindVar(reader_name), "Input", "Reader", "Read") + .GetMutable(); + + std::shared_ptr buffered_reader = + std::dynamic_pointer_cast( + reader->Get()); + phi::Place target_place = + buffered_reader ? 
buffered_reader->GetPlace() : phi::CPUPlace(); + + auto& outputs = op.Outputs("Out"); + auto& var_types = reader->VarTypes(); + PADDLE_ENFORCE_EQ( + outputs.size(), + var_types.size(), + phi::errors::Unavailable("The output size of read_op (%d) should equal " + "to the var_types size of ReaderHolder (%d).", + outputs.size(), + var_types.size())); + + for (size_t i = 0; i < outputs.size(); ++i) { + const std::string& parameter_name = outputs[i]; + phi::TensorBase* out_tensor = + GetTensorFormVar(scope->FindVar(parameter_name)); + if (TensorShouldBeFakeInitialized(op, parameter_name, out_tensor)) { + phi::DataType dtype = phi::TransToPhiDataType(var_types[i]); + FakeInitializeTensorBase( + *dev_ctx, target_place, dtype, out_tensor->layout(), out_tensor); + } + } + } else { + PADDLE_THROW( + phi::errors::Unimplemented("Can not static build for op: %s", op_type)); + } +} + +phi::DataType GetInputDType(const RuntimeContext& runtime_ctx, + const std::string parameter_name) { + phi::TensorBase* in_tensor = + GetTensorFormVar(runtime_ctx.inputs.find(parameter_name)->second.at(0)); + return in_tensor->dtype(); +} + +phi::DataType InferDTypeFromAttr(const framework::OperatorBase& op, + const RuntimeContext& runtime_ctx, + const std::string& attr_name) { + int dtype_attr = op.Attr(attr_name); + if (dtype_attr == -1) { // -1 means the dtype is same as intput + return GetInputDType(runtime_ctx, "X"); + } + return phi::TransToPhiDataType(dtype_attr); +} + +phi::DataType InferMPDType(const RuntimeContext& runtime_ctx, + const std::string parameter_name) { + phi::DataType in_dtype = GetInputDType(runtime_ctx, parameter_name); + return (in_dtype == phi::DataType::BFLOAT16 || + in_dtype == phi::DataType::FLOAT16) + ? phi::DataType::FLOAT32 + : in_dtype; +} + +void FakeInitializeOutputsForFunctionKernel( + const framework::OperatorBase& op, + const phi::Kernel& phi_kernel, + const phi::KernelSignature& kernel_sig, + const RuntimeContext& runtime_ctx, + const platform::DeviceContext& dev_ctx) { + std::string op_type = op.Type(); + auto output_names = kernel_sig.output_names; + auto output_defs = phi_kernel.args_def().output_defs(); + PADDLE_ENFORCE_EQ(output_names.size(), + output_defs.size(), + platform::errors::InvalidArgument( + "The size of outputs_args names (%d) must be equal to " + "the size of kernel output_defs (%d).", + output_names.size(), + output_defs.size())); + size_t start_idx = 0; + for (size_t i = 0; i < output_names.size(); ++i) { + const std::string& parameter_name = output_names[i]; + auto it = runtime_ctx.outputs.find(parameter_name); + // Deal with the case that some outputs are not found or be NULL when run + // the kernel. For example : the outputs of matmul_grad are dx and dy, + // sometimes dx or dy may be NULL. 
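Note: when a kernel registers its output dtype as UNDEFINED, the static build pass has to infer it; the helpers added here, InferDTypeFromAttr (a dtype attribute of -1 means "same dtype as input X") and InferMPDType (FP16/BF16 inputs promote to FP32 master outputs), implement the common fallbacks. A standalone sketch of those two rules using a stand-in DataType enum rather than phi types:

#include <cassert>

enum class DataType { FLOAT32, FLOAT16, BFLOAT16, INT64 };

// Rule 1: a dtype attribute of -1 means "same dtype as the input".
DataType InferFromAttr(int dtype_attr, DataType input_dtype) {
  return dtype_attr == -1 ? input_dtype : static_cast<DataType>(dtype_attr);
}

// Rule 2: mixed-precision ops keep FP32 master values for low-precision inputs.
DataType InferMasterDType(DataType input_dtype) {
  return (input_dtype == DataType::FLOAT16 || input_dtype == DataType::BFLOAT16)
             ? DataType::FLOAT32
             : input_dtype;
}

int main() {
  assert(InferFromAttr(-1, DataType::INT64) == DataType::INT64);
  assert(InferMasterDType(DataType::FLOAT16) == DataType::FLOAT32);
}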
+ if (it == runtime_ctx.outputs.end() || it->second.empty()) { + VLOG(4) << "Output " << parameter_name << " not found"; + ++start_idx; + continue; + } + auto& outs_vector = it->second; + for (size_t offset = 0; offset < outs_vector.size(); ++offset) { + phi::TensorBase* out_tensor = GetTensorFormVar(outs_vector[offset]); + if (TensorShouldBeFakeInitialized(op, parameter_name, out_tensor)) { + phi::TensorArgDef& tensor_arg_def = output_defs[i]; + + // analyze place + phi::Backend backend = tensor_arg_def.backend; + if (backend == phi::Backend::UNDEFINED) { + if (op_type == "adam" || op_type == "adamw" || + op_type == "merged_adam") { + phi::TensorBase* beta1_pow = GetTensorFormVar( + runtime_ctx.inputs.find("Beta1Pow")->second.at(0)); + phi::TensorBase* beta2_pow = GetTensorFormVar( + runtime_ctx.inputs.find("Beta2Pow")->second.at(0)); + if (beta1_pow->place() == CPUPlace() && + beta2_pow->place() == CPUPlace()) { + backend = phi::TransToPhiBackend(CPUPlace()); + } else { + backend = phi::TransToPhiBackend(GPUPlace()); + } + } else { + PADDLE_THROW(phi::errors::Unimplemented( + "Unsupported UNDEFINED backend for op: %s, parameter: %s", + op_type, + parameter_name)); + } + } + phi::Place place = backend == phi::Backend::CUSTOM + ? dev_ctx.GetPlace() + : phi::TransToPhiPlace(backend); + + // analyze dtype + phi::DataType dtype = tensor_arg_def.dtype; + if (dtype == DataType::UNDEFINED || + OpsNeedSetOutputDtypeWhenRegisterPhiKernel.count( + std::string(op_type))) { + // Some OP's InferMeta is sensitive to DDim, so we cannot get their + // output dtype from InferMeta + if (op_type == "adam" || op_type == "adamw") { + dtype = InferMPDType(runtime_ctx, "Param"); + } else if (op_type == "arg_min" || op_type == "arg_max" || + op_type == "coalesce_tensor" || op_type == "one_hot_v2") { + dtype = InferDTypeFromAttr(op, runtime_ctx, "dtype"); + } else if (op_type == "bincount" || op_type == "reduce_sum_grad") { + dtype = GetInputDType(runtime_ctx, "X"); + } else if (op_type == "layer_norm") { + dtype = InferMPDType(runtime_ctx, "X"); + } else if (op_type == "reduce_sum") { + int dtype_attr = op.Attr("out_dtype"); + if (dtype_attr != -1) { + dtype = phi::TransToPhiDataType(dtype_attr); + } else { + phi::DataType in_dtype = GetInputDType(runtime_ctx, "X"); + dtype = + (in_dtype == DataType::BOOL || in_dtype == DataType::INT32) + ? 
DataType::INT64 + : in_dtype; + } + } else { + VLOG(4) << "Get dtype result from InferMeta"; + RuntimeInferShapeContext infer_shape_ctx(op, runtime_ctx); + dynamic_cast(&op) + ->Info() + .infer_shape_(&infer_shape_ctx); + dtype = out_tensor->dtype(); // dtype from InferMeta + } + } + + // analyze layout + phi::DataLayout layout = tensor_arg_def.layout; + + FakeInitializeTensorBase(dev_ctx, place, dtype, layout, out_tensor); + } + } + start_idx += outs_vector.size(); + } +} + +void FakeInitializeOutputsForStructureKernel( + const framework::OpKernelType& op_kernel_type, + ExecutionContext* execution_context) { + const framework::OperatorBase& op = execution_context->GetOp(); + if (OpsCanSkipedFakeAllocInStaticBuild.count(op.Type())) { + return; + } + + const VariableNameMap& outputs = op.Outputs(); + for (auto& item : outputs) { + const std::string& parameter_name = item.first; + auto multi_output_var = execution_context->MultiOutputVar(parameter_name); + for (Variable* var : multi_output_var) { + phi::TensorBase* out_tensor = GetTensorFormVar(var); + if (TensorShouldBeFakeInitialized(op, parameter_name, out_tensor)) { + phi::Place place = execution_context->GetPlace(); + phi::DataType dtype = + phi::TransToPhiDataType(op_kernel_type.data_type_); + phi::DataLayout layout = out_tensor->layout(); + FakeInitializeTensorBase(execution_context->device_context(), + place, + dtype, + layout, + out_tensor); + } + } + } +} + +} // namespace interpreter +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpreter/static_build.h b/paddle/fluid/framework/new_executor/interpreter/static_build.h new file mode 100644 index 0000000000000000000000000000000000000000..e070f66b02549418bc78aeb15d99978480ff6851 --- /dev/null +++ b/paddle/fluid/framework/new_executor/interpreter/static_build.h @@ -0,0 +1,45 @@ +// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved. +// +// Licensed under the Apache License, Version 2.0 (the "License"); +// you may not use this file except in compliance with the License. +// You may obtain a copy of the License at +// +// http://www.apache.org/licenses/LICENSE-2.0 +// +// Unless required by applicable law or agreed to in writing, software +// distributed under the License is distributed on an "AS IS" BASIS, +// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +// See the License for the specific language governing permissions and +// limitations under the License. 
+ +#pragma once + +#include "paddle/fluid/framework/operator.h" +#include "paddle/fluid/framework/scope.h" + +extern std::set OperatorBasesMustRunInStaticBuild; + +namespace paddle { +namespace framework { +namespace interpreter { + +bool BlockCanBeStaticBuilt(const framework::BlockDesc& block); + +void FakeInitializeOutputsForOperatorBase(const OperatorBase& op, + const platform::Place& place, + Scope* scope); + +void FakeInitializeOutputsForFunctionKernel( + const framework::OperatorBase& op, + const phi::Kernel& phi_kernel, + const phi::KernelSignature& kernel_sig, + const RuntimeContext& ctx, + const platform::DeviceContext& dev_ctx); + +void FakeInitializeOutputsForStructureKernel( + const framework::OpKernelType& op_kernel_type, + ExecutionContext* execution_context); + +} // namespace interpreter +} // namespace framework +} // namespace paddle diff --git a/paddle/fluid/framework/new_executor/interpretercore.cc b/paddle/fluid/framework/new_executor/interpretercore.cc index bee8e8ca7b795b7a26d737c35f027c779166a146..046edd45b2a89894f918bdf511884263e38864fb 100644 --- a/paddle/fluid/framework/new_executor/interpretercore.cc +++ b/paddle/fluid/framework/new_executor/interpretercore.cc @@ -21,6 +21,7 @@ #include "paddle/fluid/framework/details/nan_inf_utils.h" #include "paddle/fluid/framework/details/share_tensor_buffer_functor.h" #include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h" +#include "paddle/fluid/framework/new_executor/interpreter/static_build.h" #include "paddle/fluid/framework/operator.h" #include "paddle/fluid/platform/device/gpu/gpu_info.h" #include "paddle/fluid/platform/os_info.h" @@ -112,6 +113,8 @@ InterpreterCore::InterpreterCore(const platform::Place& place, VLOG(4) << "InterpreterCore(): " << this << " on " << place_; static_build_ = FLAGS_new_executor_static_build && + !FLAGS_new_executor_use_cuda_graph && + !execution_config.used_for_control_flow_op && interpreter::BlockCanBeStaticBuilt(block); exception_notifier_ = main_thread_blocker_.RegisterEvent(kExceptionCaught); @@ -281,12 +284,12 @@ paddle::framework::FetchList InterpreterCore::Run( SetFeedVarsInplaceSkip(feed_names); // convert vec func_list to graph Convert(&op_func_nodes); - is_build_ = true; UpdateSyncOpNum(); if (static_build_) { VLOG(4) << "RUN impl"; RunImpl(); } + is_build_ = true; } else { RunImpl(); } @@ -597,7 +600,7 @@ void InterpreterCore::BuildOperatorDependences() { // analysis the dependences between ops, add next_instr_list to each instr, // and set the dependecy_count_ size_t instr_num = vec_instruction_.size(); - dependecy_count_.resize(instr_num); + dependecy_count_ = std::vector(instr_num, 0); auto downstream_map = dependency_builder_.Build(vec_instruction_); for (size_t instr_id = 0; instr_id < instr_num; ++instr_id) { @@ -657,6 +660,7 @@ void InterpreterCore::Convert( auto& vec_meta_info = var_scope_.MutableVecMetaInfo(); auto nodes = *op_func_nodes; auto op_nums = nodes.size(); + vec_instruction_.clear(); vec_instruction_.reserve(op_nums); for (size_t op_idx = 0; op_idx < op_nums; ++op_idx) { auto& op_func_node = nodes[op_idx]; @@ -825,8 +829,6 @@ void InterpreterCore::Convert( BuildAndCacheInstructionCtx(&vec_instruction_[i]); } - BuildSkipShareLoDInfo(); - bool inplaced = false; for (const Instruction& inst : vec_instruction_) { if (inst.OpBase()->Type() == "share_buffer" || @@ -867,6 +869,10 @@ void InterpreterCore::BuildSkipShareLoDInfo() { } } } + if (can_skip_lod) { + VLOG(8) << "skip share lod for: " << vec_instruction_[i].OpBase()->Type() + << " (" << 
i << ")"; + } vec_instruction_[i].InnerInferShapeContext()->SetSkipLoD(can_skip_lod); } } @@ -1060,6 +1066,7 @@ void InterpreterCore::ExecuteInstructionList( // EOF is not a fatal error. if (exception_holder_.Type() != "EOF") { async_work_queue_->Cancel(); + async_work_queue_.reset(); } VLOG(4) << "Cancel ok"; PADDLE_ENFORCE_EQ( @@ -1297,11 +1304,12 @@ void InterpreterCore::Prepare(const std::vector& feed_names, // convert vec func_list to graph Convert(&op_func_nodes); UpdateSyncOpNum(); - is_build_ = true; if (static_build_) { VLOG(4) << "RUN impl"; RunImpl(); } + BuildSkipShareLoDInfo(); + is_build_ = true; } // NOTE: Because feed_tensor will be GC after // paddle::framework::BuildOpFuncList, so we should diff --git a/paddle/fluid/framework/operator.cc b/paddle/fluid/framework/operator.cc index 62d91f5da8b58a57aa4c52923e5ce24747518f57..7fdc830ab268bc0e0080c4d32fccb2b24f101397 100644 --- a/paddle/fluid/framework/operator.cc +++ b/paddle/fluid/framework/operator.cc @@ -2753,13 +2753,6 @@ void OperatorWithKernel::ParseInputDataType( t = &(var->Get().value()); } else if (var->IsType()) { const phi::SparseCooTensor* sp_t = &(var->Get()); - PADDLE_ENFORCE_EQ( - sp_t->initialized(), - true, - platform::errors::InvalidArgument("The %s Op's Input Variable `%s` " - "contains uninitialized Tensor.", - Type(), - name)); *data_type = paddle::framework::TransToProtoVarType(sp_t->dtype()); return; } else if (var->IsType()) { diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc index 6eacc496db70d58d9cdeeded298abd9fc66efded..f2f47c5ffb0ffd03c0d6cae10684538c686a5a54 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.cc @@ -37,8 +37,23 @@ REGISTER_OP_WITHOUT_GRADIENT(c_sync_calc_stream, ops::CSyncCalcStreamOp, ops::CSyncCalcStreamOpMaker); -REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); - -REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); - -REGISTER_OP_MLU_KERNEL(c_sync_calc_stream, ops::CSyncCalcStreamKernel); +REGISTER_OP_CUDA_KERNEL(c_sync_calc_stream, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel); + +REGISTER_OP_NPU_KERNEL(c_sync_calc_stream, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel); + +REGISTER_OP_MLU_KERNEL(c_sync_calc_stream, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel, + ops::CSyncCalcStreamKernel); diff --git a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h index da3fdd345393473918b69f819dec4428fafc1135..369adc019c231d7a603ec210853871cd4e54780b 100644 --- a/paddle/fluid/operators/collective/c_sync_calc_stream_op.h +++ b/paddle/fluid/operators/collective/c_sync_calc_stream_op.h @@ -30,7 +30,8 @@ class CSyncCalcStreamOp : public framework::OperatorWithKernel { protected: phi::KernelKey GetExpectedKernelType( const framework::ExecutionContext& ctx) const override { - return phi::KernelKey(framework::proto::VarType::FP32, ctx.GetPlace()); + return phi::KernelKey(OperatorWithKernel::IndicateVarDataType(ctx, "X"), + ctx.GetPlace()); } }; diff --git a/paddle/fluid/operators/concat_op.cc b/paddle/fluid/operators/concat_op.cc index 
979e7a529b79ec7d55b50dd443e0177f3b91bb1d..1f5618b498ffa8c31144d38c9351f0803bb76a65 100644 --- a/paddle/fluid/operators/concat_op.cc +++ b/paddle/fluid/operators/concat_op.cc @@ -41,7 +41,7 @@ class ConcatOp : public framework::OperatorWithKernel { auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto *input : inputs) { - if (input->IsInitialized() && input->numel() > 0) { + if (input->IsInitialized()) { input_data_type = framework::TransToProtoVarType(input->dtype()); flag = 1; break; diff --git a/paddle/fluid/operators/partial_concat_op.cc b/paddle/fluid/operators/partial_concat_op.cc index a8a7d82e4627a5011ef9dffdc13ed7e0294eb07c..1fb9dceb4150c035a5d6d84b0e09b5eb852943aa 100644 --- a/paddle/fluid/operators/partial_concat_op.cc +++ b/paddle/fluid/operators/partial_concat_op.cc @@ -95,7 +95,7 @@ class PartialConcatOp : public framework::OperatorWithKernel { auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto *input : inputs) { - if (input->IsInitialized() && input->numel() > 0) { + if (input->IsInitialized()) { input_data_type = framework::TransToProtoVarType(input->dtype()); flag = 1; break; diff --git a/paddle/fluid/operators/partial_sum_op.cc b/paddle/fluid/operators/partial_sum_op.cc index a2255d8e07abf886158615916925044e875f52ee..9ef7ac0a21a4813d71658f3c0a63a03eba37810d 100644 --- a/paddle/fluid/operators/partial_sum_op.cc +++ b/paddle/fluid/operators/partial_sum_op.cc @@ -97,7 +97,7 @@ class PartialSumOp : public framework::OperatorWithKernel { auto input_data_type = framework::proto::VarType::Type(0); bool flag = 0; for (auto *input : inputs) { - if (input->IsInitialized() && input->numel() > 0) { + if (input->IsInitialized()) { input_data_type = framework::TransToProtoVarType(input->dtype()); flag = 1; break; diff --git a/paddle/fluid/operators/reader/buffered_reader.h b/paddle/fluid/operators/reader/buffered_reader.h index 9bec93e635ef4a0c3ee41f78c8baf3f485d74b76..442330b8caa6f47afea4ad35661059501e77777f 100644 --- a/paddle/fluid/operators/reader/buffered_reader.h +++ b/paddle/fluid/operators/reader/buffered_reader.h @@ -55,6 +55,8 @@ class BufferedReader : public framework::DecoratedReader { ~BufferedReader() override; + platform::Place GetPlace() const { return place_; } + private: void ReadTillBufferFullAsync(); diff --git a/paddle/fluid/operators/sum_op.cc b/paddle/fluid/operators/sum_op.cc index a4902a85fcba7b5ae173b173d8fe75cada079f44..588521c400329f403ea439c9415f5cf14416154a 100644 --- a/paddle/fluid/operators/sum_op.cc +++ b/paddle/fluid/operators/sum_op.cc @@ -54,7 +54,7 @@ class SumOp : public framework::OperatorWithKernel { x_vars_name[idx])); auto tensor = framework::GetLoDTensorOrSelectedRowsValueFromVar(*x_vars[idx]); - if (tensor->numel() <= 0 || (!tensor->IsInitialized())) { + if (!tensor->IsInitialized()) { continue; } if (dtype == -1) { diff --git a/paddle/fluid/operators/transfer_layout_op.cc b/paddle/fluid/operators/transfer_layout_op.cc index a197546b357bb11b5e8a2a8aa051dd1c65451ee4..01514ed4d1075fb050296f167dd42b2d49888b6f 100644 --- a/paddle/fluid/operators/transfer_layout_op.cc +++ b/paddle/fluid/operators/transfer_layout_op.cc @@ -57,9 +57,9 @@ class TransferLayoutOp : public framework::OperatorWithKernel { } auto place = in_tensor->IsInitialized() ? in_tensor->place() : platform::CPUPlace(); - - // dtype is not important - return phi::KernelKey(framework::proto::VarType::FP32, place); + phi::DataType dtype = in_tensor->IsInitialized() ? 
in_tensor->dtype() + : phi::DataType::FLOAT32; + return phi::KernelKey(phi::TransToProtoVarType(dtype), place); } phi::KernelKey GetKernelTypeForVar( diff --git a/paddle/phi/core/device_context.cc b/paddle/phi/core/device_context.cc index 97ef11c37fe96e3b2aa72cbcce5d5cac4daca032..5c8fc75ff0ee25a072d15a3a81b21a7436b033c3 100644 --- a/paddle/phi/core/device_context.cc +++ b/paddle/phi/core/device_context.cc @@ -146,15 +146,26 @@ struct DeviceContext::Impl { // NOTE(paddle-dev): In case of tensor has already hold allocation and // is going to allocate allocation on new place, we will clear its holder // firstly and then re-alloc it. - if (tensor->initialized() && tensor->place() != place) { - ClearHolder(tensor); + if (phi::DenseTensor::classof(tensor)) { + // NOTE(Ruibiao): The tensor hold zero-size allocation is not regarded as + // `initialized`. Fix other tensor class when needed. + if (static_cast(tensor)->Holder() && + tensor->place() != place) { + ClearHolder(tensor); + } + } else { + if (tensor->initialized() && tensor->place() != place) { + ClearHolder(tensor); + } } + auto* allocator = - (tensor->numel() == 0 || fake_alloc) && requested_size == 0 + (fake_alloc || tensor->numel() == 0) && requested_size == 0 ? zero_allocator_ : (pinned ? pinned_allocator_ : device_allocator_); #ifdef PADDLE_WITH_CUDA - bool must_cuda_graph_allocator = (tensor->numel() != 0) && !pinned; + bool must_cuda_graph_allocator = + (!fake_alloc && tensor->numel() != 0) && !pinned; if (must_cuda_graph_allocator && place.GetType() == phi::AllocationType::GPU && phi::backends::gpu::CUDAGraph::IsThisThreadCapturing()) { @@ -189,11 +200,22 @@ struct DeviceContext::Impl { if (dtype == DataType::UNDEFINED) { dtype = tensor->dtype(); } - if (tensor->initialized() && tensor->place() != CPUPlace()) { - ClearHolder(tensor); + + if (phi::DenseTensor::classof(tensor)) { + // NOTE(Ruibiao): The tensor holds zero-size allocation is not regarded as + // `initialized`. Fix other tensor class when needed. + if (static_cast(tensor)->Holder() && + tensor->place() != CPUPlace()) { + ClearHolder(tensor); + } + } else { + if (tensor->initialized() && tensor->place() != CPUPlace()) { + ClearHolder(tensor); + } } + auto* allocator = - (tensor->numel() == 0 || fake_alloc) && requested_size == 0 + (fake_alloc || tensor->numel() == 0) && requested_size == 0 ? host_zero_allocator_ : host_allocator_; return tensor->AllocateFrom( @@ -246,8 +268,6 @@ struct DeviceContext::Impl { private: void ClearHolder(TensorBase* tensor) const { - if (!tensor->initialized()) return; - if (DenseTensor::classof(tensor)) { static_cast(tensor)->clear(); } else if (SelectedRows::classof(tensor)) { diff --git a/paddle/phi/core/selected_rows.h b/paddle/phi/core/selected_rows.h index 08d02bee40d25be9ce8cf029cfe5c3d730960da9..aa528969fbf8f2105353213221429103bf195f01 100644 --- a/paddle/phi/core/selected_rows.h +++ b/paddle/phi/core/selected_rows.h @@ -139,10 +139,14 @@ class SelectedRows : public TensorBase, /// \return The data type of the tensor. DataType dtype() const noexcept override { return impl_->dtype(); } + void set_type(const DataType dtype) { impl_->set_type(dtype); } + /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. DataLayout layout() const noexcept override { return impl_->layout(); } + void set_layout(const DataLayout layout) { impl_->set_layout(layout); } + /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. 
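Note: the set_type/set_layout mutators added to SelectedRows (and to the sparse tensors and TensorArray below) exist because fake initialization allocates a zero-size buffer first and then stamps the dtype and layout the real kernel will produce onto the tensor. A toy illustration of that fix-up order; Tensor here is a stand-in struct, not a phi type.

#include <cstddef>
#include <string>

struct Tensor {
  std::string dtype = "undefined";
  std::string layout = "undefined";
  std::size_t bytes = 0;
  void set_type(const std::string& t) { dtype = t; }
  void set_layout(const std::string& l) { layout = l; }
};

int main() {
  Tensor out;
  out.bytes = 0;             // fake (zero-size) allocation
  out.set_type("float32");   // metadata stamped after the fake alloc
  out.set_layout("NCHW");
}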
const Place& place() const override { return impl_->place(); }; diff --git a/paddle/phi/core/selected_rows_impl.h b/paddle/phi/core/selected_rows_impl.h index d4a42a9653b87eef4e2b0d8ef60914f95acd423b..a1864ad3aa657f57053e7a800c4e9c868a1a42ad 100644 --- a/paddle/phi/core/selected_rows_impl.h +++ b/paddle/phi/core/selected_rows_impl.h @@ -159,10 +159,14 @@ class SelectedRowsImpl { /// \return The data type of the tensor. DataType dtype() const noexcept { return value_->dtype(); } + void set_type(const DataType dtype) { value_->set_type(dtype); } + /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. DataLayout layout() const noexcept { return value_->layout(); } + void set_layout(const DataLayout layout) { value_->set_layout(layout); } + /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. const Place& place() const { return value_->place(); } diff --git a/paddle/phi/core/sparse_coo_tensor.h b/paddle/phi/core/sparse_coo_tensor.h index 13fc7d444b4242b386b0ab25e6197f3f45028174..542f4e8627758a6cb0c090ae1a50d4dbf4423d30 100644 --- a/paddle/phi/core/sparse_coo_tensor.h +++ b/paddle/phi/core/sparse_coo_tensor.h @@ -104,11 +104,14 @@ class SparseCooTensor : public TensorBase, /// \brief Returns the data type of the tensor. /// \return The data type of the tensor. DataType dtype() const noexcept override { return meta_.dtype; } + void set_type(const DataType dtype) { meta_.dtype = dtype; } /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. DataLayout layout() const noexcept override { return meta_.layout; } + void set_layout(const DataLayout layout) { meta_.layout = layout; } + /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. const Place& place() const override { return non_zero_elements_.place(); } diff --git a/paddle/phi/core/sparse_csr_tensor.h b/paddle/phi/core/sparse_csr_tensor.h index 4d607188d2ebbe150f30feb5db74687a327b3b26..ec9dd7ab7907ebb75570eb971f2503940f8255d4 100644 --- a/paddle/phi/core/sparse_csr_tensor.h +++ b/paddle/phi/core/sparse_csr_tensor.h @@ -110,10 +110,14 @@ class SparseCsrTensor : public TensorBase, /// \return The data type of the tensor. DataType dtype() const noexcept override { return meta_.dtype; } + void set_type(const DataType dtype) { meta_.dtype = dtype; } + /// \brief Returns the data layout of the tensor. /// \return The data layout of the tensor. DataLayout layout() const noexcept override { return meta_.layout; } + void set_layout(const DataLayout layout) { meta_.layout = layout; } + /// \brief Returns the data place of the tensor. /// \return The data place of the tensor. const Place& place() const override { return non_zero_elements_.place(); } diff --git a/paddle/phi/core/tensor_array.cc b/paddle/phi/core/tensor_array.cc index 43089d952542f2ba2390085fe59e6f8f3b0f0671..e774bd0da448a76512bcba2db706d8aa3fe03bd9 100644 --- a/paddle/phi/core/tensor_array.cc +++ b/paddle/phi/core/tensor_array.cc @@ -23,8 +23,12 @@ TensorArray::TensorArray(const std::vector& vec) { /// \brief Test whether the tensor's storage in TensorArray is allocated. /// return Whether all tensors in TensorArray is allocated. 
bool TensorArray::initialized() const { + if (tensors_.empty()) { + return false; + } + for (auto tensor : tensors_) { - if (!tensor.IsInitialized()) { + if (!tensor.initialized()) { return false; } } @@ -42,18 +46,69 @@ const DDim& TensorArray::dims() const { } const Place& TensorArray::place() const { - PADDLE_THROW(errors::Unavailable("place() can't be used in TensorArray")); - return tensors_[0].place(); + PADDLE_ENFORCE_NE( + tensors_.size(), 0, errors::Unavailable("TensorArray is not assigned.")); + + const Place& place = tensors_[0].place(); + for (size_t i = 1; i < tensors_.size(); ++i) { + PADDLE_ENFORCE_EQ( + tensors_[i].place(), + place, + errors::Unavailable( + "The Place of all tensors in TensorArray must be consistent. The " + "current place is %s, but the previous place is %s.", + tensors_[i].place(), + place)); + } + return place; } DataType TensorArray::dtype() const { - PADDLE_THROW(errors::Unavailable("dtype() can't be used in TensorArray")); - return DataType::UNDEFINED; + PADDLE_ENFORCE_NE( + tensors_.size(), 0, errors::Unavailable("TensorArray is not assigned.")); + + const DataType dtype = tensors_[0].dtype(); + for (size_t i = 1; i < tensors_.size(); ++i) { + PADDLE_ENFORCE_EQ( + tensors_[i].dtype(), + dtype, + errors::Unavailable( + "The DataType of all tensors in TensorArray must be consistent. " + "The current dtype is %s, but the previous dtype is %s.", + tensors_[i].dtype(), + dtype)); + } + return dtype; +} + +void TensorArray::set_type(const DataType dtype) { + for (size_t i = 0; i < tensors_.size(); ++i) { + tensors_[i].set_type(dtype); + } } DataLayout TensorArray::layout() const { - PADDLE_THROW(errors::Unavailable("layout() can't be used in TensorArray")); - return DataLayout::UNDEFINED; + PADDLE_ENFORCE_NE( + tensors_.size(), 0, errors::Unavailable("TensorArray is not assigned.")); + + const DataLayout layout = tensors_[0].layout(); + for (size_t i = 1; i < tensors_.size(); ++i) { + PADDLE_ENFORCE_EQ( + tensors_[i].layout(), + layout, + errors::Unavailable( + "The DataLayout of all tensors in TensorArray must be consistent. " + "The current layout is %s, but the previous layout is %s.", + tensors_[i].layout(), + layout)); + } + return layout; +} + +void TensorArray::set_layout(DataLayout layout) { + for (size_t i = 0; i < tensors_.size(); ++i) { + tensors_[i].set_layout(layout); + } } bool TensorArray::valid() const { diff --git a/paddle/phi/core/tensor_array.h b/paddle/phi/core/tensor_array.h index 14679429ea7f0b58edb9cdd69365d82034ff8628..4fd8fe1df5e1f43840764ef0b3ad06f5dbb171f5 100644 --- a/paddle/phi/core/tensor_array.h +++ b/paddle/phi/core/tensor_array.h @@ -63,12 +63,14 @@ class TensorArray : public TensorBase, /// \brief This overrided function is not used in TensorArray. const Place& place() const override; - /// \brief This overrided function is not used in TensorArray. DataType dtype() const override; - /// \brief This overrided function is not used in TensorArray. + void set_type(const DataType dtype); + DataLayout layout() const override; + void set_layout(const DataLayout layout); + /// \brief This overrided function is not used in TensorArray. 
bool valid() const override; diff --git a/paddle/phi/core/tensor_utils.cc b/paddle/phi/core/tensor_utils.cc index 7876aa0437d3ca7e39aee9c27e4503451139743f..4fd11df211f9b7b8c45cf9d7623fff30d7367acf 100644 --- a/paddle/phi/core/tensor_utils.cc +++ b/paddle/phi/core/tensor_utils.cc @@ -316,6 +316,16 @@ void Copy(const Context& dev_ctx, dst->set_dims(src.dims()); } +template <typename Context> +void Copy(const Context& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst) { + // NOTE(Ruibiao): Implement Copy() for TensorArray when needed. + PADDLE_THROW(errors::Unimplemented("Copy for TensorArray is unimplemented.")); +} + template void Copy(const CPUContext& dev_ctx, const DenseTensor& src, Place dst_place, @@ -363,6 +373,18 @@ template void Copy(const DeviceContext& dev_ctx, bool blocking, SparseCsrTensor* dst); +template void Copy(const CPUContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); + +template void Copy(const DeviceContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); + #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) template void Copy(const GPUContext& dev_ctx, const DenseTensor& src, @@ -384,6 +406,11 @@ template void Copy(const GPUContext& dev_ctx, Place dst_place, bool blocking, SparseCsrTensor* dst); +template void Copy(const GPUContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); #endif #ifdef PADDLE_WITH_XPU @@ -392,6 +419,11 @@ template void Copy(const XPUContext& dev_ctx, Place dst_place, bool blocking, DenseTensor* dst); +template void Copy(const XPUContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); #endif #ifdef PADDLE_WITH_CUSTOM_DEVICE @@ -400,6 +432,11 @@ template void Copy(const CustomContext& dev_ctx, Place dst_place, bool blocking, DenseTensor* dst); +template void Copy(const CustomContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); #endif #ifdef PADDLE_WITH_MKLDNN @@ -408,6 +445,11 @@ template void Copy(const OneDNNContext& dev_ctx, Place dst_place, bool blocking, DenseTensor* dst); +template void Copy(const OneDNNContext& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); #endif template diff --git a/paddle/phi/core/tensor_utils.h b/paddle/phi/core/tensor_utils.h index 56a4f933ceb44f03798f7834afe8248a2ea8e5f8..e2679ffb206a8b31f02e3a86226a545e0c9f43d9 100644 --- a/paddle/phi/core/tensor_utils.h +++ b/paddle/phi/core/tensor_utils.h @@ -19,6 +19,7 @@ limitations under the License.
*/ #include "paddle/phi/core/selected_rows.h" #include "paddle/phi/core/sparse_coo_tensor.h" #include "paddle/phi/core/sparse_csr_tensor.h" +#include "paddle/phi/core/tensor_array.h" #include "paddle/phi/core/tensor_meta.h" namespace phi { @@ -109,6 +110,13 @@ void Copy(const Context& dev_ctx, bool blocking, SparseCsrTensor* dst); +template +void Copy(const Context& dev_ctx, + const TensorArray& src, + Place dst_place, + bool blocking, + TensorArray* dst); + template void TensorFromVector(const std::vector& src, const phi::DeviceContext& ctx, diff --git a/paddle/phi/kernels/coalesce_tensor_kernel.cc b/paddle/phi/kernels/coalesce_tensor_kernel.cc index 5e801769a55aa6ad46f3d277f476a334b7869180..5b9c1b4a3de21d952631292f826e8d6540b691c5 100644 --- a/paddle/phi/kernels/coalesce_tensor_kernel.cc +++ b/paddle/phi/kernels/coalesce_tensor_kernel.cc @@ -79,7 +79,7 @@ void GetMemSizeAndDtype(const std::vector &lod_tensors, size_of_dtype : static_cast(size); const void *ptr = - lod_tensors[i]->IsInitialized() ? lod_tensors[i]->data() : nullptr; + lod_tensors[i]->initialized() ? lod_tensors[i]->data() : nullptr; VLOG(4) << size << " " << len; ss << "input(" << i << "-th tensor) dim:(" << lod_tensors[i]->dims() << ") " << " addres:" << ptr << " len: " << len << ", "; @@ -127,7 +127,7 @@ void CoalesceTensorKernel(const Context &dev_ctx, output[i], errors::InvalidArgument("The %d-th output tensor cannot be nullptr.", i)); - if (!input[i]->IsInitialized()) { + if (!input[i]->initialized()) { has_not_init_in_vars = true; } } @@ -142,7 +142,7 @@ void CoalesceTensorKernel(const Context &dev_ctx, for (size_t i = 0; i < input.size(); ++i) { phi::DDim dims(concated_shapes.data() + accumulated_ranks, concated_ranks[i]); - if (!input[i]->IsInitialized()) { + if (!input[i]->initialized()) { PADDLE_ENFORCE_EQ( input[i], output[i], @@ -220,7 +220,7 @@ void CoalesceTensorKernel(const Context &dev_ctx, auto sub_tensor = fused_output->Slice(static_cast(offset), static_cast(offset + len)); // some var may not persistable, or persistable var may not init - if (output[i]->IsInitialized()) { + if (output[i]->initialized()) { phi::Copy(dev_ctx, *output[i], dev_ctx.GetPlace(), false, &sub_tensor); } offset += use_align @@ -270,7 +270,9 @@ PD_REGISTER_KERNEL(coalesce_tensor, phi::CoalesceTensorKernel, int, float, - double) {} + double) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); +} #if defined(PADDLE_WITH_CUDA) || defined(PADDLE_WITH_HIP) PD_REGISTER_KERNEL(coalesce_tensor, @@ -282,6 +284,7 @@ PD_REGISTER_KERNEL(coalesce_tensor, float, double) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); } #endif @@ -295,5 +298,6 @@ PD_REGISTER_KERNEL(coalesce_tensor, float, double) { kernel->InputAt(0).SetBackend(phi::Backend::ALL_BACKEND); + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); } #endif diff --git a/paddle/phi/kernels/cpu/abs_kernel.cc b/paddle/phi/kernels/cpu/abs_kernel.cc index a10e0eed64aec894e8639bbcff54f4723295adc5..34e84c60cc1ff26204fb4d9b4601aa69a683413d 100644 --- a/paddle/phi/kernels/cpu/abs_kernel.cc +++ b/paddle/phi/kernels/cpu/abs_kernel.cc @@ -46,4 +46,6 @@ PD_REGISTER_KERNEL(abs, int, int64_t, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/cpu/accuracy_kernel.cc b/paddle/phi/kernels/cpu/accuracy_kernel.cc index 
4f39d28816ae3d99a7431aa0c146b05db9c66ecc..d426e98fbc5458a52beda733d24c2b8caf5031e5 100644 --- a/paddle/phi/kernels/cpu/accuracy_kernel.cc +++ b/paddle/phi/kernels/cpu/accuracy_kernel.cc @@ -97,6 +97,6 @@ PD_REGISTER_KERNEL( kernel->InputAt(1).SetDataType(phi::DataType::INT64); kernel->InputAt(2).SetDataType(phi::DataType::INT64); kernel->OutputAt(0).SetDataType(phi::DataType::FLOAT32); - kernel->OutputAt(1).SetDataType(phi::DataType::INT64); - kernel->OutputAt(2).SetDataType(phi::DataType::INT64); + kernel->OutputAt(1).SetDataType(phi::DataType::INT32); + kernel->OutputAt(2).SetDataType(phi::DataType::INT32); } diff --git a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc index 7c487287bb256841e59175a2755ab2a7d0bb54e3..20dfd2faff8a42549530b74e7e30841215a451b2 100644 --- a/paddle/phi/kernels/cpu/arg_min_max_kernel.cc +++ b/paddle/phi/kernels/cpu/arg_min_max_kernel.cc @@ -201,7 +201,9 @@ PD_REGISTER_KERNEL(argmin, int32_t, int64_t, int16_t, - uint8_t) {} + uint8_t) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} PD_REGISTER_KERNEL(argmax, CPU, @@ -212,4 +214,6 @@ PD_REGISTER_KERNEL(argmax, int32_t, int64_t, int16_t, - uint8_t) {} + uint8_t) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/cpu/as_complex_kernel.cc b/paddle/phi/kernels/cpu/as_complex_kernel.cc index 8166a548aa4d26cd8a3fec8f4a120dd156e3905c..9ffdbe5a0e5fe8340b6d3656a22521cbda1e1932 100644 --- a/paddle/phi/kernels/cpu/as_complex_kernel.cc +++ b/paddle/phi/kernels/cpu/as_complex_kernel.cc @@ -15,8 +15,11 @@ #include "paddle/phi/kernels/as_complex_kernel.h" #include "paddle/phi/backends/cpu/cpu_context.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/as_complex_impl.h" PD_REGISTER_KERNEL( - as_complex, CPU, ALL_LAYOUT, phi::AsComplexKernel, float, double) {} + as_complex, CPU, ALL_LAYOUT, phi::AsComplexKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc index 14eb38d5b99b6e3870f54f334e034161e2a1f472..1a9921aecc985af664a46fc0bfb3155b484b98cd 100644 --- a/paddle/phi/kernels/cpu/average_accumulates_kernel.cc +++ b/paddle/phi/kernels/cpu/average_accumulates_kernel.cc @@ -53,4 +53,8 @@ PD_REGISTER_KERNEL(average_accumulates, ALL_LAYOUT, phi::AverageAccumulatesKernel, float, - double) {} + double) { + kernel->OutputAt(3).SetDataType(phi::DataType::INT64); + kernel->OutputAt(4).SetDataType(phi::DataType::INT64); + kernel->OutputAt(5).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/cpu/eig_kernel.cc b/paddle/phi/kernels/cpu/eig_kernel.cc index 0ef1e19965093d93f992bc1f2afa99d4a9a9eae5..b53b0c153e5410ef2a6defce1277889565998e49 100644 --- a/paddle/phi/kernels/cpu/eig_kernel.cc +++ b/paddle/phi/kernels/cpu/eig_kernel.cc @@ -105,6 +105,7 @@ PD_REGISTER_KERNEL(eig, double, phi::dtype::complex, phi::dtype::complex) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + const phi::DataType& out_dtype = phi::dtype::ToComplex(kernel_key.dtype()); + kernel->OutputAt(0).SetDataType(out_dtype); + kernel->OutputAt(1).SetDataType(out_dtype); } diff --git a/paddle/phi/kernels/cpu/eigvals_kernel.cc b/paddle/phi/kernels/cpu/eigvals_kernel.cc index 
38864ee9a10d49fbb2baf918f8ce8981ccdc5a9a..555dbfb71dfb775f1ff6b1ce2e2db98e6d271eb7 100644 --- a/paddle/phi/kernels/cpu/eigvals_kernel.cc +++ b/paddle/phi/kernels/cpu/eigvals_kernel.cc @@ -258,5 +258,5 @@ PD_REGISTER_KERNEL(eigvals, double, phi::dtype::complex, phi::dtype::complex) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); } diff --git a/paddle/phi/kernels/cpu/fft_grad_kernel.cc b/paddle/phi/kernels/cpu/fft_grad_kernel.cc index aecaf6c5c13f8f78e99bbde895be6e219a201bf0..a9e017ac794e5ba2ab18b41fd3cb4c9ecd67065f 100644 --- a/paddle/phi/kernels/cpu/fft_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/fft_grad_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/fft_grad_kernel.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fft_grad_kernel_impl.h" @@ -23,10 +24,14 @@ PD_REGISTER_KERNEL(fft_c2c_grad, phi::dtype::complex, phi::dtype::complex) {} PD_REGISTER_KERNEL( - fft_c2r_grad, CPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) {} + fft_c2r_grad, CPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} PD_REGISTER_KERNEL(fft_r2c_grad, CPU, ALL_LAYOUT, phi::FFTR2CGradKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/cpu/fft_kernel.cc b/paddle/phi/kernels/cpu/fft_kernel.cc index 4d64119206f6370f32ffeeb5742283bb01a5ac81..781490422371ffe835b93e46fbff69d764b9e0b9 100644 --- a/paddle/phi/kernels/cpu/fft_kernel.cc +++ b/paddle/phi/kernels/cpu/fft_kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/fft_kernel.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fft_kernel_impl.h" @@ -28,8 +29,8 @@ PD_REGISTER_KERNEL(fft_c2r, phi::FFTC2RKernel, phi::dtype::complex, phi::dtype::complex) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } PD_REGISTER_KERNEL(fft_r2c, CPU, ALL_LAYOUT, phi::FFTR2CKernel, float, double) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); } diff --git a/paddle/phi/kernels/cpu/layer_norm_kernel.cc b/paddle/phi/kernels/cpu/layer_norm_kernel.cc index 4a2cfc5de36e23d68370ed4b7b224ac79c2b93d2..1c82866f0bbda06ed35a8e9390c80c3d6305015d 100644 --- a/paddle/phi/kernels/cpu/layer_norm_kernel.cc +++ b/paddle/phi/kernels/cpu/layer_norm_kernel.cc @@ -141,4 +141,7 @@ void LayerNormKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL( - layer_norm, CPU, ALL_LAYOUT, phi::LayerNormKernel, float, double) {} + layer_norm, CPU, ALL_LAYOUT, phi::LayerNormKernel, float, double) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc index b261e610d2073f3713906b6e2c0c6a09be76e592..8a5e3812950ece26090980e85d1649a93b98e71a 100644 --- a/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/cpu/reduce_sum_grad_kernel.cc @@ -54,4 +54,6 @@ PD_REGISTER_KERNEL(sum_grad, int, int64_t, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/gpu/abs_kernel.cu b/paddle/phi/kernels/gpu/abs_kernel.cu index 9f27c986166f478582ba1c39a18253aed2537da9..8f55f49daf3de3c3d0804e11dfb3e0abbf31cdf7 100644 --- a/paddle/phi/kernels/gpu/abs_kernel.cu +++ b/paddle/phi/kernels/gpu/abs_kernel.cu @@ -76,4 +76,6 @@ PD_REGISTER_KERNEL(abs, phi::dtype::float16, phi::dtype::bfloat16, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->InputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/accuracy_kernel.cu b/paddle/phi/kernels/gpu/accuracy_kernel.cu index a42d131a48d69d71c70305b453186d859edae796..c12afe94b0bdf5961a728ec372ad399c15d5f653 100644 --- a/paddle/phi/kernels/gpu/accuracy_kernel.cu +++ b/paddle/phi/kernels/gpu/accuracy_kernel.cu @@ -144,6 +144,6 @@ PD_REGISTER_KERNEL(accuracy, double) { kernel->InputAt(1).SetDataType(phi::DataType::INT64); kernel->InputAt(2).SetDataType(phi::DataType::INT64); - kernel->OutputAt(1).SetDataType(phi::DataType::INT64); - kernel->OutputAt(2).SetDataType(phi::DataType::INT64); + kernel->OutputAt(1).SetDataType(phi::DataType::INT32); + kernel->OutputAt(2).SetDataType(phi::DataType::INT32); } diff --git a/paddle/phi/kernels/gpu/as_complex_kernel.cu b/paddle/phi/kernels/gpu/as_complex_kernel.cu index 5f2bfaaea54ce1206e3623540f779187202682e1..a376d3a9a5415b3d655c565890cdd3c5f540e2cf 100644 --- a/paddle/phi/kernels/gpu/as_complex_kernel.cu +++ b/paddle/phi/kernels/gpu/as_complex_kernel.cu @@ -15,8 +15,11 @@ #include "paddle/phi/kernels/as_complex_kernel.h" #include "paddle/phi/backends/gpu/gpu_context.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include 
"paddle/phi/kernels/impl/as_complex_impl.h" PD_REGISTER_KERNEL( - as_complex, GPU, ALL_LAYOUT, phi::AsComplexKernel, float, double) {} + as_complex, GPU, ALL_LAYOUT, phi::AsComplexKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu index 7af4430427997432c06553db172343c492f5c2a6..ec2c8d3fdb330d59e2afd5c47c6694a1ed641d0e 100644 --- a/paddle/phi/kernels/gpu/average_accumulates_kernel.cu +++ b/paddle/phi/kernels/gpu/average_accumulates_kernel.cu @@ -97,4 +97,8 @@ PD_REGISTER_KERNEL(average_accumulates, ALL_LAYOUT, phi::AverageAccumulatesKernel, float, - double) {} + double) { + kernel->OutputAt(3).SetDataType(phi::DataType::INT64); + kernel->OutputAt(4).SetDataType(phi::DataType::INT64); + kernel->OutputAt(5).SetDataType(phi::DataType::INT64); +} diff --git a/paddle/phi/kernels/gpu/fft_grad_kernel.cu b/paddle/phi/kernels/gpu/fft_grad_kernel.cu index 69a95cffc3ee0e5ba71c4d96f35ad1b91ae6ccf6..d5f86292899c337e5c0a34639e9e2ebb1a0e48ce 100644 --- a/paddle/phi/kernels/gpu/fft_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/fft_grad_kernel.cu @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/phi/kernels/fft_grad_kernel.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fft_grad_kernel_impl.h" @@ -23,10 +24,14 @@ PD_REGISTER_KERNEL(fft_c2c_grad, phi::dtype::complex, phi::dtype::complex) {} PD_REGISTER_KERNEL( - fft_c2r_grad, GPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) {} + fft_c2r_grad, GPU, ALL_LAYOUT, phi::FFTC2RGradKernel, float, double) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); +} PD_REGISTER_KERNEL(fft_r2c_grad, GPU, ALL_LAYOUT, phi::FFTR2CGradKernel, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); +} diff --git a/paddle/phi/kernels/gpu/fft_kernel.cu b/paddle/phi/kernels/gpu/fft_kernel.cu index 02c5fc036f2b1151cae8ada2cc876424e7016cdb..ae8fe365e3f3fb292dc95a4985ece9730b74ed0b 100644 --- a/paddle/phi/kernels/gpu/fft_kernel.cu +++ b/paddle/phi/kernels/gpu/fft_kernel.cu @@ -13,6 +13,7 @@ // limitations under the License. 
#include "paddle/phi/kernels/fft_kernel.h" +#include "paddle/phi/common/type_traits.h" #include "paddle/phi/core/kernel_registry.h" #include "paddle/phi/kernels/impl/fft_kernel_impl.h" @@ -28,8 +29,8 @@ PD_REGISTER_KERNEL(fft_c2r, phi::FFTC2RKernel, phi::dtype::complex, phi::dtype::complex) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::dtype::ToReal(kernel_key.dtype())); } PD_REGISTER_KERNEL(fft_r2c, GPU, ALL_LAYOUT, phi::FFTR2CKernel, float, double) { - kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(0).SetDataType(phi::dtype::ToComplex(kernel_key.dtype())); } diff --git a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu index ba731e700e8ea2e6669c19b52d95949df59bcac2..e8fc640cdd508eb232246463729e9e9360241b2f 100644 --- a/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_grad_kernel.cu @@ -117,7 +117,12 @@ PD_REGISTER_KERNEL(layer_norm_grad, ALL_LAYOUT, phi::LayerNormGradKernel, float, - phi::dtype::float16) {} + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} #elif CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(layer_norm_grad, GPU, @@ -126,7 +131,12 @@ PD_REGISTER_KERNEL(layer_norm_grad, float, double, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} #else PD_REGISTER_KERNEL(layer_norm_grad, GPU, @@ -134,5 +144,10 @@ PD_REGISTER_KERNEL(layer_norm_grad, phi::LayerNormGradKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} #endif diff --git a/paddle/phi/kernels/gpu/layer_norm_kernel.cu b/paddle/phi/kernels/gpu/layer_norm_kernel.cu index 9f8122aa7589beb3fd5186ccd6993ccf89ef2278..34425d8cfcfe234cdad362a65842bc2faf678258 100644 --- a/paddle/phi/kernels/gpu/layer_norm_kernel.cu +++ b/paddle/phi/kernels/gpu/layer_norm_kernel.cu @@ -673,7 +673,10 @@ PD_REGISTER_KERNEL(layer_norm, ALL_LAYOUT, phi::LayerNormKernel, float, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); +} #elif CUDNN_VERSION_MIN(8, 1, 0) PD_REGISTER_KERNEL(layer_norm, GPU, @@ -682,7 +685,10 @@ PD_REGISTER_KERNEL(layer_norm, float, double, phi::dtype::float16, - phi::dtype::bfloat16) {} + phi::dtype::bfloat16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); +} #else PD_REGISTER_KERNEL(layer_norm, GPU, @@ -690,5 +696,8 @@ PD_REGISTER_KERNEL(layer_norm, phi::LayerNormKernel, float, double, - phi::dtype::float16) {} + phi::dtype::float16) { + kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); +} #endif diff --git a/paddle/phi/kernels/gpu/merged_momentum_kernel.cu b/paddle/phi/kernels/gpu/merged_momentum_kernel.cu index c6883caecd1a61430ea2ca2c353cb8c5419d664e..c8df58c13806337aaee202e9b20fe59f302f8fda 100644 --- 
a/paddle/phi/kernels/gpu/merged_momentum_kernel.cu +++ b/paddle/phi/kernels/gpu/merged_momentum_kernel.cu @@ -22,4 +22,9 @@ PD_REGISTER_KERNEL(merged_momentum, phi::MergedMomentumKernel, phi::dtype::float16, float, - double) {} + double) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} diff --git a/paddle/phi/kernels/gpu/momentum_kernel.cu b/paddle/phi/kernels/gpu/momentum_kernel.cu index 6d2b51dff64cb00bb968513c03be0cd989bda8cc..464c2c220d1501b6e041a89e01d5b346a8122e44 100644 --- a/paddle/phi/kernels/gpu/momentum_kernel.cu +++ b/paddle/phi/kernels/gpu/momentum_kernel.cu @@ -25,8 +25,10 @@ PD_REGISTER_KERNEL(momentum, float, double, phi::dtype::float16) { - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } } PD_REGISTER_KERNEL(momentum_dense_param_sparse_grad, @@ -36,6 +38,8 @@ PD_REGISTER_KERNEL(momentum_dense_param_sparse_grad, float, double, phi::dtype::float16) { - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } } diff --git a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu index 04b3253178902f85462362a39f9485a6d0eadf11..15215c05d6361ac723ff587482bd2b591f2bae0f 100644 --- a/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu +++ b/paddle/phi/kernels/gpu/reduce_sum_grad_kernel.cu @@ -70,4 +70,6 @@ PD_REGISTER_KERNEL(sum_grad, int, int64_t, phi::dtype::complex, - phi::dtype::complex) {} + phi::dtype::complex) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); +} diff --git a/paddle/phi/kernels/gpu/sgd_kernel.cu b/paddle/phi/kernels/gpu/sgd_kernel.cu index d489ccb4cb223826ecb40f05c34c64e82a553e1e..3e379650f9525935ddee7e2f49a34df91e064d25 100644 --- a/paddle/phi/kernels/gpu/sgd_kernel.cu +++ b/paddle/phi/kernels/gpu/sgd_kernel.cu @@ -188,7 +188,9 @@ PD_REGISTER_KERNEL(sgd, phi::dtype::float16, float, double) { - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + } } PD_REGISTER_KERNEL(sgd_dense_param_sparse_grad, diff --git a/paddle/phi/kernels/memcpy_kernel.cc b/paddle/phi/kernels/memcpy_kernel.cc index 489929fa87b05da3c45f0d3abac9713ac8bd3f35..cf2f6ac00a6d6b85a8ae907cacb0f3d3ad6f45f5 100644 --- a/paddle/phi/kernels/memcpy_kernel.cc +++ b/paddle/phi/kernels/memcpy_kernel.cc @@ -146,13 +146,17 @@ PD_REGISTER_GENERAL_KERNEL(memcpy_d2h, CPU, ALL_LAYOUT, phi::MemcpyD2HKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} PD_REGISTER_GENERAL_KERNEL(memcpy_d2h_multi_io, CPU, ALL_LAYOUT, phi::MemcpyD2HMultiIOKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} PD_REGISTER_GENERAL_KERNEL( memcpy, CPU, ALL_LAYOUT, phi::MemcpyKernel, ALL_DTYPE) { @@ -170,13 +174,17 @@ PD_REGISTER_GENERAL_KERNEL(memcpy_d2h, GPU, ALL_LAYOUT, phi::MemcpyD2HKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + 
kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} PD_REGISTER_GENERAL_KERNEL(memcpy_d2h_multi_io, GPU, ALL_LAYOUT, phi::MemcpyD2HMultiIOKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} PD_REGISTER_GENERAL_KERNEL( memcpy, GPU, ALL_LAYOUT, phi::MemcpyKernel, ALL_DTYPE) { @@ -196,12 +204,16 @@ PD_REGISTER_GENERAL_KERNEL(memcpy_d2h, XPU, ALL_LAYOUT, phi::MemcpyD2HKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} PD_REGISTER_GENERAL_KERNEL(memcpy_d2h_multi_io, XPU, ALL_LAYOUT, phi::MemcpyD2HMultiIOKernel, - ALL_DTYPE) {} + ALL_DTYPE) { + kernel->OutputAt(0).SetBackend(phi::Backend::CPU); +} #endif diff --git a/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc index 10b914a2005cd836a540fb06dcda5bc0edd9addb..8967df2f80e3633821b32ab8c98c8855912c4e43 100644 --- a/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/onednn/reduce_sum_grad_kernel.cc @@ -42,4 +42,5 @@ void SumGradKernel(const Context& dev_ctx, PD_REGISTER_KERNEL( sum_grad, OneDNN, ONEDNN, phi::SumGradKernel, float, phi::dtype::bfloat16) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } diff --git a/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc b/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc index f83b72b79a080957bd85805e1a6ac5e0273244e0..3f99c1ace5176eaec44b0b0aebb2562b60c1065f 100644 --- a/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/layer_norm_grad_kernel.cc @@ -129,4 +129,9 @@ PD_REGISTER_KERNEL(layer_norm_grad, ALL_LAYOUT, phi::LayerNormGradKernel, float, - phi::dtype::float16) {} + phi::dtype::float16) { + if (kernel_key.dtype() == phi::DataType::FLOAT16) { + kernel->OutputAt(1).SetDataType(phi::DataType::FLOAT32); + kernel->OutputAt(2).SetDataType(phi::DataType::FLOAT32); + } +} diff --git a/paddle/phi/kernels/xpu/momentum_kernel.cc b/paddle/phi/kernels/xpu/momentum_kernel.cc index 207bfef37f947ae4ae3bb93bad52fe831de840d9..ad9cb2e6ef86ef86d09636554bacdd9801b58b30 100644 --- a/paddle/phi/kernels/xpu/momentum_kernel.cc +++ b/paddle/phi/kernels/xpu/momentum_kernel.cc @@ -69,7 +69,4 @@ PD_REGISTER_KERNEL(momentum, ALL_LAYOUT, phi::MomentumDenseKernel, float, - phi::dtype::float16) { - kernel->OutputAt(1).SetDataType(phi::DataType::UNDEFINED); - kernel->OutputAt(2).SetDataType(phi::DataType::UNDEFINED); -} + phi::dtype::float16) {} diff --git a/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc b/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc index 0ba67f68bccf3c2d37d280b8266189a35e220563..44be073894d81abf3d748af9fda8408dcf75ada4 100644 --- a/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc +++ b/paddle/phi/kernels/xpu/reduce_sum_grad_kernel.cc @@ -71,4 +71,5 @@ void ReduceSumGradKernel(const Context& dev_ctx, } // namespace phi PD_REGISTER_KERNEL(sum_grad, XPU, ALL_LAYOUT, phi::ReduceSumGradKernel, float) { + kernel->OutputAt(0).SetDataType(phi::DataType::UNDEFINED); } diff --git a/python/paddle/fluid/tests/unittests/CMakeLists.txt b/python/paddle/fluid/tests/unittests/CMakeLists.txt index 1b881051b041b7e850fcf1cfa9547e2251411248..57ef073e51243298fc7785bf7e36ae233fb263a0 100755 --- a/python/paddle/fluid/tests/unittests/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/CMakeLists.txt @@ -913,6 +913,11 @@ if(WITH_DISTRIBUTE) set_tests_properties(test_dist_fleet_raw_program_optimizer_fuse_allreduce PROPERTIES TIMEOUT 60) set_tests_properties(test_dist_dygraph_apis PROPERTIES TIMEOUT 120) + + # 
NOTE(Ruibiao): Remove it after static build is enabled by default. + set_tests_properties( + test_dist_mnist_fp16_allreduce test_dist_mnist_pg + PROPERTIES ENVIRONMENT FLAGS_new_executor_static_build=true) endif() # setting timeout value as 15S @@ -1229,3 +1234,52 @@ set_tests_properties( set_tests_properties( test_cuda_graph_static_mode_error PROPERTIES ENVIRONMENT "FLAGS_CUDA_GRAPH_USE_STANDALONE_EXECUTOR=1") + +# These UTs are to temporarily test static build for standalone_executor, and will be removed after static build is enabled by default. +set(STATIC_BUILD_TESTS + test_adagrad_op + test_adamw_op + test_arg_min_max_op + test_bincount_op + test_decoupled_py_reader + test_fake_quantize_op + test_fetch_lod_tensor_array + test_imperative_optimizer + test_lamb_op + test_layer_norm_op + test_lookup_table_bf16_op + test_lookup_table_v2_op + test_matmul_op + test_matmul_v2_op + test_merged_adam_op + test_momentum_op + test_nce + test_paddle_save_load_binary + test_reduce_op + test_segment_ops + test_sparse_momentum_op + test_sgd_op_bf16 + test_softmax_mask_fuse_upper_triangle_op + test_sparse_conv_op + test_sparse_norm_op + test_sparse_pooling_op + test_tensor_array_to_tensor + test_while_op + test_one_hot_v2_op) + +foreach(STATIC_BUILD_TEST ${STATIC_BUILD_TESTS}) + py_test_modules( + ${STATIC_BUILD_TEST}_static_build MODULES ${STATIC_BUILD_TEST} ENVS + FLAGS_new_executor_static_build=true) +endforeach() + +set_tests_properties(test_decoupled_py_reader_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_imperative_optimizer_static_build PROPERTIES TIMEOUT 250) set_tests_properties(test_matmul_op_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_matmul_v2_op_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_layer_norm_op_static_build PROPERTIES TIMEOUT 1500) set_tests_properties(test_paddle_save_load_binary_static_build PROPERTIES TIMEOUT 120) set_tests_properties(test_reduce_op_static_build PROPERTIES TIMEOUT 500) diff --git a/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt b/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt index d6a1fa1c9be541fea4a5c07a8bddbee4c688d91c..2105cee7c478935b98af1e82f778a9722a717716 100644 --- a/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt +++ b/python/paddle/fluid/tests/unittests/standalone_executor/CMakeLists.txt @@ -24,6 +24,18 @@ py_test_modules( test_standalone_executor_stats MODULES test_standalone_executor ENVS FLAGS_host_trace_level=10 FLAGS_static_executor_perfstat_filepath=./perfstat) +# These UTs are to temporarily test static build for standalone_executor, and will be removed after static build is enabled by default.
+set(STATIC_BUILD_TESTS + test_standalone_controlflow test_standalone_cuda_graph_multi_stream + test_standalone_custom_stream test_standalone_executor + test_standalone_multiply_write) + +foreach(STATIC_BUILD_TEST ${STATIC_BUILD_TESTS}) + py_test_modules( + ${STATIC_BUILD_TEST}_static_build MODULES ${STATIC_BUILD_TEST} ENVS + FLAGS_new_executor_static_build=true) +endforeach() + set_tests_properties(test_standalone_cross_step_overlap PROPERTIES TIMEOUT 30) set_tests_properties(test_standalone_executor_aot_choose_kernel PROPERTIES TIMEOUT 60) diff --git a/python/paddle/fluid/tests/unittests/test_dist_base.py b/python/paddle/fluid/tests/unittests/test_dist_base.py index 000e2955e464812a10d7899b5e418bd23ecdc25b..a4730bdccc1ac23f6437f634da49ca763d7106d6 100755 --- a/python/paddle/fluid/tests/unittests/test_dist_base.py +++ b/python/paddle/fluid/tests/unittests/test_dist_base.py @@ -1705,6 +1705,7 @@ class TestDistBase(unittest.TestCase): "http_proxy": "", "NCCL_P2P_DISABLE": "1", "NCCL_SHM_DISABLE": "1", + "FLAGS_new_executor_static_build": "1", } if check_error_log: diff --git a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py index b4811d926d5c6319c5b29b8e3d2aa9c2432ef22a..82dbaaf0e78c460e60bbcce660fd77bbfdd13898 100644 --- a/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py +++ b/python/paddle/fluid/tests/unittests/test_softmax_mask_fuse_upper_triangle_op.py @@ -69,13 +69,13 @@ class TestSoftmaxMaskFuseOp1(OpTest): def test_check_output(self): try: self.check_output_with_place(core.CPUPlace()) - except NotImplementedError: + except (NotImplementedError, RuntimeError): pass def test_check_grad(self): try: self.check_grad_with_place(core.CPUPlace(), ["X"], "Out") - except NotImplementedError: + except (NotImplementedError, RuntimeError): pass diff --git a/test/custom_op/CMakeLists.txt b/test/custom_op/CMakeLists.txt index e0538d417a6920fddabf11fa57d306e13969e5df..d7f32625db4abb0649329a21684da86f761ecaf9 100644 --- a/test/custom_op/CMakeLists.txt +++ b/test/custom_op/CMakeLists.txt @@ -49,6 +49,9 @@ if(WITH_TESTING) py_test(test_multi_out_jit SRCS test_multi_out_jit.py) py_test(test_custom_attrs_jit SRCS test_custom_attrs_jit.py) py_test(test_custom_concat SRCS test_custom_concat.py) + set_tests_properties( + test_custom_concat PROPERTIES ENVIRONMENT + FLAGS_new_executor_static_build=true) py_test(test_custom_conj SRCS test_custom_conj.py) py_test(test_custom_linear SRCS test_custom_linear.py) py_test(test_custom_simple_slice SRCS test_custom_simple_slice.py) diff --git a/test/mkldnn/CMakeLists.txt b/test/mkldnn/CMakeLists.txt index 50062d69bc8c920446d756d2650f01042ae47731..d3da5f2897da39edee1fe2d634ca6a6addbad23d 100644 --- a/test/mkldnn/CMakeLists.txt +++ b/test/mkldnn/CMakeLists.txt @@ -14,6 +14,14 @@ endif() foreach(TEST_OP ${TEST_OPS}) py_test_modules(${TEST_OP} MODULES ${TEST_OP}) endforeach() + +# NOTE(Ruibiao): Remove it after static build is enabled by default.
+if(WITH_MKLDNN AND NOT WIN32) + py_test_modules( + test_dequantize_mkldnn_op_static_build MODULES test_dequantize_mkldnn_op + ENVS FLAGS_new_executor_static_build=true) +endif() + set_tests_properties(test_concat_mkldnn_op PROPERTIES TIMEOUT 120) set_tests_properties(test_conv3d_mkldnn_op PROPERTIES TIMEOUT 120) if(WITH_MKLDNN AND NOT WIN32) diff --git a/tools/check_file_diff_approvals.sh b/tools/check_file_diff_approvals.sh index e0d4a6e062383dbefb2a89487d154656018008b3..39ece6a3b13466ed9e1f2d69deffc52b1574215b 100644 --- a/tools/check_file_diff_approvals.sh +++ b/tools/check_file_diff_approvals.sh @@ -306,6 +306,12 @@ if [ "${HAS_MODIFIED_SETUP}" != "" ] || ([ "${HAS_MODIFIED_SETUP_IN}" != "" ] && check_approval 1 risemeup1 zhangbo9674 fi +HAS_MODIFIED_STATIC_BUILD=`git diff --name-only upstream/$BRANCH | grep "new_executor/interpreter/static_build.cc" || true` +if [ "${HAS_MODIFIED_STATIC_BUILD}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then + echo_line="You must have one RD (From00 or zhiqiu) approval for file changes in new_executor/interpreter/static_build.cc.\n" + check_approval 1 From00 zhiqiu +fi + ALL_PADDLE_ENFORCE=`git diff -U0 upstream/$BRANCH |grep "^+" |grep -zoE "PADDLE_ENFORCE\(.[^,\);]+.[^;]*\);\s" || true` if [ "${ALL_PADDLE_ENFORCE}" != "" ] && [ "${GIT_PR_ID}" != "" ]; then echo_line="PADDLE_ENFORCE is not recommended. Please use PADDLE_ENFORCE_EQ/NE/GT/GE/LT/LE or PADDLE_ENFORCE_NOT_NULL or PADDLE_ENFORCE_GPU_SUCCESS instead, see [ https://github.com/PaddlePaddle/Paddle/wiki/PADDLE_ENFORCE-Rewriting-Specification ] for details.\nYou must have one RD (chenwhql (Recommend), luotao1 (Recommend) or lanxianghit) approval for the usage (either add or delete) of PADDLE_ENFORCE.\n${ALL_PADDLE_ENFORCE}\n" diff --git a/tools/windows/run_unittests.sh b/tools/windows/run_unittests.sh index 20c84c778d8024a6e939e3c5b66905b4458fab8b..87b32b12932e0afd8a87dcb45ec9a2e7fdeabf55 100644 --- a/tools/windows/run_unittests.sh +++ b/tools/windows/run_unittests.sh @@ -20,6 +20,7 @@ disable_wingpu_test="^test_model$|\ ^test_add_reader_dependency$|\ ^test_add_reader_dependency_for_interpretercore$|\ ^test_decoupled_py_reader$|\ +^test_decoupled_py_reader_static_build$|\ ^test_generator_dataloader$|\ ^test_parallel_dygraph_sync_batch_norm$|\ ^test_py_reader_using_executor$|\ @@ -103,6 +104,7 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_conv3d_transpose_part2_op$|\ ^test_deform_conv2d$|\ ^test_matmul_op$|\ +^test_matmul_op_static_build$|\ ^test_basic_api_transformation$|\ ^test_deformable_conv_op$|\ ^test_variable$|\ @@ -153,6 +155,7 @@ disable_win_inference_test="^trt_quant_int8_yolov3_r50_test$|\ ^test_add_reader_dependency_for_interpretercore$|\ ^test_compat$|\ ^test_decoupled_py_reader$|\ +^test_decoupled_py_reader_static_build$|\ ^test_generator_dataloader$|\ ^test_py_reader_using_executor$|\ ^test_dataloader_keep_order$|\ @@ -223,6 +226,7 @@ long_time_test="^test_gru_op$|\ ^test_imperative_lod_tensor_to_selected_rows$|\ ^test_imperative_selected_rows_to_lod_tensor$|\ ^test_layer_norm_op$|\ +^test_layer_norm_op_static_build$|\ ^test_multiclass_nms_op$|\ ^test_nearest_interp_v2_op$|\ ^test_nn_grad$|\