Unverified Commit f9c2f4ca authored by H hong, committed by GitHub

New ir support legacy kernel instruction (#55880)

* new ir remove fetch list

* fix pattern rewrite bug

* try to remove constant fold

* revert code

* add pattern rewrite test flag

* fix multi fetch

* remove useless code

* new ir support legacy kernel instruction

* new ir support legacy kernel instruction

* add scope prefix

* update

* update

* update

* update

* fix

* revert channel shuffle test

* polish code

* try to fix windows compile error

* polish code

* update

* update

* revert op test
Parent 393db4a9
cc_library(
instruction_base
SRCS instruction_base.cc phi_kernel_instruction.cc
legacy_kernel_instruction.cc instruction_util.cc
DEPS phi framework_proto)
@@ -13,9 +13,15 @@
// limitations under the License.
#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h"
#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
#include "paddle/fluid/platform/collective_helper.h"
#include "paddle/ir/core/builtin_attribute.h"
namespace paddle {
namespace framework {
@@ -93,5 +99,59 @@ void InstructionBase::SetOutputs(
output_index_ = outputs;
}
void InstructionBase::InitInputsOutputsIds(
::ir::Operation* op,
Scope* inner_scope,
const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
const std::map<std::string, int>& var_name_2_id,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name) {
auto op_attributes = op->attributes();
auto op_name =
op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
std::unordered_map<ir::Value, std::vector<int>> inputs;
for (size_t i = 0; i < op->num_operands(); i++) {
ir::Value value = op->operand_source(i);
if (value) {
PADDLE_ENFORCE_NE(
value_2_var_name.find(value),
value_2_var_name.end(),
phi::errors::PreconditionNotMet(
"input should in name map, [%d] 'th input of [%s] op",
i,
op_name));
std::vector<int> inputs_id = GetValueIds(value,
inner_scope,
value_2_var_name,
var_name_2_id,
variable_2_var_name);
inputs.emplace(value, inputs_id);
}
}
SetInputs(inputs);
VLOG(8) << "finish process inputs_index";
std::unordered_map<ir::Value, std::vector<int>> outputs;
for (size_t i = 0; i < op->num_results(); i++) {
ir::Value value = op->result(i);
if (value && value.type()) {
PADDLE_ENFORCE_NE(
value_2_var_name.find(value),
value_2_var_name.end(),
phi::errors::PreconditionNotMet(
"input should in name map, [%d] 'th input of [%s] op",
i,
op_name));
std::vector<int> outputs_id = GetValueIds(value,
inner_scope,
value_2_var_name,
var_name_2_id,
variable_2_var_name);
outputs.emplace(value, outputs_id);
}
}
SetOutputs(outputs);
VLOG(8) << "finish process outputs_index";
}
} // namespace framework
} // namespace paddle
@@ -21,6 +21,7 @@
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/ir/core/value.h"
namespace ir {
class Value;
@@ -137,7 +138,15 @@ class InstructionBase {
virtual const std::string& Name() const = 0;
void InitInputsOutputsIds(
::ir::Operation* op,
Scope* inner_scope,
const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
const std::map<std::string, int>& var_name_2_id,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name);
protected:
size_t id_;
bool is_artificial_; // Instruction is artificial means that it is only used
...
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/ir/core/builtin_attribute.h"
#include "paddle/ir/core/operation.h"
#include "paddle/ir/core/value.h"
#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
#include "paddle/fluid/platform/collective_helper.h"
namespace paddle {
namespace framework {
std::vector<int> GetValueIds(
ir::Value value,
Scope* inner_scope,
const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
const std::map<std::string, int>& var_name_2_id,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name) {
std::vector<int> ids;
std::string var_name = value_2_var_name.at(value);
ids.push_back(var_name_2_id.at(var_name));
// NOTE(zhangbo): Value may be a VariableRefArray
auto var = inner_scope->FindVar(var_name);
if (var->IsType<paddle::framework::VariableRefArray>()) {
auto& var_array = var->Get<paddle::framework::VariableRefArray>();
for (auto item : var_array) {
ids.push_back(var_name_2_id.at(variable_2_var_name.at(item)));
}
}
return ids;
}
platform::DeviceContext* ParseDeviceContext(
ir::Operation* op,
platform::DeviceContext* origin_dev_ctx,
const platform::Place& place,
const std::string& execution_stream,
const int stream_priority) {
auto op_attributes = op->attributes();
auto op_name =
op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
interpreter::ContextManager& ctx_manager =
interpreter::ContextManager::Instance();
platform::DeviceContext* dev_ctx = nullptr;
// Only GPU needs this update; XPU does not, because the XPU memcpy op kernel
// is synchronous.
if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
VLOG(6) << "Parse DeviceContext for " << op_name
<< ", execution stream = " << execution_stream;
if (execution_stream != kDefaultStream) {
dev_ctx = ctx_manager
.Get(std::string(kCustomStream) + "-" + execution_stream,
place,
stream_priority)
.get()
.get();
interpreter::SetDeviceCommContext(op, dev_ctx);
return dev_ctx;
}
if (op_name == interpreter::kMemcpyD2H) {
dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
.get()
.get();
interpreter::SetDeviceCommContext(op, dev_ctx);
return dev_ctx;
} else if (op_name == interpreter::kMemcpyH2D) {
dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
.get()
.get();
interpreter::SetDeviceCommContext(op, dev_ctx);
return dev_ctx;
}
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NOTE(Ruibiao): Multi-stream overlap is supported here for c_allreduce_sum
// with use_calc_stream==false by returning a device context obtained from
// the global NCCLCommContext instance. When use_calc_stream==false, the OP
// kernel launches the NCCL communication on the stream obtained directly
// from the global NCCLCommContext instance rather than on the DeviceContext
// passed in from the executor (see CAllReduceOpCUDAKernel in
// c_allreduce_op.h). For now this is a temporary solution for ONLY
// c_allreduce_sum, which is used in ResNet50 distributed training.
if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream")
.dyn_cast<::ir::BoolAttribute>()
.data() == false) {
int ring_id =
op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
return platform::NCCLCommContext::Instance()
.Get(ring_id, place)
->dev_context();
}
#endif
}
if (origin_dev_ctx != nullptr) {
interpreter::SetDeviceCommContext(op, origin_dev_ctx);
}
return origin_dev_ctx;
}
OpFuncType AnalyseOpFuncType(::ir::Operation* op,
const platform::Place& place) {
if (platform::is_cpu_place(place)) {
return OpFuncType::kCpuSync;
}
auto kernel_key = op->attributes()
.at("kernel_key")
.dyn_cast<dialect::KernelAttribute>()
.data();
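// A kernel whose backend resolves to CPU runs synchronously on the host,
// even when the program place is a device place.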
if (phi::TransToPhiPlace(kernel_key.backend()).GetType() ==
phi::AllocationType::CPU) {
return OpFuncType::kCpuSync;
}
PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place),
true,
phi::errors::Fatal("Unsupported current place %s", place));
// Some GPU OPs do not launch a CUDA kernel but spend a lot of time on CPU
// computing. They execute serially in the device thread and block CUDA kernel
// launching in other GPU OPs. To improve performance, mark them as kGpuSync
// so that they are dispatched to the host thread.
auto op_attributes = op->attributes();
auto op_name =
op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
if (op_name == kCoalesceTensor &&
(!platform::is_xpu_place(place) ||
op->attribute<ir::BoolAttribute>("persist_output").data() == false) &&
op->attribute<ir::BoolAttribute>("set_constant").data() == false &&
op->attribute<ir::BoolAttribute>("copy_data").data() == false) {
return OpFuncType::kGpuSync;
}
// for memcpy explicitly called by user
if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) {
return OpFuncType::kGpuSync;
}
if (op_name == "shape") {
return OpFuncType::kGpuSync;
}
return OpFuncType::kGpuAsync;
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
#include <unordered_map>
#include <vector>
#include "paddle/fluid/framework/new_executor/new_executor_defs.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/fluid/platform/event.h"
#include "paddle/ir/core/builtin_attribute.h"
#include "paddle/ir/core/operation.h"
#include "paddle/ir/core/value.h"
namespace paddle {
namespace framework {
std::vector<int> GetValueIds(
ir::Value value,
Scope* inner_scope,
const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
const std::map<std::string, int>& var_name_2_id,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name);
platform::DeviceContext* ParseDeviceContext(
ir::Operation* op,
platform::DeviceContext* origin_dev_ctx,
const platform::Place& place,
const std::string& execution_stream,
const int stream_priority);
OpFuncType AnalyseOpFuncType(::ir::Operation* op, const platform::Place& place);
} // namespace framework
} // namespace paddle
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h"
#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
#include "paddle/fluid/framework/new_executor/interpreter/interpreter_util.h"
#include "paddle/fluid/framework/new_executor/interpreter/stream_analyzer.h"
#include "paddle/fluid/framework/scope.h"
#include "paddle/fluid/ir/dialect/pd_dialect.h"
#include "paddle/fluid/ir/interface/infermeta.h"
#include "paddle/fluid/ir/interface/op_yaml_info.h"
#include "paddle/fluid/ir/interface/op_yaml_info_parser.h"
#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
#include "paddle/fluid/platform/device_context.h"
#include "paddle/phi/core/infermeta_utils.h"
#include "paddle/phi/core/meta_tensor.h"
#include "paddle/phi/core/type_defs.h"
namespace paddle {
namespace framework {
LegacyKernelInstruction::LegacyKernelInstruction(
size_t id,
const platform::Place& place,
ir::Operation* op,
Scope* scope,
Scope* local_scope,
const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
const std::map<std::string, int>& var_name_2_id,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name)
: InstructionBase(id, place) {
auto op_attributes = op->attributes();
auto op_name =
op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
ir::OpInfo op_info = ir::IrContext::Instance()->GetRegisteredOpInfo(op_name);
legacy_op_name_ = op_name;
VLOG(6) << "construct phi kernel instruction for: " << legacy_op_name_;
// Todo: support paddle::dialect::DistAttribute
// if (op_attributes.count("dist_attr") != 0) {
// if (op_attributes.count("execution_stream") != 0) {
// SetExecutionStream(op_attributes.at("execution_stream")
// .dyn_cast<::ir::StrAttribute>()
// .data());
// }
// if (op_attributes.count("stream_priority") != 0) {
// SetStreamPriority(op_attributes.at("stream_priority")
// .dyn_cast<::ir::Int32Attribute>()
// .data());
// }
// if (op_attributes.count("scheduling_priority") != 0) {
// SetSchedulingPriority(op_attributes.at("scheduling_priority")
// .dyn_cast<::ir::Int64Attribute>()
// .data());
// }
// } else {
// if (interpreter::IsCommunicationOp(op)) {
// // NOTE(Ruibiao): Dispatching computation before communication
// improves
// // multi-stream overlap when the time cost of communication less than
// // that of the calculation (e.g., ResNet50_bs128_pure_fp16 N4C32
// // training).
// op_func_node.scheduling_priority_ = 1;
// }
// }
VLOG(6) << "finish process dist attributes";
SetKernelType(AnalyseOpFuncType(op, place));
VLOG(6) << "finish process analyse kernel type";
infer_meta_interface_ =
op_info.GetInterfaceImpl<paddle::dialect::InferMetaInterface>();
VLOG(6) << "finish process infer_meta_interface_";
auto yaml_interface =
op_info.GetInterfaceImpl<paddle::dialect::OpYamlInfoInterface>();
PADDLE_ENFORCE_NOT_NULL(
yaml_interface,
phi::errors::PreconditionNotMet(
"can not find OpYamlInfoInterface from [%s]", legacy_op_name_));
paddle::dialect::OpYamlInfoParser yaml_info_parser(
yaml_interface->get_op_info_());
VLOG(6) << "finish process yaml_info_parser";
::ir::BuildPhiContext<
phi::InferMetaContext,
phi::MetaTensor,
phi::MetaTensor,
paddle::small_vector<phi::MetaTensor, phi::kInputSmallVectorSize>,
paddle::small_vector<phi::MetaTensor, phi::kInputSmallVectorSize>,
false>(op,
value_2_var_name,
scope,
local_scope,
yaml_info_parser,
&infer_meta_context_);
VLOG(6) << "finish process infer meta context";
auto kernel_name =
op_attributes.at("kernel_name").dyn_cast<ir::StrAttribute>().AsString();
auto kernel_key = op_attributes.at("kernel_key")
.dyn_cast<paddle::dialect::KernelAttribute>()
.data();
auto kernel_result = phi::KernelFactory::Instance().SelectKernelOrThrowError(
kernel_name, kernel_key);
phi_kernel_ = new phi::Kernel(kernel_result.kernel);
PADDLE_ENFORCE_EQ(
phi_kernel_->IsValid(),
true,
phi::errors::PreconditionNotMet("not found kernel for [%s]", kernel_name));
VLOG(6) << "finish process select kernel";
operator_base_ =
ir::BuildOperatorBase(op, value_2_var_name, yaml_info_parser);
paddle::framework::VariableValueMap in_map;
paddle::framework::VariableValueMap out_map;
auto dev_ctx = phi::DeviceContextPool::Instance().Get(
phi::TransToPhiPlace(kernel_key.backend()));
runtime_context_ = std::make_shared<paddle::framework::RuntimeContext>(
paddle::framework::RuntimeContext(in_map, out_map));
ir::BuildRuntimeContext(op,
value_2_var_name,
scope,
local_scope,
yaml_info_parser,
runtime_context_.get());
kernel_context_ = new paddle::framework::ExecutionContext(
*operator_base_, *local_scope, *dev_ctx, *(runtime_context_.get()));
VLOG(6) << "finish process kernel context";
SetDeviceContext(
ParseDeviceContext(op,
phi::DeviceContextPool::Instance().Get(
phi::TransToPhiPlace(kernel_key.backend())),
place,
GetExecutionStream(),
GetStreamPriority()));
VLOG(6) << "finish process device context";
Scope* inner_scope = local_scope == nullptr ? scope : local_scope;
InitInputsOutputsIds(
op, inner_scope, value_2_var_name, var_name_2_id, variable_2_var_name);
VLOG(6) << "finish process inputs outputs index";
auto& no_need_buffer_ids = yaml_info_parser.NoNeedBufferIds();
std::unordered_set<::ir::Value> no_need_buffer_values;
for (size_t id = 0; id < no_need_buffer_ids.size(); id++) {
no_need_buffer_values.insert(op->operand_source(no_need_buffer_ids[id]));
}
SetNoNeedBuffer(no_need_buffer_values);
VLOG(6) << "finish process no need buffer";
}
LegacyKernelInstruction::~LegacyKernelInstruction() {
if (kernel_context_ != nullptr) {
delete kernel_context_;
}
}
void LegacyKernelInstruction::Run() {
infer_meta_interface_->infer_meta_(&(infer_meta_context_));
VLOG(6) << "Run op " << legacy_op_name_ << " infer meta.";
(*(phi_kernel_))((kernel_context_));
VLOG(6) << "Run op " << legacy_op_name_ << " kernel.";
}
} // namespace framework
} // namespace paddle
// Copyright (c) 2023 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/new_executor/instruction/instruction_base.h"
namespace ir {
class Operation;
class Value;
} // namespace ir
namespace paddle {
namespace framework {
class Scope;
class LegacyKernelInstruction : public InstructionBase {
public:
LegacyKernelInstruction(
size_t id,
const platform::Place& place,
::ir::Operation* op,
Scope* scope,
Scope* local_scope,
const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
const std::map<std::string, int>& var_name_2_id,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name);
~LegacyKernelInstruction();
phi::Kernel* PhiKernel() const { return phi_kernel_; }
const phi::InferMetaContext& InferMetaContext() const {
return infer_meta_context_;
}
paddle::dialect::InferMetaInterface::Concept* InferMetaInterface() const {
return infer_meta_interface_;
}
void Run() override;
const std::string& Name() const override { return legacy_op_name_; }
private:
std::string legacy_op_name_;
paddle::dialect::InferMetaInterface::Concept* infer_meta_interface_{
nullptr}; // not owned
phi::InferMetaContext infer_meta_context_;
paddle::framework::ExecutionContext* kernel_context_{nullptr};
std::shared_ptr<framework::RuntimeContext> runtime_context_;
std::shared_ptr<paddle::framework::OperatorBase> operator_base_;
phi::Kernel* phi_kernel_{nullptr}; // not owned
};
} // namespace framework
} // namespace paddle
@@ -32,124 +32,10 @@
#include "paddle/ir/core/operation.h"
#include "paddle/ir/core/value.h"
#include "paddle/fluid/framework/new_executor/instruction/instruction_util.h"
namespace paddle {
namespace framework {
platform::DeviceContext* ParseDeviceContext(
ir::Operation* op,
platform::DeviceContext* origin_dev_ctx,
const platform::Place& place,
const std::string& execution_stream,
const int stream_priority) {
auto op_attributes = op->attributes();
auto op_name =
op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
interpreter::ContextManager& ctx_manager =
interpreter::ContextManager::Instance();
platform::DeviceContext* dev_ctx = nullptr;
// only gpu need update. xpu not need, because xpu memcpy op kernel is
// synchronous.
if (platform::is_gpu_place(place) || platform::is_custom_place(place)) {
VLOG(6) << "Parse DeviceContext for " << op_name
<< ", execution stream = " << execution_stream;
if (execution_stream != kDefaultStream) {
dev_ctx = ctx_manager
.Get(std::string(kCustomStream) + "-" + execution_stream,
place,
stream_priority)
.get()
.get();
interpreter::SetDeviceCommContext(op, dev_ctx);
return dev_ctx;
}
if (op_name == interpreter::kMemcpyD2H) {
dev_ctx = ctx_manager.Get(std::string(kD2HStream), place, stream_priority)
.get()
.get();
interpreter::SetDeviceCommContext(op, dev_ctx);
return dev_ctx;
} else if (op_name == interpreter::kMemcpyH2D) {
dev_ctx = ctx_manager.Get(std::string(kH2DStream), place, stream_priority)
.get()
.get();
interpreter::SetDeviceCommContext(op, dev_ctx);
return dev_ctx;
}
#if defined(PADDLE_WITH_NCCL) || defined(PADDLE_WITH_RCCL)
// NOTE(Ruibiao): Here supports multi-stream overlap for c_allreduce_sum
// with use_cal_stream==false by returning a device context getting from the
// global NCCLCommContext instance. Because when use_calc_stream==false, in
// OP kernel, the NCCL communication will be launched to the stream directly
// getting from the global NCCLCommContext instance rather than the
// DeviceContext passed from executor (see CAllReduceOpCUDAKernel in
// c_allreduce_op.h). Now it is just a temporary solution for ONLY
// c_allreduce_sum which is used in ResNet50 distributed training.
if (op_name == "c_allreduce_sum" && op_attributes.at("use_calc_stream")
.dyn_cast<::ir::BoolAttribute>()
.data() == false) {
int ring_id =
op_attributes.at("ring_id").dyn_cast<::ir::Int32Attribute>().data();
return platform::NCCLCommContext::Instance()
.Get(ring_id, place)
->dev_context();
}
#endif
}
if (origin_dev_ctx != nullptr) {
interpreter::SetDeviceCommContext(op, origin_dev_ctx);
}
return origin_dev_ctx;
}
OpFuncType AnalyseOpFuncType(ir::Operation* op, const platform::Place& place) {
if (platform::is_cpu_place(place)) {
return OpFuncType::kCpuSync;
}
auto kernel_key = op->attributes()
.at("kernel_key")
.dyn_cast<dialect::KernelAttribute>()
.data();
if (phi::TransToPhiPlace(kernel_key.backend()).GetType() ==
phi::AllocationType::CPU) {
return OpFuncType::kCpuSync;
}
PADDLE_ENFORCE_EQ(interpreter::IsSupportedHeterPlace(place),
true,
phi::errors::Fatal("Unsupported current place %s", place));
// Some GPU OPs do not launch CUDA Kernel, but spend a lot of time on CPU
// computing. They execute serially in device thread and block CUDA kernel
// launching in other GPU OPs. To improve performance, set them as kGpuSync
// and so that they would be dispatched to host thread.
auto op_attributes = op->attributes();
auto op_name =
op_attributes.at("op_name").dyn_cast<::ir::StrAttribute>().AsString();
if (op_name == kCoalesceTensor &&
(!platform::is_xpu_place(place) ||
op->attribute<ir::BoolAttribute>("persist_output").data() == false) &&
op->attribute<ir::BoolAttribute>("set_constant").data() == false &&
op->attribute<ir::BoolAttribute>("copy_data").data() == false) {
return OpFuncType::kGpuSync;
}
// for memcpy explicitly called by user
if (platform::is_gpu_place(place) && op_name == interpreter::kMemcpyD2H) {
return OpFuncType::kGpuSync;
}
if (op_name == "shape") {
return OpFuncType::kGpuSync;
}
return OpFuncType::kGpuAsync;
}
PhiKernelInstruction::PhiKernelInstruction(
size_t id,
const platform::Place& place,
@@ -281,78 +167,6 @@ PhiKernelInstruction::PhiKernelInstruction(
VLOG(6) << "finish process no need buffer";
}
std::vector<int> GetValueIds(
ir::Value value,
Scope* inner_scope,
const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
const std::map<std::string, int>& var_name_2_id,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name) {
std::vector<int> ids;
std::string var_name = value_2_var_name.at(value);
ids.push_back(var_name_2_id.at(var_name));
// NOTE(zhangbo): Value maybe a VariableRefArray
auto var = inner_scope->FindVar(var_name);
if (var->IsType<paddle::framework::VariableRefArray>()) {
auto& var_array = var->Get<paddle::framework::VariableRefArray>();
for (auto item : var_array) {
ids.push_back(var_name_2_id.at(variable_2_var_name.at(item)));
}
}
return ids;
}
void PhiKernelInstruction::InitInputsOutputsIds(
::ir::Operation* op,
Scope* inner_scope,
const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
const std::map<std::string, int>& var_name_2_id,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name) {
std::unordered_map<ir::Value, std::vector<int>> inputs;
for (size_t i = 0; i < op->num_operands(); i++) {
ir::Value value = op->operand_source(i);
if (value) {
PADDLE_ENFORCE_NE(
value_2_var_name.find(value),
value_2_var_name.end(),
phi::errors::PreconditionNotMet(
"input should in name map, [%d] 'th input of [%s] op",
i,
phi_op_name_));
std::vector<int> inputs_id = GetValueIds(value,
inner_scope,
value_2_var_name,
var_name_2_id,
variable_2_var_name);
inputs.emplace(value, inputs_id);
}
}
SetInputs(inputs);
VLOG(8) << "finish process inputs_index";
std::unordered_map<ir::Value, std::vector<int>> outputs;
for (size_t i = 0; i < op->num_results(); i++) {
ir::Value value = op->result(i);
if (value && value.type()) {
PADDLE_ENFORCE_NE(
value_2_var_name.find(value),
value_2_var_name.end(),
phi::errors::PreconditionNotMet(
"input should in name map, [%d] 'th input of [%s] op",
i,
phi_op_name_));
std::vector<int> outputs_id = GetValueIds(value,
inner_scope,
value_2_var_name,
var_name_2_id,
variable_2_var_name);
outputs.emplace(value, outputs_id);
}
}
SetOutputs(outputs);
VLOG(8) << "finish process outputs_index";
}
void PhiKernelInstruction::Run() {
if (infer_meta_interface_) {
infer_meta_interface_->infer_meta_(&(infer_meta_context_));
...
@@ -55,16 +55,6 @@ class PhiKernelInstruction : public InstructionBase {
const std::string& Name() const override { return phi_op_name_; }
private:
void InitInputsOutputsIds(
::ir::Operation* op,
Scope* inner_scope,
const std::unordered_map<::ir::Value, std::string>& value_2_var_name,
const std::map<std::string, int>& var_name_2_id,
const std::unordered_map<const paddle::framework::Variable*, std::string>&
variable_2_var_name);
std::string phi_op_name_;
paddle::dialect::InferMetaInterface::Concept* infer_meta_interface_{
nullptr}; // not owned
@@ -73,6 +63,8 @@ class PhiKernelInstruction : public InstructionBase {
phi::KernelContext kernel_context_;
phi::Kernel* phi_kernel_{nullptr}; // not owned
std::string phi_op_name_;
};
} // namespace framework
...
@@ -125,5 +125,6 @@ const platform::Place& InterpreterCore::GetPlace() const {
void InterpreterCore::SetOutputHooks(const std::vector<HookFunc>& hookfuncs) {
impl_->SetOutputHooks(hookfuncs);
}
} // namespace framework
} // namespace paddle
@@ -36,6 +36,7 @@
#include "paddle/fluid/platform/flags.h"
#include "paddle/phi/backends/device_manager.h"
#include "paddle/fluid/framework/new_executor/instruction/legacy_kernel_instruction.h"
#include "paddle/fluid/framework/new_executor/instruction/phi_kernel_instruction.h"
#include "paddle/fluid/ir/phi_kernel_adaptor/phi_kernel_util.h"
#include "paddle/ir/core/builtin_attribute.h"
@@ -251,7 +252,6 @@ FetchList NewIRInterpreter::Run(const std::vector<std::string>& feed_names,
// return Fetch Tensors
Scope* inner_scope = InnerScope();
if (FLAGS_enable_new_ir_in_executor) {
framework::FetchList fetch_res;
@@ -1545,6 +1545,19 @@ void NewIRInterpreter::BuildInstruction() {
VLOG(6) << "skip process " << op_name;
continue;
}
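// Ops in the legacy op list (see LegacyOpList in pd_op_to_kernel_pass.cc)
// still run through the fluid kernel path, so build a LegacyKernelInstruction
// for them.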
if (op_name == "pd.fused_softmax_mask_upper_triangle" ||
op_name == "pd.fused_softmax_mask_upper_triangle_grad") {
vec_instruction_base_.emplace_back(
std::make_unique<LegacyKernelInstruction>(op_idx++,
place_,
op,
scope_,
local_scope_,
value_2_var_name_,
var_name_2_id_,
variable_2_var_name_));
} else {
vec_instruction_base_.emplace_back(
std::make_unique<PhiKernelInstruction>(op_idx++,
place_,
@@ -1554,6 +1567,7 @@ void NewIRInterpreter::BuildInstruction() {
value_2_var_name_,
var_name_2_id_,
variable_2_var_name_));
}
} else {
PADDLE_THROW(platform::errors::Unimplemented(
"Now only support pd_kernel dialect."));
...
@@ -159,7 +159,6 @@ paddle::framework::FetchList StandaloneExecutor::Run(
// return Fetch Tensors
if (FLAGS_enable_new_ir_in_executor) {
framework::FetchList fetch_res;
for (auto& var_name : fetch_var_names_) {
auto* var = scope_->FindVar(var_name);
fetch_res.push_back(var->Get<phi::DenseTensor>());
...
@@ -234,6 +234,7 @@ void HandleForSpecialOp(
->Var(fetch_var_name);
var->GetMutable<phi::DenseTensor>();
auto value = op->result(0);
AddNewData(value,
fetch_var_name,
var,
...
@@ -59,6 +59,10 @@ const std::unordered_set<std::string> UnchangeOutputOps = {
"builtin.get_parameter",
"pd.shadow_output"};
const std::unordered_set<std::string> LegacyOpList = {
"pd.fused_softmax_mask_upper_triangle",
"pd.fused_softmax_mask_upper_triangle_grad"};
bool NeedFallBackCpu(const ir::Operation* op,
const std::string& kernel_fn_name,
const phi::KernelKey& kernel_key) {
@@ -401,7 +405,8 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
kernel_fn_str, kernel_key);
auto args_def = phi_kernel.args_def();
auto output_defs = args_def.output_defs();
if (!UnchangeOutputOps.count(op_item->name()) &&
!LegacyOpList.count(op_item->name())) {
PADDLE_ENFORCE_EQ(
op_item->num_results(),
output_defs.size(),
@@ -413,7 +418,7 @@ std::unique_ptr<ir::Program> PdOpLowerToKernelPass(ir::Program* prog,
for (size_t i = 0; i < op_item->num_results(); ++i) {
phi::Place out_place;
if ((!UnchangeOutputOps.count(op_item->name())) &&
(!LegacyOpList.count(op_item->name())) && phi_kernel.IsValid()) {
out_place = phi::TransToPhiPlace(output_defs[i].backend);
} else {
out_place = phi::TransToPhiPlace(kernel_key.backend());
...
@@ -7,3 +7,7 @@ endif()
cc_test_old(pattern_rewrite_test SRCS pattern_rewrite_test.cc DEPS
${PATTERN_REWRITE_TEST_DEPS})
set_tests_properties(
pattern_rewrite_test PROPERTIES ENVIRONMENT
"FLAGS_enable_new_ir_in_executor=true")