Unverified commit afa0e82c authored by Leo Chen, committed by GitHub

[new-exec] fit for mkldnn and inplace op (#40955)

* fit for mkldnn and inplace op

* fix compile

* refine ut

* register op version

* fix inplace op

* fix transfer_layout
Parent: de8962bd
......@@ -149,7 +149,8 @@ std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
// 2. Construct VariableNameMap
VariableNameMap in_name_map = {{"X", {var_name}}};
VariableNameMap out_name_map = {{"Out", {*new_var_name}}};
AttributeMap attr_map = {{"dst_layout", static_cast<int>(out_layout)}};
AttributeMap attr_map = {{"src_layout", static_cast<int>(in_layout)},
{"dst_layout", static_cast<int>(out_layout)}};
// 3. Create transfer_layout_op
std::string op_type("transfer_layout");
......@@ -157,8 +158,9 @@ std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
auto op = std::shared_ptr<OperatorBase>(
op_info.Creator()(op_type, in_name_map, out_name_map, attr_map));
VLOG(3) << string::Sprintf("Insert %s(%s) with %s -> %s(%s).", op_type,
var_name, in_layout, *new_var_name, out_layout);
VLOG(3) << string::Sprintf("Insert %s for variable %s(%s) -> %s(%s).",
op_type, var_name, in_layout, *new_var_name,
out_layout);
return op;
}
......@@ -242,6 +244,7 @@ std::shared_ptr<OperatorBase> TransferDevice(const std::string& var_name,
void ApplyDataTransform(const OpKernelType& expected_kernel_key,
const platform::Place& place,
VariableValueMap* ins_map_temp,
VariableValueMap* outs_map_temp,
VariableScope* var_scope, OpFuncNode* op_func_node,
std::vector<OpFuncNode>* new_op_func_nodes,
bool use_local_scope) {
......@@ -251,6 +254,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
"op_base in apply_data_transform."));
VariableNameMap new_ins(op_base->Inputs());
VariableNameMap new_outs(op_base->Outputs());
// record the indices of variables that need no data transform.
std::unordered_set<int> no_data_transform_index;
......@@ -258,7 +262,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
for (auto& var_name_item : *ins_map_temp) {
for (size_t i = 0; i < var_name_item.second.size(); ++i) {
auto var = var_name_item.second[i];
auto& var_name = new_ins[var_name_item.first].at(i);
auto var_name = new_ins[var_name_item.first].at(i);
const Tensor* tensor_in;
if (var->IsType<LoDTensor>() || var->IsType<phi::SelectedRows>()) {
tensor_in = GetLoDTensorOrSelectedRowsValueFromVar(*var);
......@@ -287,6 +291,28 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
var_scope->VarId(new_var_name);
var_name_item.second[i] = var_scope->Var(new_var_name);
new_ins[var_name_item.first][i] = new_var_name;
for (auto& pair : new_outs) {
for (size_t j = 0; j < pair.second.size(); ++j) {
VLOG(4) << pair.second[j] << " " << var_name;
if (pair.second[j] == var_name) {
VLOG(4) << "Found inplace between input(" << var_name_item.first
<< ") and output(" << pair.first
<< "), the variable name is " << var_name;
(*outs_map_temp)[pair.first][j] = var_scope->Var(new_var_name);
new_outs[pair.first][j] = new_var_name;
op_func_node
->inplace_back_map[var_scope->GetIdByName(new_var_name)] =
var_scope->GetIdByName(var_name);
op_func_node->output_index[pair.first][j] =
var_scope->VarId(new_var_name);
// NOTE(zhiqiu): an inplace op whose input has been transferred also
// changes the original output afterwards, so add the original
// output as well
op_func_node->output_index[pair.first].push_back(
var_scope->VarId(var_name));
}
}
}
// NOTE(Aurelius84): avoid deep-copying twice if a data transfer op has
// already been inserted.
if (op_base->Type() == "fetch_v2") {
......@@ -306,7 +332,7 @@ void ApplyDataTransform(const OpKernelType& expected_kernel_key,
// with instruction. (hot fix, it is not good design here)
op_func_node->operator_base_ =
std::shared_ptr<OperatorBase>(framework::OpRegistry::CreateOp(
op_base->Type(), new_ins, op_base->Outputs(), op_base->Attrs()));
op_base->Type(), new_ins, new_outs, op_base->Attrs()));
op_func_node->no_data_transform_index = std::move(no_data_transform_index);
}
......
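The loop added above is the heart of the inplace handling: when a data-transfer op renames an input variable, every output entry that carries the same name (an inplace op) is rewired to the new variable, and the pair of variable ids is recorded in inplace_back_map so the result can be shared back to the original variable after the kernel runs. A minimal sketch of that rewiring with simplified types (plain string maps instead of VariableNameMap/VariableScope; the helper name is illustrative, not part of the diff):

#include <map>
#include <string>
#include <vector>

using NameMap = std::map<std::string, std::vector<std::string>>;

// Rewire outputs that alias a renamed input and record new->old pairs so the
// transformed result can be shared back to the original variable afterwards.
std::map<std::string, std::string> RewireInplaceOutputs(
    const std::string& old_name, const std::string& new_name, NameMap* outs) {
  std::map<std::string, std::string> inplace_back_map;  // new_name -> old_name
  for (auto& pair : *outs) {
    for (auto& out_name : pair.second) {
      if (out_name == old_name) {
        out_name = new_name;                    // the kernel now writes here
        inplace_back_map[new_name] = old_name;  // remember where to copy back
      }
    }
  }
  return inplace_back_map;
}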
......@@ -54,6 +54,7 @@ class DataTranferHelper {
void ApplyDataTransform(const OpKernelType& expected_kernel_key,
const platform::Place& place,
VariableValueMap* ins_map_temp,
VariableValueMap* outs_map_temp,
VariableScope* var_scope, OpFuncNode* op_func_node,
std::vector<OpFuncNode>* op_func_nodes,
bool use_local_scope = true);
......
......@@ -457,6 +457,21 @@ void InterpreterCore::RunInstruction(const Instruction& instr_node) {
VLOG(4) << "End run " << place << " " << op->DebugStringEx(global_scope_);
if (!instr_node.InplaceBackMap().empty()) {
auto& m = instr_node.InplaceBackMap();
// NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
for (auto& p : m) {
auto* transformed_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
global_scope_->Var(p.first));
auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
global_scope_->Var(p.second));
original_tensor->ShareDataWith(*transformed_tensor);
VLOG(4) << "Transfer inplace variable back form "
<< global_scope_->GetNameById(p.first) << " to "
<< global_scope_->GetNameById(p.second);
}
}
/*For profiling/benchmark only*/
if (FLAGS_benchmark) {
instr_node.DeviceContext().Wait();
......
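The runtime counterpart mirrors TransferInplaceVarsBack() in operator.cc: after the kernel has run, each (transformed id, original id) pair in InplaceBackMap() is resolved to tensors and the original tensor is made to alias the transformed one via ShareDataWith. A small sketch of what that sharing means, using plain framework tensors (illustrative only, not part of the diff):

// After ShareDataWith, both tensors reference the same allocation, so the
// values the kernel wrote through the transformed variable are visible
// through the original variable without a copy.
framework::LoDTensor transformed, original;
transformed.Resize({4});
transformed.mutable_data<float>(platform::CPUPlace());
original.ShareDataWith(transformed);  // original now aliases transformed's buffer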
......@@ -138,7 +138,9 @@ get_unused_vars(const BlockDesc& block,
size_t op_idx = name_op_idx_pair.second;
result[ops[op_idx].get()].emplace_back(name);
VLOG(4) << ops[op_idx].get()->Type() << " " << name;
}
VLOG(4) << "gc map size:" << result.size();
return result;
}
......@@ -311,8 +313,8 @@ void build_op_func_list(const platform::Place& place,
operators::PrepareSafeEagerDeletionOnRecurrentOpAndRecurrentGradOp(
main_program, block.ID(), ops_unique);
std::vector<std::shared_ptr<OperatorBase>>
ops; // its elements will be moved to vec_func_list
// its elements will be moved to vec_func_list
std::vector<std::shared_ptr<OperatorBase>> ops;
for (auto& op_unique : ops_unique) {
ops.emplace_back(std::move(op_unique));
}
......@@ -348,34 +350,28 @@ void build_op_func_list(const platform::Place& place,
op_func_node.operator_base_ = ops[i];
op_func_node.input_index = ins_name2id;
op_func_node.output_index = outs_name2id;
VLOG(4) << "Start run " << place << " " << op->DebugStringEx(local_scope);
if (dynamic_cast<const framework::OperatorWithKernel*>(op) == nullptr) {
if (dynamic_cast<framework::OperatorWithKernel*>(op) == nullptr) {
// op is not an OperatorWithKernel, so directly run OperatorBase::Run()
deal_operator_base(place, var_scope, ops[i], &op_func_node, local_scope);
VLOG(4) << "End run " << place << " "
<< op_func_node.operator_base_->DebugStringEx(local_scope);
} else {
auto op_with_kernel =
static_cast<const framework::OperatorWithKernel*>(op);
auto op_with_kernel = const_cast<framework::OperatorWithKernel*>(
static_cast<const framework::OperatorWithKernel*>(op));
// construct RuntimeContext and analyze KernelType
RuntimeContext runtime_context({}, {});
runtime_context.inputs.swap(ins_map);
runtime_context.outputs.swap(outs_map);
// see OperatorWithKernel::RunImpl in operator.cc for why
if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
op->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
// TODO(Aurelius84): In case of control flow ops, they are NOT inherited
// from OperatorWithKernel.
op_with_kernel->Info().infer_shape_(&infer_shape_ctx);
}
platform::DeviceContextPool& pool =
platform::DeviceContextPool::Instance();
auto* dev_ctx = pool.Get(place);
Scope scope;
auto expected_kernel_key = op_with_kernel->GetExpectedKernelType(
ExecutionContext(*op, scope, *dev_ctx, runtime_context));
op_with_kernel->ResetKernelType(new OpKernelType(expected_kernel_key));
// change device by the device_guard()
apply_device_guard(op, place, &expected_kernel_key);
......@@ -383,13 +379,16 @@ void build_op_func_list(const platform::Place& place,
// step 3. apply data transforms and insert data transfer ops
VariableValueMap& ins_map_temp = runtime_context.inputs;
VariableValueMap& outs_map_temp = runtime_context.outputs;
// NOTE(zhiqiu): op_func_node->operator_base_ may be changed in
// ApplyDataTransform
ApplyDataTransform(expected_kernel_key, place, &ins_map_temp, var_scope,
&op_func_node, vec_func_list, use_local_scope);
op_with_kernel = static_cast<const framework::OperatorWithKernel*>(
op_func_node.operator_base_.get());
ApplyDataTransform(expected_kernel_key, place, &ins_map_temp,
&outs_map_temp, var_scope, &op_func_node,
vec_func_list, use_local_scope);
op_with_kernel = const_cast<framework::OperatorWithKernel*>(
static_cast<const framework::OperatorWithKernel*>(
op_func_node.operator_base_.get()));
// step 4. Run op kernel
VLOG(3) << op_with_kernel->Type()
......@@ -412,6 +411,16 @@ void build_op_func_list(const platform::Place& place,
auto exec_ctx =
ExecutionContext(*op_with_kernel, scope, *dev_ctx, runtime_context);
// see OperatorWithKernel::RunImpl in operator.cc for why
if (!(op->HasAttr(kAllKernelsMustComputeRuntimeShape) &&
op->Attr<bool>(kAllKernelsMustComputeRuntimeShape))) {
InterpretercoreInferShapeContext infer_shape_ctx(*op, runtime_context);
// TODO(Aurelius84): In case of control flow ops, they are NOT inherited
// from OperatorWithKernel.
op_with_kernel->Info().infer_shape_(&infer_shape_ctx);
}
auto run_phi_kernel = false;
if (phi::KernelFactory::Instance().HasCompatiblePhiKernel(
op_with_kernel->Type())) {
......@@ -476,9 +485,28 @@ void build_op_func_list(const platform::Place& place,
op_func_node, place, outputs_names, &runtime_context.outputs,
var_scope, vec_func_list, local_scope);
}
if (!op_func_node.inplace_back_map.empty()) {
auto& m = op_func_node.inplace_back_map;
// NOTE(zhiqiu): same logic as TransferInplaceVarsBack() in operator.cc
for (auto& p : m) {
auto* transformed_tensor =
GetMutableLoDTensorOrSelectedRowsValueFromVar(
var_scope->Var(p.first));
auto* original_tensor = GetMutableLoDTensorOrSelectedRowsValueFromVar(
var_scope->Var(p.second));
original_tensor->ShareDataWith(*transformed_tensor);
VLOG(4) << "Transfer inplace variable back form "
<< var_scope->GetNameById(p.first) << " to "
<< var_scope->GetNameById(p.second);
}
}
}
VLOG(4) << "End run " << place << " "
<< op_func_node.operator_base_->DebugStringEx(local_scope);
vec_func_list->emplace_back(op_func_node);
// gc---------------------------------------------------------------------------
auto iter = unused_var_map.find(op);
if (iter == unused_var_map.end()) {
......@@ -514,10 +542,7 @@ void build_op_func_list(const platform::Place& place,
framework::ToTypeName(var->Type()), var_name));
}
}
delete garbages; // free mem
VLOG(3) << "run " << op->Type() << " done.";
}
}
......
......@@ -692,6 +692,10 @@ phi::Kernel* Instruction::PhiKernel() const { return op_func_node_.pt_kernel_; }
OpFuncType Instruction::KernelType() const { return op_func_node_.type_; }
const std::map<int, int>& Instruction::InplaceBackMap() const {
return op_func_node_.inplace_back_map;
}
OperatorBase* Instruction::OpBase() const {
auto op_base = op_func_node_.operator_base_;
PADDLE_ENFORCE_NOT_NULL(op_base, platform::errors::PreconditionNotMet(
......
......@@ -297,6 +297,8 @@ struct OpFuncNode {
std::map<std::string, std::vector<int>> output_index;
std::unordered_set<int> no_data_transform_index;
std::map<int, int> inplace_back_map;
OpKernelComputeFunc kernel_func_;
platform::DeviceContext* dev_ctx_; // not owned
......@@ -325,6 +327,8 @@ class Instruction {
OpFuncType KernelType() const;
const std::map<int, int>& InplaceBackMap() const;
OperatorBase* OpBase() const;
NextInstruction& NextInstructions();
......
......@@ -664,6 +664,10 @@ class OperatorWithKernel : public OperatorBase {
const OpKernelType* kernel_type() const { return kernel_type_.get(); }
void ResetKernelType(OpKernelType* kernel_type) {
kernel_type_.reset(kernel_type);
}
private:
void RunImpl(const Scope& scope, const platform::Place& place) const final;
void RunImpl(const Scope& scope, const platform::Place& place,
......
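ResetKernelType is the other half of the caching added in build_op_func_list: the expected kernel type computed while building the op func list is stored on the operator, presumably so it does not have to be derived again later. Its use in this patch (copied from the interpretercore_util.cc hunk above, with the surrounding context abbreviated) looks like this:

// Compute the kernel type once per op while building the func list and cache
// it on the operator; the operator takes ownership of the heap-allocated copy.
auto expected_kernel_key = op_with_kernel->GetExpectedKernelType(
    ExecutionContext(*op, scope, *dev_ctx, runtime_context));
op_with_kernel->ResetKernelType(new OpKernelType(expected_kernel_key));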
......@@ -94,7 +94,8 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
"must smaller than or equal to 5. But received: the shape of input X "
"= [%s], the dimension of input X = [%d]",
x_dims, x_dims.size()));
VLOG(4) << ctx->IsRunMKLDNNKernel();
VLOG(4) << data_layout;
const int64_t C =
((ctx->IsRunMKLDNNKernel() == true) || (data_layout == DataLayout::kNCHW)
? x_dims[1]
......@@ -136,6 +137,7 @@ void BatchNormOp::InferShape(framework::InferShapeContext *ctx) const {
C, bias_dim[0]));
}
ctx->SetOutputDim("Y", x_dims);
VLOG(4) << x_dims;
ctx->SetOutputDim("MeanOut", {C});
ctx->SetOutputDim("VarianceOut", {C});
ctx->SetOutputDim("SavedMean", {C});
......
......@@ -203,14 +203,12 @@ class BatchNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto *y = ctx.Output<Tensor>("Y");
auto *batch_mean = ctx.Output<Tensor>("SavedMean");
auto *batch_variance = ctx.Output<Tensor>("SavedVariance");
BatchNormMKLDNNHandler<T> handler(ctx, mkldnn_engine, x, global_stats,
test_mode);
auto src_memory = handler.AcquireSrcMemory(x);
auto scaleshift_memory = handler.AcquireScaleShiftMemory(scale, shift);
auto dst_memory = handler.AcquireDstMemory(y);
auto batch_norm_p = handler.AcquireForwardPrimitive();
std::shared_ptr<memory> mean_memory;
......@@ -300,7 +298,6 @@ class BatchNormMKLDNNGradOpKernel : public paddle::framework::OpKernel<T> {
auto diff_src_memory = handler.AcquireDiffSrcMemory(diff_x);
auto diff_scaleshift_memory =
handler.AcquireDiffScaleShiftMemory(diff_scaleshift_data.data());
// finally create batch_norm backward primitive
auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive();
......
......@@ -16,6 +16,8 @@
#include <string>
#include "paddle/fluid/framework/op_version_registry.h"
namespace paddle {
namespace framework {
class OpDesc;
......@@ -95,8 +97,9 @@ class TransferLayoutKernel {
auto *x = ctx.InputVar("X");
auto *out = ctx.OutputVar("Out");
auto &dev_ctx = ctx.device_context();
auto src_layout = ctx.Attr<int>("src_layout");
auto dst_layout = ctx.Attr<int>("dst_layout");
TransferLayoutFunctor(x, out, dev_ctx, dst_layout)();
TransferLayoutFunctor(x, out, dev_ctx, src_layout, dst_layout)();
}
};
......@@ -105,6 +108,14 @@ class TransferLayoutOpProtoMaker : public framework::OpProtoAndCheckerMaker {
void Make() override {
AddInput("X", "(LoDTensor) The input Tensor");
AddOutput("Out", "(LoDTensor) The Output Tensor with desired layout");
// NOTE(zhiqiu): in most cases src_layout is not needed; the op can use the
// layout of input X. However, for some mkldnn kernels the src layout computed
// by GetKernelTypeForVar differs from the layout of tensor X.
AddAttr<int>("src_layout",
"kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3, default "
"-1 means unspecified and use the tensor's layout.")
.SetDefault(-1);
AddAttr<int>("dst_layout",
"kAnyLayout = 0, kNHWC = 1, kNCHW = 2, kMKLDNN = 3");
AddComment(R"DOC(
......@@ -126,3 +137,8 @@ REGISTER_OPERATOR(
// dtype is not important
REGISTER_OP_CPU_KERNEL_FUNCTOR(transfer_layout, float,
ops::TransferLayoutKernel);
REGISTER_OP_VERSION(transfer_layout)
.AddCheckpoint(
R"ROC(refine transfer_layout, add src_layout attribute)ROC",
paddle::framework::compatible::OpVersionDesc().NewAttr(
"src_layout", "(int, the layout of the input tensor", -1));
......@@ -39,8 +39,12 @@ class TransferLayoutFunctor {
public:
TransferLayoutFunctor(const framework::Variable *in, framework::Variable *out,
const platform::DeviceContext &dev_ctx,
const int dst_layout)
: in_(in), out_(out), dev_ctx_(dev_ctx), dst_layout_(dst_layout) {}
const int src_layout, const int dst_layout)
: in_(in),
out_(out),
dev_ctx_(dev_ctx),
src_layout_(src_layout),
dst_layout_(dst_layout) {}
void operator()() const {
auto &in_tensor = *framework::GetLoDTensorOrSelectedRowsValueFromVar(*in_);
......@@ -50,7 +54,8 @@ class TransferLayoutFunctor {
out_tensor.set_layout(out_layout);
#ifdef PADDLE_WITH_MKLDNN
auto in_layout = in_tensor.layout();
auto in_layout = static_cast<DataLayout>(src_layout_);
VLOG(4) << in_layout << "->" << out_layout << " " << in_tensor.layout();
if (in_layout == DataLayout::kMKLDNN || out_layout == DataLayout::kMKLDNN) {
PADDLE_ENFORCE_NE(
in_layout, out_layout,
......@@ -68,6 +73,7 @@ class TransferLayoutFunctor {
// For NHWC data we need reshape of tensors as MKL-DNN
// is expecting NHWC dims description order
if (in_layout == DataLayout::kNHWC) {
VLOG(4) << "kNHWC";
platform::MatchShapeToLayout(&out_tensor, in_layout, out_layout);
paddle::platform::MKLDNNDeviceContext::tls()
.set_cur_paddle_data_layout(in_layout);
......@@ -75,6 +81,7 @@ class TransferLayoutFunctor {
out_tensor.set_layout(DataLayout::kMKLDNN);
out_tensor.set_format(out_format);
} else {
VLOG(4) << "kNCHW";
// Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
// Do transform via MKLDNN lib
paddle::framework::innerTransDataLayoutFromMKLDNN(
......@@ -123,6 +130,7 @@ class TransferLayoutFunctor {
const framework::Variable *in_;
framework::Variable *out_;
const platform::DeviceContext &dev_ctx_;
const int src_layout_;
const int dst_layout_;
};
......
......@@ -531,6 +531,7 @@ Eigen::GpuDevice* CUDADeviceContext::eigen_device() const {
}
void CUDADeviceContext::Wait() const {
VLOG(4) << "CUDA context(" << this << ") Wait";
if (thread_ctx_.count(this)) {
context()->Stream()->Wait();
return;
......
......@@ -352,5 +352,23 @@ class TestException(unittest.TestCase):
self.fetch_vars.name))
class TestInplaceApiWithDataTransform(unittest.TestCase):
def test_increment(self):
if paddle.fluid.core.is_compiled_with_cuda():
with paddle.fluid.device_guard("gpu:0"):
x = paddle.fluid.layers.fill_constant([1], "float32", 0)
with paddle.fluid.device_guard("cpu"):
x = paddle.increment(x)
exe = paddle.static.Executor(paddle.CUDAPlace(0))
os.environ['FLAGS_USE_STANDALONE_EXECUTOR'] = '1'
for i in range(10):
a, = exe.run(paddle.static.default_main_program(),
fetch_list=[x])
self.assertEqual(a[0], 1)
del os.environ['FLAGS_USE_STANDALONE_EXECUTOR']
if __name__ == "__main__":
unittest.main()
......@@ -30,6 +30,7 @@ class TestTransferLayoutOpkNCHWTokNHWC(OpTest):
self.inputs = {'X': ipt.astype('float32')}
self.outputs = {'Out': ipt.transpose([0, 2, 3, 1])}
self.attrs = {
'src_layout': 0,
'dst_layout': 1 # kNHWC
}
self.op_type = 'transfer_layout'
......