Unverified commit 9aacb31b, authored by Zuza Gawrysiak, committed by GitHub

[PHI] Migrate conv_transpose kernel (#48119)

* Migrate conv_transpose to phi

* Move handler to kernel

* kernel m

* Fix formatting

* handler

* remove fluid

* revert tcp_store

* tcp_store

* remove unused

* Fix declaration

* add dnn input

* Fix typo
Co-authored-by: Sławomir Siwek <slawomir.siwek@intel.com>
Parent commit: ec778272
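For orientation, a condensed before/after sketch of the registration change this PR makes, taken from the final hunk of the diff below (the kernel body itself moves into the oneDNN conv_transpose kernel under phi/):

// Before (fluid operator kernel, removed in this PR):
REGISTER_OP_KERNEL(
    conv2d_transpose,
    MKLDNN,
    ::paddle::platform::CPUPlace,
    ops::ConvTransposeMKLDNNOpKernel<float, float>,
    ops::ConvTransposeMKLDNNOpKernel<paddle::platform::bfloat16, float>);

// After (PHI kernel registered for the OneDNN backend):
PD_REGISTER_KERNEL(conv2d_transpose,
                   OneDNN,
                   ONEDNN,
                   phi::Conv2dTransposeKernel,
                   float,
                   phi::dtype::bfloat16) {}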
......@@ -20,6 +20,7 @@
#include "paddle/fluid/framework/lod_tensor.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/fluid/platform/enforce.h"
#include "paddle/fluid/string/pretty_log.h"
namespace paddle {
namespace framework {
......@@ -315,6 +316,12 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
};
gpd(graph, handler);
AddStatis(found_conv_bias_count);
if ((!Has("disable_logs") || !Get<bool>("disable_logs")) &&
found_conv_bias_count > 0) {
string::PrettyLogDetail("--- fused %d %s with elementwise_add as bias",
found_conv_bias_count,
type());
}
}
} // namespace ir
} // namespace framework
......
......@@ -34,7 +34,7 @@ PD_DECLARE_KERNEL(gelu, CPU, ALL_LAYOUT);
USE_OP_ITSELF(batch_norm);
PD_DECLARE_KERNEL(batch_norm, OneDNN, ONEDNN);
USE_OP_ITSELF(conv2d_transpose);
USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
PD_DECLARE_KERNEL(conv2d_transpose, OneDNN, ONEDNN);
USE_OP_ITSELF(elementwise_add);
USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
USE_OP_ITSELF(gelu);
......
......@@ -222,6 +222,7 @@ class ExtraInfoUtils {
// TODO(chenweihang): move these extra inputs into op_compat.yaml
std::unordered_map<std::string, std::vector<std::string>>
g_extra_input_names_map_ = {{"conv2d", {"Bias", "ResidualData"}},
{"conv2d_transpose", {"Bias"}},
{"conv2d_grad", {"Bias"}}};
std::vector<std::string> empty_extra_input_names_;
};
......
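The new "Bias" entry in g_extra_input_names_map_ above is presumably what lets the migrated PHI kernel see this optional tensor, since it is not part of the op's compat signature. A minimal sketch of the lookup the kernel performs, condensed from the kernel diff further down:

// Retrieve the optional extra input registered above (nullptr if absent).
const phi::DenseTensor* bias =
    dev_ctx.HasDnnInput("Bias") ? dev_ctx.GetDnnInput("Bias") : nullptr;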
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/framework/op_registry.h"
#include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/conv_op.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using Tensor = phi::DenseTensor;
using phi::DataLayout;
using phi::funcs::OneDNNMemDesc;
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/phi/kernels/conv_transpose_kernel.h"
#include "paddle/fluid/platform/profiler/event_tracing.h"
#include "paddle/phi/backends/onednn/onednn_helper.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/core/expect.h"
#include "paddle/phi/core/kernel_registry.h"
#include "paddle/phi/kernels/cpu/conv_util.h"
#include "paddle/phi/kernels/funcs/data_layout_transform.h"
namespace phi {
inline dnnl::memory::dims GetWeightsTz(const phi::DenseTensor* filter,
const int groups) {
auto weights_tz = phi::vectorize(filter->dims());
int g = std::max(groups, 1);
int g_dim = (g > 1) ? 1 : 0;
phi::funcs::GetGroupConvWeightsTz(weights_tz, g);
funcs::GetGroupConvWeightsTz(weights_tz, g);
// gIOHW -> gOIHW || IOHW -> OIHW
std::swap(weights_tz[g_dim + 0], weights_tz[g_dim + 1]);
return weights_tz;
}
template <typename T, typename K, typename T_out>
class ConvTransposeMKLDNNHandlerT
: public phi::funcs::OneDNNHandlerNoCachingT<T,
dnnl::deconvolution_forward> {
class ConvTransposeOneDNNHandlerT
: public funcs::OneDNNHandlerNoCachingT<T, dnnl::deconvolution_forward> {
private:
const bool is_test_;
public:
ConvTransposeMKLDNNHandlerT(const framework::ExecutionContext& ctx,
const dnnl::engine mkldnn_engine,
const phi::DenseTensor* input,
const phi::DenseTensor* filter,
const phi::DenseTensor* bias,
phi::DenseTensor* output)
: phi::funcs::OneDNNHandlerNoCachingT<T, dnnl::deconvolution_forward>(
mkldnn_engine, ctx.GetPlace()),
is_test_(ctx.Attr<bool>("is_test")) {
ConvTransposeOneDNNHandlerT(const OneDNNContext& dev_ctx,
const DenseTensor* x,
const DenseTensor* filter,
const DenseTensor* bias,
const std::vector<int>& strides_in,
const std::vector<int>& paddings_in,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations_in,
DenseTensor* out)
: funcs::OneDNNHandlerNoCachingT<T, dnnl::deconvolution_forward>(
dev_ctx.GetEngine(), dev_ctx.GetPlace()),
is_test_(dev_ctx.HasDnnAttr("is_test")
? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test"))
: false) {
PADDLE_ENFORCE_EQ(is_test_,
true,
platform::errors::InvalidArgument(
"ConvTransposeMKLDNN works only for inference. "
phi::errors::InvalidArgument(
"ConvTransposeOneDNN works only for inference. "
"The attribute \'is_test\' value should be set to "
"True, but got is_test=False."));
PADDLE_ENFORCE_EQ(
input->layout(),
x->layout(),
DataLayout::ONEDNN,
platform::errors::InvalidArgument(
"Got wrong layout = %d for Input tensor.", input->layout()));
phi::errors::InvalidArgument("Got wrong layout = %d for Input tensor.",
x->layout()));
PADDLE_ENFORCE_EQ(
filter->layout(),
DataLayout::ONEDNN,
platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The filter tensor's layout should be %d, but got %d.",
DataLayout::ONEDNN,
filter->layout()));
PADDLE_ENFORCE_EQ(
input->dims().size(),
x->dims().size(),
4,
platform::errors::InvalidArgument("Input must be with 4 dimensions, "
phi::errors::InvalidArgument("Input must be with 4 dimensions, "
"i.e. NCHW. but got dimension =%d",
input->dims().size()));
x->dims().size()));
PADDLE_ENFORCE_EQ(
filter->dims().size(),
4,
platform::errors::InvalidArgument("Filter must be with 4 dimensions, "
phi::errors::InvalidArgument("Filter must be with 4 dimensions, "
"i.e. OIHW, but got dimension =%d",
filter->dims().size()));
......@@ -88,7 +95,7 @@ class ConvTransposeMKLDNNHandlerT
PADDLE_ENFORCE_EQ(
bias->layout(),
DataLayout::ONEDNN,
platform::errors::InvalidArgument(
phi::errors::InvalidArgument(
"The bias tensor's laytout should be %d, but got %d.",
DataLayout::ONEDNN,
bias->layout()));
......@@ -96,76 +103,71 @@ class ConvTransposeMKLDNNHandlerT
PADDLE_ENFORCE_EQ(
bias->dims().size(),
1,
platform::errors::InvalidArgument("Bias must only have 1 dimension, "
phi::errors::InvalidArgument("Bias must only have 1 dimension, "
"i.e. X, but got dimension = %d .",
bias->dims().size()));
}
std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
dnnl::memory::dims strides(begin(strides_temp), end(strides_temp));
std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
dnnl::memory::dims paddings(begin(paddings_temp), end(paddings_temp));
std::vector<int> dilations_temp = ctx.Attr<std::vector<int>>("dilations");
dnnl::memory::dims dilations(begin(dilations_temp), end(dilations_temp));
int groups = ctx.Attr<int>("groups");
std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
dnnl::memory::dims strides(begin(strides_in), end(strides_in));
dnnl::memory::dims paddings(begin(paddings_in), end(paddings_in));
dnnl::memory::dims dilations(begin(dilations_in), end(dilations_in));
PADDLE_ENFORCE_EQ(
strides.size(),
2,
platform::errors::Unimplemented(
phi::errors::Unimplemented(
"Now we only support 2d oneDNN convolution transpose op"));
const auto& input_dims = input->dims();
const auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size());
const auto& filter_dims = filter->dims();
const auto x_dims = x->dims();
const auto x_data_dims = phi::slice_ddim(x_dims, 2, x_dims.size());
const auto filter_dims = filter->dims();
const auto filter_data_dims =
phi::slice_ddim(filter_dims, 2, filter_dims.size());
const auto ksize = phi::vectorize(filter_data_dims);
UpdatePaddingAndDilation(
&paddings, &dilations, padding_algorithm, data_dims, strides, ksize);
&paddings, &dilations, padding_algorithm, x_data_dims, strides, ksize);
std::transform(
dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) {
return i - 1;
});
const auto src_tz = phi::vectorize(input->dims());
const auto src_tz = phi::vectorize(x->dims());
const auto weights_tz = GetWeightsTz(filter, groups);
const auto dst_tz = phi::vectorize(output->dims());
const auto mkldnn_paddings = phi::funcs::ToOneDNNPadding(paddings);
const auto dst_tz = phi::vectorize(out->dims());
const auto onednn_paddings = funcs::ToOneDNNPadding(paddings);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
const auto chosen_memory_format = OneDNNMemoryFormat::any;
auto chosen_memory_format = funcs::OneDNNMemoryFormat::any;
auto data_type = dnnl::memory::data_type::f32;
if (ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16" ||
std::is_same<T_out, platform::bfloat16>::value)
const bool is_BFLOAT16 =
dev_ctx.HasDnnAttr("mkldnn_data_type")
? PADDLE_GET_CONST(std::string,
dev_ctx.GetDnnAttr("mkldnn_data_type")) ==
"bfloat16"
: false;
if (is_BFLOAT16 || std::is_same<T_out, dtype::bfloat16>::value) {
data_type = dnnl::memory::data_type::bf16;
}
const auto src_md = OneDNNMemDesc(src_tz, data_type, chosen_memory_format);
const auto src_md =
funcs::OneDNNMemDesc(src_tz, data_type, chosen_memory_format);
const auto weights_md =
OneDNNMemDesc(weights_tz, data_type, chosen_memory_format);
const auto dst_md = OneDNNMemDesc(
dst_tz, phi::funcs::OneDNNGetDataType<T_out>(), chosen_memory_format);
funcs::OneDNNMemDesc(weights_tz, data_type, chosen_memory_format);
const auto dst_md = funcs::OneDNNMemDesc(
dst_tz, funcs::OneDNNGetDataType<T_out>(), chosen_memory_format);
const dnnl::primitive_attr conv_trans_attr = CreateConvAttrs(ctx);
auto fwd_prop_kind = is_test_ ? dnnl::prop_kind::forward_inference
: dnnl::prop_kind::forward_training;
if (bias) {
std::vector<int64_t> bias_tz = phi::vectorize(bias->dims());
const auto bias_md =
OneDNNMemDesc(bias_tz, data_type, OneDNNMemoryFormat::x);
const auto bias_md = funcs::OneDNNMemDesc(
bias_tz, data_type, funcs::OneDNNMemoryFormat::x);
this->AcquireForwardPrimitiveDescriptor(
conv_trans_attr,
fwd_prop_kind,
dnnl::algorithm::deconvolution_direct,
src_md,
......@@ -174,11 +176,10 @@ class ConvTransposeMKLDNNHandlerT
dst_md,
strides,
dilations,
mkldnn_paddings[0],
mkldnn_paddings[1]);
onednn_paddings[0],
onednn_paddings[1]);
} else {
this->AcquireForwardPrimitiveDescriptor(
conv_trans_attr,
fwd_prop_kind,
dnnl::algorithm::deconvolution_direct,
src_md,
......@@ -186,50 +187,22 @@ class ConvTransposeMKLDNNHandlerT
dst_md,
strides,
dilations,
mkldnn_paddings[0],
mkldnn_paddings[1]);
onednn_paddings[0],
onednn_paddings[1]);
}
}
dnnl::primitive_attr CreateConvAttrs(const framework::ExecutionContext& ctx) {
dnnl::primitive_attr conv_attr;
dnnl::post_ops post_operations;
const std::string fuse_activation =
ctx.Attr<std::string>("fuse_activation");
const float fuse_alpha = ctx.Attr<float>("fuse_alpha");
const float fuse_beta = ctx.Attr<float>("fuse_beta");
// Fusion with ReLU layer is executed through the PostOps feature. Create a
// PostOps object and configure it to execute an eltwise relu operation.
if (fuse_activation == "relu" || fuse_activation == "leaky_relu") {
constexpr float scale = 1.0f;
post_operations.append_eltwise(
scale, dnnl::algorithm::eltwise_relu, fuse_alpha, fuse_beta);
} else if (fuse_activation == "relu6") {
constexpr float scale = 1.0f;
post_operations.append_eltwise(
scale, dnnl::algorithm::eltwise_bounded_relu, fuse_alpha, fuse_beta);
} else if (fuse_activation == "swish") {
constexpr float scale = 1.0f;
post_operations.append_eltwise(
scale, dnnl::algorithm::eltwise_swish, fuse_alpha, fuse_beta);
}
conv_attr.set_post_ops(post_operations);
return conv_attr;
}
std::shared_ptr<dnnl::memory> AcquireSrcMemoryWithReorder(
const phi::DenseTensor* input) {
const T* input_data = input->data<T>();
return phi::funcs::OneDNNHandlerNoCachingT<T, dnnl::deconvolution_forward>::
AcquireMemoryWithReorder(input->mem_desc(),
const phi::DenseTensor* x) {
const T* input_data = x->data<T>();
return funcs::OneDNNHandlerNoCachingT<T, dnnl::deconvolution_forward>::
AcquireMemoryWithReorder(x->mem_desc(),
this->fwd_pd_->src_desc(),
phi::funcs::to_void_cast<T>(input_data));
funcs::to_void_cast<T>(input_data));
}
std::shared_ptr<dnnl::memory> AcquireWeightsMemoryWithReorder(
const platform::MKLDNNDeviceContext& dev_ctx,
const OneDNNContext& dev_ctx,
const std::string& key,
const phi::DenseTensor* filter,
const int& groups) {
......@@ -237,16 +210,17 @@ class ConvTransposeMKLDNNHandlerT
auto weights_tz = GetWeightsTz(filter, groups);
int g = std::max(groups, 1);
auto user_src_md = OneDNNMemDesc(
weights_tz,
phi::funcs::OneDNNGetDataType<K>(),
(g == 1) ? OneDNNMemoryFormat::iohw : OneDNNMemoryFormat::giohw);
auto user_src_md =
funcs::OneDNNMemDesc(weights_tz,
funcs::OneDNNGetDataType<K>(),
(g == 1) ? funcs::OneDNNMemoryFormat::iohw
: funcs::OneDNNMemoryFormat::giohw);
return this->template AcquireMemoryWithReorder<K>(
dev_ctx,
user_src_md,
this->fwd_pd_->weights_desc(),
phi::funcs::to_void_cast<K>(filter_data),
funcs::to_void_cast<K>(filter_data),
key,
"@weights_mem_p",
is_test_);
......@@ -254,7 +228,7 @@ class ConvTransposeMKLDNNHandlerT
template <typename F = T>
std::shared_ptr<dnnl::memory> AcquireMemoryWithReorder(
const platform::MKLDNNDeviceContext& dev_ctx,
const OneDNNContext& dev_ctx,
const dnnl::memory::desc& user_md,
const dnnl::memory::desc& target_md,
void* ptr,
......@@ -277,7 +251,7 @@ class ConvTransposeMKLDNNHandlerT
target_memory_p =
std::make_shared<dnnl::memory>(target_md, this->engine_);
dnnl::reorder::primitive_desc reorder_pdesc;
if (phi::funcs::is_int8<T>()) {
if (funcs::is_int8<T>()) {
dnnl::primitive_attr attr;
attr.set_output_scales(mask, scale_data);
reorder_pdesc = dnnl::reorder::primitive_desc(
......@@ -289,12 +263,12 @@ class ConvTransposeMKLDNNHandlerT
auto reorder_p = std::make_shared<dnnl::reorder>(reorder_pdesc);
dev_ctx.SetBlob(key_reorder_p, reorder_p);
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
platform::RecordEvent record_reorder(
auto& astream = OneDNNContext::tls().get_stream();
paddle::platform::RecordEvent record_reorder(
"int_reorder",
platform::TracerEventType::UserDefined,
paddle::platform::TracerEventType::UserDefined,
1,
platform::EventRole::kUniqueOp);
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
......@@ -305,7 +279,7 @@ class ConvTransposeMKLDNNHandlerT
dev_ctx.SetBlob(user_key, user_memory_p);
dev_ctx.SetBlob(target_key, target_memory_p);
} else if (!is_persistent) {
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
auto& astream = OneDNNContext::tls().get_stream();
auto user_memory_p =
std::static_pointer_cast<dnnl::memory>(dev_ctx.GetBlob(user_key));
......@@ -316,11 +290,11 @@ class ConvTransposeMKLDNNHandlerT
auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
dev_ctx.GetBlob(key_reorder_p));
if (reorder_p != nullptr) {
platform::RecordEvent record_reorder(
paddle::platform::RecordEvent record_reorder(
"int_reorder",
platform::TracerEventType::UserDefined,
paddle::platform::TracerEventType::UserDefined,
1,
platform::EventRole::kUniqueOp);
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
......@@ -331,73 +305,60 @@ class ConvTransposeMKLDNNHandlerT
}
std::shared_ptr<dnnl::memory> AcquireBiasMemoryWithReorder(
const platform::MKLDNNDeviceContext& dev_ctx,
const OneDNNContext& dev_ctx,
const std::string& key,
const phi::DenseTensor* bias) {
const K* bias_data = bias->data<K>();
auto user_bias_md = OneDNNMemDesc(phi::vectorize(bias->dims()),
phi::funcs::OneDNNGetDataType<K>(),
OneDNNMemoryFormat::x);
return this->AcquireMemoryWithReorder(
dev_ctx,
auto user_bias_md = funcs::OneDNNMemDesc(phi::vectorize(bias->dims()),
funcs::OneDNNGetDataType<K>(),
funcs::OneDNNMemoryFormat::x);
return this->AcquireMemoryWithReorder(dev_ctx,
user_bias_md,
this->fwd_pd_->bias_desc(),
phi::funcs::to_void_cast<K>(bias_data),
funcs::to_void_cast<K>(bias_data),
key,
"@bias_mem_p",
is_test_);
}
private:
const bool is_test_;
};
template <typename T, typename K>
class ConvTransposeMKLDNNOpKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()),
true,
platform::errors::PreconditionNotMet(
"Operator DNNL ConvTranspose must use CPUPlace"));
const bool is_bfloat16 =
ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16";
const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
if (is_bfloat16) {
if (force_fp32_output)
Execute<float>(ctx);
else
Execute<platform::bfloat16>(ctx);
} else {
Execute<float>(ctx);
}
}
template <typename T, typename T_out>
void Execute(const OneDNNContext& dev_ctx,
const DenseTensor* x,
const DenseTensor* filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
DenseTensor* out) {
const auto* bias =
dev_ctx.HasDnnInput("Bias") ? dev_ctx.GetDnnInput("Bias") : nullptr;
template <typename T_out>
void Execute(const framework::ExecutionContext& ctx) const {
auto& dev_ctx =
ctx.template device_context<platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
ConvTransposeOneDNNHandlerT<T, float, T_out> handler(dev_ctx,
x,
filter,
bias,
strides,
paddings,
padding_algorithm,
groups,
dilations,
out);
const auto* input = ctx.Input<phi::DenseTensor>("Input");
const auto* filter = ctx.Input<phi::DenseTensor>("Filter");
const auto* bias =
ctx.HasInput("Bias") ? ctx.Input<phi::DenseTensor>("Bias") : nullptr;
auto* output = ctx.Output<phi::DenseTensor>("Output");
ConvTransposeMKLDNNHandlerT<T, K, T_out> handler(
ctx, mkldnn_engine, input, filter, bias, output);
auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input);
auto src_memory_p = handler.AcquireSrcMemoryWithReorder(x);
// Caching Key for weights is needed
std::string key = platform::CreateKey(dev_ctx,
ctx.InputName("Input"),
ctx.InputName("Filter"),
(bias ? ctx.InputName("Bias") : ""));
key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder(
dev_ctx, key, filter, ctx.Attr<int>("groups"));
std::string key =
funcs::CreateKey(dev_ctx,
dev_ctx.GetInputsName("Input")[0],
dev_ctx.GetInputsName("Filter")[0],
(bias ? dev_ctx.GetInputsName("Bias")[0] : ""));
key = funcs::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
auto weights_memory_p =
handler.AcquireWeightsMemoryWithReorder(dev_ctx, key, filter, groups);
std::shared_ptr<dnnl::memory> dst_memory_p =
handler.template AcquireDstMemory<T_out>(output);
handler.template AcquireDstMemory<T_out>(out);
auto conv_p = handler.AcquireForwardPrimitive();
std::unordered_map<int, dnnl::memory> args = {
......@@ -410,21 +371,70 @@ class ConvTransposeMKLDNNOpKernel : public framework::OpKernel<T> {
handler.AcquireBiasMemoryWithReorder(dev_ctx, key, bias);
args.insert({DNNL_ARG_BIAS, *bias_memory_p});
}
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
auto& astream = OneDNNContext::tls().get_stream();
conv_p->execute(astream, args);
astream.wait();
output->set_mem_desc(dst_memory_p->get_desc());
}
};
out->set_mem_desc(dst_memory_p->get_desc());
}
} // namespace operators
} // namespace paddle
template <typename T, typename Context>
void Conv2dTransposeKernel(const Context& dev_ctx,
const DenseTensor& x,
const DenseTensor& filter,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const std::vector<int>& output_padding,
const IntArray& output_size,
const std::string& padding_algorithm,
int groups,
const std::vector<int>& dilations,
const std::string& data_format,
DenseTensor* out) {
PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType(),
AllocationType::CPU,
phi::errors::PreconditionNotMet(
"Operator oneDNN Conv must use CPUPlace"));
const bool is_BFLOAT16 =
dev_ctx.HasDnnAttr("mkldnn_data_type")
? PADDLE_GET_CONST(std::string,
dev_ctx.GetDnnAttr("mkldnn_data_type")) ==
"bfloat16"
: false;
const bool force_fp32_output =
dev_ctx.HasDnnAttr("force_fp32_output")
? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output"))
: false;
const bool use_bfloat16 = (!force_fp32_output && is_BFLOAT16);
if (use_bfloat16) {
Execute<T, dtype::bfloat16>(dev_ctx,
&x,
&filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
out);
} else {
Execute<T, float>(dev_ctx,
&x,
&filter,
strides,
paddings,
padding_algorithm,
groups,
dilations,
out);
}
}
namespace ops = paddle::operators;
} // namespace phi
REGISTER_OP_KERNEL(
conv2d_transpose,
MKLDNN,
::paddle::platform::CPUPlace,
ops::ConvTransposeMKLDNNOpKernel<float, float>,
ops::ConvTransposeMKLDNNOpKernel<paddle::platform::bfloat16, float>);
PD_REGISTER_KERNEL(conv2d_transpose,
OneDNN,
ONEDNN,
phi::Conv2dTransposeKernel,
float,
phi::dtype::bfloat16) {}