Unverified commit 9aacb31b authored by Zuza Gawrysiak, committed by GitHub

[PHI] Migrate conv_transpose kernel (#48119)

* Migrate conv_transpose to phi

* Move handler to kernel

* kernel m

* Fix formatting

* handler

* remove fluid

* revert tcp_store

* tcp_store

* remove unused

* Fix declaration

* add dnn input

* Fix typo
Co-authored-by: Sławomir Siwek <slawomir.siwek@intel.com>
Parent ec778272
@@ -20,6 +20,7 @@
 #include "paddle/fluid/framework/lod_tensor.h"
 #include "paddle/fluid/framework/op_version_registry.h"
 #include "paddle/fluid/platform/enforce.h"
+#include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
 namespace framework {
@@ -315,6 +316,12 @@ void ConvBiasFusePass::ApplyImpl(ir::Graph* graph) const {
   };
   gpd(graph, handler);
   AddStatis(found_conv_bias_count);
+  if ((!Has("disable_logs") || !Get<bool>("disable_logs")) &&
+      found_conv_bias_count > 0) {
+    string::PrettyLogDetail("--- fused %d %s with elementwise_add as bias",
+                            found_conv_bias_count,
+                            type());
+  }
 }
 
 }  // namespace ir
 }  // namespace framework
......
@@ -34,7 +34,7 @@ PD_DECLARE_KERNEL(gelu, CPU, ALL_LAYOUT);
 USE_OP_ITSELF(batch_norm);
 PD_DECLARE_KERNEL(batch_norm, OneDNN, ONEDNN);
 USE_OP_ITSELF(conv2d_transpose);
-USE_OP_DEVICE_KERNEL(conv2d_transpose, MKLDNN);
+PD_DECLARE_KERNEL(conv2d_transpose, OneDNN, ONEDNN);
 USE_OP_ITSELF(elementwise_add);
 USE_OP_DEVICE_KERNEL(elementwise_add, MKLDNN);
 USE_OP_ITSELF(gelu);
......
@@ -222,6 +222,7 @@ class ExtraInfoUtils {
   // TODO(chenweihang): move these extra inputs into op_compat.yaml
   std::unordered_map<std::string, std::vector<std::string>>
       g_extra_input_names_map_ = {{"conv2d", {"Bias", "ResidualData"}},
+                                  {"conv2d_transpose", {"Bias"}},
                                   {"conv2d_grad", {"Bias"}}};
   std::vector<std::string> empty_extra_input_names_;
 };
......
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-
-    http://www.apache.org/licenses/LICENSE-2.0
-
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
 
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/framework/op_registry.h"
-#include "paddle/fluid/memory/malloc.h"
-#include "paddle/fluid/operators/conv_op.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"
+#include "paddle/phi/kernels/conv_transpose_kernel.h"
+
+#include "paddle/fluid/platform/profiler/event_tracing.h"
+#include "paddle/phi/backends/onednn/onednn_helper.h"
+#include "paddle/phi/backends/onednn/onednn_reuse.h"
+#include "paddle/phi/core/expect.h"
+#include "paddle/phi/core/kernel_registry.h"
+#include "paddle/phi/kernels/cpu/conv_util.h"
+#include "paddle/phi/kernels/funcs/data_layout_transform.h"
 
-namespace paddle {
-namespace operators {
-
-using Tensor = phi::DenseTensor;
-using phi::DataLayout;
-using phi::funcs::OneDNNMemDesc;
+namespace phi {
 
 inline dnnl::memory::dims GetWeightsTz(const phi::DenseTensor* filter,
                                        const int groups) {
   auto weights_tz = phi::vectorize(filter->dims());
   int g = std::max(groups, 1);
   int g_dim = (g > 1) ? 1 : 0;
-  phi::funcs::GetGroupConvWeightsTz(weights_tz, g);
+  funcs::GetGroupConvWeightsTz(weights_tz, g);
   // gIOHW -> gOIHW || IOHW -> OIHW
   std::swap(weights_tz[g_dim + 0], weights_tz[g_dim + 1]);
   return weights_tz;
 }
 
 template <typename T, typename K, typename T_out>
-class ConvTransposeMKLDNNHandlerT
-    : public phi::funcs::OneDNNHandlerNoCachingT<T,
-                                                 dnnl::deconvolution_forward> {
+class ConvTransposeOneDNNHandlerT
+    : public funcs::OneDNNHandlerNoCachingT<T, dnnl::deconvolution_forward> {
+ private:
+  const bool is_test_;
+
  public:
-  ConvTransposeMKLDNNHandlerT(const framework::ExecutionContext& ctx,
-                              const dnnl::engine mkldnn_engine,
-                              const phi::DenseTensor* input,
-                              const phi::DenseTensor* filter,
-                              const phi::DenseTensor* bias,
-                              phi::DenseTensor* output)
-      : phi::funcs::OneDNNHandlerNoCachingT<T, dnnl::deconvolution_forward>(
-            mkldnn_engine, ctx.GetPlace()),
-        is_test_(ctx.Attr<bool>("is_test")) {
+  ConvTransposeOneDNNHandlerT(const OneDNNContext& dev_ctx,
+                              const DenseTensor* x,
+                              const DenseTensor* filter,
+                              const DenseTensor* bias,
+                              const std::vector<int>& strides_in,
+                              const std::vector<int>& paddings_in,
+                              const std::string& padding_algorithm,
+                              int groups,
+                              const std::vector<int>& dilations_in,
+                              DenseTensor* out)
+      : funcs::OneDNNHandlerNoCachingT<T, dnnl::deconvolution_forward>(
+            dev_ctx.GetEngine(), dev_ctx.GetPlace()),
+        is_test_(dev_ctx.HasDnnAttr("is_test")
+                     ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("is_test"))
+                     : false) {
     PADDLE_ENFORCE_EQ(is_test_,
                       true,
-                      platform::errors::InvalidArgument(
-                          "ConvTransposeMKLDNN works only for inference. "
+                      phi::errors::InvalidArgument(
+                          "ConvTransposeOneDNN works only for inference. "
                           "The attribute \'is_test\' value should be set to "
                           "True, but got is_test=False."));
 
     PADDLE_ENFORCE_EQ(
-        input->layout(),
+        x->layout(),
         DataLayout::ONEDNN,
-        platform::errors::InvalidArgument(
-            "Got wrong layout = %d for Input tensor.", input->layout()));
+        phi::errors::InvalidArgument("Got wrong layout = %d for Input tensor.",
+                                     x->layout()));
 
     PADDLE_ENFORCE_EQ(
         filter->layout(),
         DataLayout::ONEDNN,
-        platform::errors::InvalidArgument(
+        phi::errors::InvalidArgument(
            "The filter tensor's layout should be %d, but got %d.",
            DataLayout::ONEDNN,
            filter->layout()));
 
     PADDLE_ENFORCE_EQ(
-        input->dims().size(),
+        x->dims().size(),
         4,
-        platform::errors::InvalidArgument("Input must be with 4 dimensions, "
-                                          "i.e. NCHW. but got dimension =%d",
-                                          input->dims().size()));
+        phi::errors::InvalidArgument("Input must be with 4 dimensions, "
+                                     "i.e. NCHW. but got dimension =%d",
+                                     x->dims().size()));
     PADDLE_ENFORCE_EQ(
         filter->dims().size(),
         4,
-        platform::errors::InvalidArgument("Filter must be with 4 dimensions, "
-                                          "i.e. OIHW, but got dimension =%d",
-                                          filter->dims().size()));
+        phi::errors::InvalidArgument("Filter must be with 4 dimensions, "
+                                     "i.e. OIHW, but got dimension =%d",
+                                     filter->dims().size()));
 
     if (bias) {
       PADDLE_ENFORCE_EQ(
           bias->layout(),
           DataLayout::ONEDNN,
-          platform::errors::InvalidArgument(
+          phi::errors::InvalidArgument(
              "The bias tensor's laytout should be %d, but got %d.",
              DataLayout::ONEDNN,
              bias->layout()));
@@ -96,76 +103,71 @@ class ConvTransposeMKLDNNHandlerT
       PADDLE_ENFORCE_EQ(
           bias->dims().size(),
           1,
-          platform::errors::InvalidArgument("Bias must only have 1 dimension, "
-                                            "i.e. X, but got dimension = %d .",
-                                            bias->dims().size()));
+          phi::errors::InvalidArgument("Bias must only have 1 dimension, "
+                                       "i.e. X, but got dimension = %d .",
+                                       bias->dims().size()));
     }
 
-    std::vector<int> strides_temp = ctx.Attr<std::vector<int>>("strides");
-    dnnl::memory::dims strides(begin(strides_temp), end(strides_temp));
-
-    std::vector<int> paddings_temp = ctx.Attr<std::vector<int>>("paddings");
-    dnnl::memory::dims paddings(begin(paddings_temp), end(paddings_temp));
-
-    std::vector<int> dilations_temp = ctx.Attr<std::vector<int>>("dilations");
-    dnnl::memory::dims dilations(begin(dilations_temp), end(dilations_temp));
-
-    int groups = ctx.Attr<int>("groups");
-    std::string padding_algorithm = ctx.Attr<std::string>("padding_algorithm");
+    dnnl::memory::dims strides(begin(strides_in), end(strides_in));
+    dnnl::memory::dims paddings(begin(paddings_in), end(paddings_in));
+    dnnl::memory::dims dilations(begin(dilations_in), end(dilations_in));
 
     PADDLE_ENFORCE_EQ(
         strides.size(),
         2,
-        platform::errors::Unimplemented(
+        phi::errors::Unimplemented(
             "Now we only support 2d oneDNN convolution transpose op"));
 
-    const auto& input_dims = input->dims();
-    const auto data_dims = phi::slice_ddim(input_dims, 2, input_dims.size());
-    const auto& filter_dims = filter->dims();
+    const auto x_dims = x->dims();
+    const auto x_data_dims = phi::slice_ddim(x_dims, 2, x_dims.size());
+    const auto filter_dims = filter->dims();
     const auto filter_data_dims =
         phi::slice_ddim(filter_dims, 2, filter_dims.size());
 
     const auto ksize = phi::vectorize(filter_data_dims);
 
     UpdatePaddingAndDilation(
-        &paddings, &dilations, padding_algorithm, data_dims, strides, ksize);
+        &paddings, &dilations, padding_algorithm, x_data_dims, strides, ksize);
 
     std::transform(
         dilations.begin(), dilations.end(), dilations.begin(), [](int64_t i) {
          return i - 1;
        });
 
-    const auto src_tz = phi::vectorize(input->dims());
+    const auto src_tz = phi::vectorize(x->dims());
     const auto weights_tz = GetWeightsTz(filter, groups);
-    const auto dst_tz = phi::vectorize(output->dims());
-    const auto mkldnn_paddings = phi::funcs::ToOneDNNPadding(paddings);
+    const auto dst_tz = phi::vectorize(out->dims());
+    const auto onednn_paddings = funcs::ToOneDNNPadding(paddings);
 
     /* create memory descriptor for convolution without specified format
      * ('any') which lets a primitive (convolution in this case) choose
      * the memory format preferred for best performance
      */
-    const auto chosen_memory_format = OneDNNMemoryFormat::any;
+    auto chosen_memory_format = funcs::OneDNNMemoryFormat::any;
     auto data_type = dnnl::memory::data_type::f32;
-    if (ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16" ||
-        std::is_same<T_out, platform::bfloat16>::value)
+    const bool is_BFLOAT16 =
+        dev_ctx.HasDnnAttr("mkldnn_data_type")
+            ? PADDLE_GET_CONST(std::string,
+                               dev_ctx.GetDnnAttr("mkldnn_data_type")) ==
+                  "bfloat16"
+            : false;
+    if (is_BFLOAT16 || std::is_same<T_out, dtype::bfloat16>::value) {
       data_type = dnnl::memory::data_type::bf16;
+    }
 
-    const auto src_md = OneDNNMemDesc(src_tz, data_type, chosen_memory_format);
+    const auto src_md =
+        funcs::OneDNNMemDesc(src_tz, data_type, chosen_memory_format);
     const auto weights_md =
-        OneDNNMemDesc(weights_tz, data_type, chosen_memory_format);
-    const auto dst_md = OneDNNMemDesc(
-        dst_tz, phi::funcs::OneDNNGetDataType<T_out>(), chosen_memory_format);
-
-    const dnnl::primitive_attr conv_trans_attr = CreateConvAttrs(ctx);
+        funcs::OneDNNMemDesc(weights_tz, data_type, chosen_memory_format);
+    const auto dst_md = funcs::OneDNNMemDesc(
+        dst_tz, funcs::OneDNNGetDataType<T_out>(), chosen_memory_format);
 
     auto fwd_prop_kind = is_test_ ? dnnl::prop_kind::forward_inference
                                   : dnnl::prop_kind::forward_training;
 
     if (bias) {
       std::vector<int64_t> bias_tz = phi::vectorize(bias->dims());
-      const auto bias_md =
-          OneDNNMemDesc(bias_tz, data_type, OneDNNMemoryFormat::x);
+      const auto bias_md = funcs::OneDNNMemDesc(
+          bias_tz, data_type, funcs::OneDNNMemoryFormat::x);
       this->AcquireForwardPrimitiveDescriptor(
-          conv_trans_attr,
           fwd_prop_kind,
           dnnl::algorithm::deconvolution_direct,
           src_md,
@@ -174,11 +176,10 @@ class ConvTransposeMKLDNNHandlerT
           dst_md,
           strides,
           dilations,
-          mkldnn_paddings[0],
-          mkldnn_paddings[1]);
+          onednn_paddings[0],
+          onednn_paddings[1]);
     } else {
       this->AcquireForwardPrimitiveDescriptor(
-          conv_trans_attr,
           fwd_prop_kind,
           dnnl::algorithm::deconvolution_direct,
           src_md,
@@ -186,50 +187,22 @@ class ConvTransposeMKLDNNHandlerT
           dst_md,
           strides,
           dilations,
-          mkldnn_paddings[0],
-          mkldnn_paddings[1]);
+          onednn_paddings[0],
+          onednn_paddings[1]);
     }
   }
 
-  dnnl::primitive_attr CreateConvAttrs(const framework::ExecutionContext& ctx) {
-    dnnl::primitive_attr conv_attr;
-    dnnl::post_ops post_operations;
-
-    const std::string fuse_activation =
-        ctx.Attr<std::string>("fuse_activation");
-    const float fuse_alpha = ctx.Attr<float>("fuse_alpha");
-    const float fuse_beta = ctx.Attr<float>("fuse_beta");
-
-    // Fusion with ReLU layer is executed through the PostOps feature. Create a
-    // PostOps object and configure it to execute an eltwise relu operation.
-    if (fuse_activation == "relu" || fuse_activation == "leaky_relu") {
-      constexpr float scale = 1.0f;
-      post_operations.append_eltwise(
-          scale, dnnl::algorithm::eltwise_relu, fuse_alpha, fuse_beta);
-    } else if (fuse_activation == "relu6") {
-      constexpr float scale = 1.0f;
-      post_operations.append_eltwise(
-          scale, dnnl::algorithm::eltwise_bounded_relu, fuse_alpha, fuse_beta);
-    } else if (fuse_activation == "swish") {
-      constexpr float scale = 1.0f;
-      post_operations.append_eltwise(
-          scale, dnnl::algorithm::eltwise_swish, fuse_alpha, fuse_beta);
-    }
-    conv_attr.set_post_ops(post_operations);
-    return conv_attr;
-  }
-
   std::shared_ptr<dnnl::memory> AcquireSrcMemoryWithReorder(
-      const phi::DenseTensor* input) {
-    const T* input_data = input->data<T>();
-    return phi::funcs::OneDNNHandlerNoCachingT<T, dnnl::deconvolution_forward>::
-        AcquireMemoryWithReorder(input->mem_desc(),
+      const phi::DenseTensor* x) {
+    const T* input_data = x->data<T>();
+    return funcs::OneDNNHandlerNoCachingT<T, dnnl::deconvolution_forward>::
+        AcquireMemoryWithReorder(x->mem_desc(),
                                  this->fwd_pd_->src_desc(),
-                                 phi::funcs::to_void_cast<T>(input_data));
+                                 funcs::to_void_cast<T>(input_data));
   }
 
   std::shared_ptr<dnnl::memory> AcquireWeightsMemoryWithReorder(
-      const platform::MKLDNNDeviceContext& dev_ctx,
+      const OneDNNContext& dev_ctx,
       const std::string& key,
       const phi::DenseTensor* filter,
       const int& groups) {
@@ -237,16 +210,17 @@ class ConvTransposeMKLDNNHandlerT
     auto weights_tz = GetWeightsTz(filter, groups);
     int g = std::max(groups, 1);
 
-    auto user_src_md = OneDNNMemDesc(
-        weights_tz,
-        phi::funcs::OneDNNGetDataType<K>(),
-        (g == 1) ? OneDNNMemoryFormat::iohw : OneDNNMemoryFormat::giohw);
+    auto user_src_md =
+        funcs::OneDNNMemDesc(weights_tz,
+                             funcs::OneDNNGetDataType<K>(),
+                             (g == 1) ? funcs::OneDNNMemoryFormat::iohw
+                                      : funcs::OneDNNMemoryFormat::giohw);
 
     return this->template AcquireMemoryWithReorder<K>(
         dev_ctx,
         user_src_md,
         this->fwd_pd_->weights_desc(),
-        phi::funcs::to_void_cast<K>(filter_data),
+        funcs::to_void_cast<K>(filter_data),
        key,
        "@weights_mem_p",
        is_test_);
@@ -254,7 +228,7 @@ class ConvTransposeMKLDNNHandlerT
   template <typename F = T>
   std::shared_ptr<dnnl::memory> AcquireMemoryWithReorder(
-      const platform::MKLDNNDeviceContext& dev_ctx,
+      const OneDNNContext& dev_ctx,
       const dnnl::memory::desc& user_md,
       const dnnl::memory::desc& target_md,
       void* ptr,
@@ -277,7 +251,7 @@ class ConvTransposeMKLDNNHandlerT
       target_memory_p =
           std::make_shared<dnnl::memory>(target_md, this->engine_);
       dnnl::reorder::primitive_desc reorder_pdesc;
-      if (phi::funcs::is_int8<T>()) {
+      if (funcs::is_int8<T>()) {
         dnnl::primitive_attr attr;
         attr.set_output_scales(mask, scale_data);
         reorder_pdesc = dnnl::reorder::primitive_desc(
@@ -289,12 +263,12 @@ class ConvTransposeMKLDNNHandlerT
       auto reorder_p = std::make_shared<dnnl::reorder>(reorder_pdesc);
       dev_ctx.SetBlob(key_reorder_p, reorder_p);
 
-      auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
-      platform::RecordEvent record_reorder(
+      auto& astream = OneDNNContext::tls().get_stream();
+      paddle::platform::RecordEvent record_reorder(
           "int_reorder",
-          platform::TracerEventType::UserDefined,
+          paddle::platform::TracerEventType::UserDefined,
           1,
-          platform::EventRole::kUniqueOp);
+          paddle::platform::EventRole::kUniqueOp);
       reorder_p->execute(
           astream,
           {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
@@ -305,7 +279,7 @@ class ConvTransposeMKLDNNHandlerT
       dev_ctx.SetBlob(user_key, user_memory_p);
       dev_ctx.SetBlob(target_key, target_memory_p);
     } else if (!is_persistent) {
-      auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
+      auto& astream = OneDNNContext::tls().get_stream();
 
       auto user_memory_p =
           std::static_pointer_cast<dnnl::memory>(dev_ctx.GetBlob(user_key));
@@ -316,11 +290,11 @@ class ConvTransposeMKLDNNHandlerT
       auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
           dev_ctx.GetBlob(key_reorder_p));
       if (reorder_p != nullptr) {
-        platform::RecordEvent record_reorder(
+        paddle::platform::RecordEvent record_reorder(
             "int_reorder",
-            platform::TracerEventType::UserDefined,
+            paddle::platform::TracerEventType::UserDefined,
             1,
-            platform::EventRole::kUniqueOp);
+            paddle::platform::EventRole::kUniqueOp);
         reorder_p->execute(
             astream,
             {{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
@@ -331,100 +305,136 @@ class ConvTransposeMKLDNNHandlerT
   }
 
   std::shared_ptr<dnnl::memory> AcquireBiasMemoryWithReorder(
-      const platform::MKLDNNDeviceContext& dev_ctx,
+      const OneDNNContext& dev_ctx,
       const std::string& key,
       const phi::DenseTensor* bias) {
     const K* bias_data = bias->data<K>();
-    auto user_bias_md = OneDNNMemDesc(phi::vectorize(bias->dims()),
-                                      phi::funcs::OneDNNGetDataType<K>(),
-                                      OneDNNMemoryFormat::x);
-    return this->AcquireMemoryWithReorder(
-        dev_ctx,
-        user_bias_md,
-        this->fwd_pd_->bias_desc(),
-        phi::funcs::to_void_cast<K>(bias_data),
-        key,
-        "@bias_mem_p",
-        is_test_);
+    auto user_bias_md = funcs::OneDNNMemDesc(phi::vectorize(bias->dims()),
+                                             funcs::OneDNNGetDataType<K>(),
+                                             funcs::OneDNNMemoryFormat::x);
+    return this->AcquireMemoryWithReorder(dev_ctx,
+                                          user_bias_md,
+                                          this->fwd_pd_->bias_desc(),
+                                          funcs::to_void_cast<K>(bias_data),
+                                          key,
+                                          "@bias_mem_p",
+                                          is_test_);
   }
-
- private:
-  const bool is_test_;
 };
 
-template <typename T, typename K>
-class ConvTransposeMKLDNNOpKernel : public framework::OpKernel<T> {
- public:
-  void Compute(const framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE_EQ(platform::is_cpu_place(ctx.GetPlace()),
-                      true,
-                      platform::errors::PreconditionNotMet(
-                          "Operator DNNL ConvTranspose must use CPUPlace"));
-    const bool is_bfloat16 =
-        ctx.Attr<std::string>("mkldnn_data_type") == "bfloat16";
-    const bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
-    if (is_bfloat16) {
-      if (force_fp32_output)
-        Execute<float>(ctx);
-      else
-        Execute<platform::bfloat16>(ctx);
-    } else {
-      Execute<float>(ctx);
-    }
-  }
-
-  template <typename T_out>
-  void Execute(const framework::ExecutionContext& ctx) const {
-    auto& dev_ctx =
-        ctx.template device_context<platform::MKLDNNDeviceContext>();
-    const auto& mkldnn_engine = dev_ctx.GetEngine();
-
-    const auto* input = ctx.Input<phi::DenseTensor>("Input");
-    const auto* filter = ctx.Input<phi::DenseTensor>("Filter");
-    const auto* bias =
-        ctx.HasInput("Bias") ? ctx.Input<phi::DenseTensor>("Bias") : nullptr;
-    auto* output = ctx.Output<phi::DenseTensor>("Output");
-    ConvTransposeMKLDNNHandlerT<T, K, T_out> handler(
-        ctx, mkldnn_engine, input, filter, bias, output);
-    auto src_memory_p = handler.AcquireSrcMemoryWithReorder(input);
-    // Caching Key for weights is needed
-    std::string key = platform::CreateKey(dev_ctx,
-                                          ctx.InputName("Input"),
-                                          ctx.InputName("Filter"),
-                                          (bias ? ctx.InputName("Bias") : ""));
-    key = platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
-    auto weights_memory_p = handler.AcquireWeightsMemoryWithReorder(
-        dev_ctx, key, filter, ctx.Attr<int>("groups"));
-
-    std::shared_ptr<dnnl::memory> dst_memory_p =
-        handler.template AcquireDstMemory<T_out>(output);
-    auto conv_p = handler.AcquireForwardPrimitive();
-
-    std::unordered_map<int, dnnl::memory> args = {
-        {DNNL_ARG_SRC, *src_memory_p},
-        {DNNL_ARG_WEIGHTS, *weights_memory_p},
-        {DNNL_ARG_DST, *dst_memory_p}};
-
-    if (bias) {
-      auto bias_memory_p =
-          handler.AcquireBiasMemoryWithReorder(dev_ctx, key, bias);
-      args.insert({DNNL_ARG_BIAS, *bias_memory_p});
-    }
-    auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
-    conv_p->execute(astream, args);
-    astream.wait();
-    output->set_mem_desc(dst_memory_p->get_desc());
-  }
-};
-
-}  // namespace operators
-}  // namespace paddle
-
-namespace ops = paddle::operators;
-
-REGISTER_OP_KERNEL(
-    conv2d_transpose,
-    MKLDNN,
-    ::paddle::platform::CPUPlace,
-    ops::ConvTransposeMKLDNNOpKernel<float, float>,
-    ops::ConvTransposeMKLDNNOpKernel<paddle::platform::bfloat16, float>);
+template <typename T, typename T_out>
+void Execute(const OneDNNContext& dev_ctx,
+             const DenseTensor* x,
+             const DenseTensor* filter,
+             const std::vector<int>& strides,
+             const std::vector<int>& paddings,
+             const std::string& padding_algorithm,
+             int groups,
+             const std::vector<int>& dilations,
+             DenseTensor* out) {
+  const auto* bias =
+      dev_ctx.HasDnnInput("Bias") ? dev_ctx.GetDnnInput("Bias") : nullptr;
+
+  ConvTransposeOneDNNHandlerT<T, float, T_out> handler(dev_ctx,
+                                                       x,
+                                                       filter,
+                                                       bias,
+                                                       strides,
+                                                       paddings,
+                                                       padding_algorithm,
+                                                       groups,
+                                                       dilations,
+                                                       out);
+
+  auto src_memory_p = handler.AcquireSrcMemoryWithReorder(x);
+  // Caching Key for weights is needed
+  std::string key =
+      funcs::CreateKey(dev_ctx,
+                       dev_ctx.GetInputsName("Input")[0],
+                       dev_ctx.GetInputsName("Filter")[0],
+                       (bias ? dev_ctx.GetInputsName("Bias")[0] : ""));
+  key = funcs::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, key);
+  auto weights_memory_p =
+      handler.AcquireWeightsMemoryWithReorder(dev_ctx, key, filter, groups);
+
+  std::shared_ptr<dnnl::memory> dst_memory_p =
+      handler.template AcquireDstMemory<T_out>(out);
+  auto conv_p = handler.AcquireForwardPrimitive();
+
+  std::unordered_map<int, dnnl::memory> args = {
+      {DNNL_ARG_SRC, *src_memory_p},
+      {DNNL_ARG_WEIGHTS, *weights_memory_p},
+      {DNNL_ARG_DST, *dst_memory_p}};
+
+  if (bias) {
+    auto bias_memory_p =
+        handler.AcquireBiasMemoryWithReorder(dev_ctx, key, bias);
+    args.insert({DNNL_ARG_BIAS, *bias_memory_p});
+  }
+  auto& astream = OneDNNContext::tls().get_stream();
+  conv_p->execute(astream, args);
+  astream.wait();
+  out->set_mem_desc(dst_memory_p->get_desc());
+}
+
+template <typename T, typename Context>
+void Conv2dTransposeKernel(const Context& dev_ctx,
+                           const DenseTensor& x,
+                           const DenseTensor& filter,
+                           const std::vector<int>& strides,
+                           const std::vector<int>& paddings,
+                           const std::vector<int>& output_padding,
+                           const IntArray& output_size,
+                           const std::string& padding_algorithm,
+                           int groups,
+                           const std::vector<int>& dilations,
+                           const std::string& data_format,
+                           DenseTensor* out) {
+  PADDLE_ENFORCE_EQ(dev_ctx.GetPlace().GetType(),
+                    AllocationType::CPU,
+                    phi::errors::PreconditionNotMet(
+                        "Operator oneDNN Conv must use CPUPlace"));
+
+  const bool is_BFLOAT16 =
+      dev_ctx.HasDnnAttr("mkldnn_data_type")
+          ? PADDLE_GET_CONST(std::string,
+                             dev_ctx.GetDnnAttr("mkldnn_data_type")) ==
+                "bfloat16"
+          : false;
+  const bool force_fp32_output =
+      dev_ctx.HasDnnAttr("force_fp32_output")
+          ? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output"))
+          : false;
+  const bool use_bfloat16 = (!force_fp32_output && is_BFLOAT16);
+
+  if (use_bfloat16) {
+    Execute<T, dtype::bfloat16>(dev_ctx,
+                                &x,
+                                &filter,
+                                strides,
+                                paddings,
+                                padding_algorithm,
+                                groups,
+                                dilations,
+                                out);
+  } else {
+    Execute<T, float>(dev_ctx,
+                      &x,
+                      &filter,
+                      strides,
+                      paddings,
+                      padding_algorithm,
+                      groups,
+                      dilations,
+                      out);
+  }
+}
+
+}  // namespace phi
+
+PD_REGISTER_KERNEL(conv2d_transpose,
+                   OneDNN,
+                   ONEDNN,
+                   phi::Conv2dTransposeKernel,
+                   float,
+                   phi::dtype::bfloat16) {}