Unverified commit 269bd1fe authored by piotrekobi, committed by GitHub

[PHI] Move oneDNN helper classes to new location (#45626)

* gaussian random

* mkldnn to onednn renaming

* fix merge conflicts

* remove fluid code

* onednn renaming

* Move classes from mkldnn_reuse.h to onednn_reuse.h

* Move more functions from mkldnn_helper.h to onednn_helper.h

* Change MKLDNN to OneDNN in VLOG message
Co-authored-by: Silv3S <slawomir.siwek@intel.com>
Parent 4e4f4586
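In practical terms, kernels that used the fluid helpers switch to the phi namespace; a minimal before/after sketch of a hypothetical call site (illustration only, not part of this diff):

// before: #include "paddle/fluid/platform/mkldnn_reuse.h"
// paddle::platform::MKLDNNHandlerNoCachingT<float, dnnl::softmax_forward> handler(engine, place);
// after:  #include "paddle/phi/backends/onednn/onednn_reuse.h"
// phi::funcs::OneDNNHandlerNoCachingT<float, dnnl::softmax_forward> handler(engine, place);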
@@ -212,10 +212,10 @@ std::shared_ptr<OperatorBase> TransferLayout(const std::string& var_name,
out_layout = framework::DataLayout::kNCHW;
}
if (in_layout == framework::DataLayout::MKLDNN &&
out_layout != framework::DataLayout::MKLDNN) {
if (in_layout == framework::DataLayout::ONEDNN &&
out_layout != framework::DataLayout::ONEDNN) {
auto target_layout = phi::OneDNNContext::tls().get_cur_paddle_data_layout();
VLOG(4) << "TransDataLayoutFromMKLDNN: " << in_layout << "->"
VLOG(4) << "TransDataLayoutFromOneDNN: " << in_layout << "->"
<< target_layout;
if (out_layout == DataLayout::kNCHW &&
......
@@ -75,7 +75,7 @@ TEST(PhiUtils, TransOpKernelTypeToPhiKernelKey) {
auto kernel_key_mkldnn =
paddle::framework::TransOpKernelTypeToPhiKernelKey(op_kernel_type_mkldnn);
ASSERT_EQ(kernel_key_mkldnn.dtype(), phi::DataType::FLOAT32);
ASSERT_EQ(kernel_key_mkldnn.layout(), phi::DataLayout::MKLDNN);
ASSERT_EQ(kernel_key_mkldnn.layout(), phi::DataLayout::ONEDNN);
ASSERT_EQ(kernel_key_mkldnn.backend(), phi::Backend::ONEDNN);
#endif
......
@@ -39,537 +39,24 @@ template <typename T,
typename TForward,
typename TBackward = mkldnn_dummy_primitive,
typename TBackward_params = mkldnn_dummy_primitive>
using MKLDNNHandlerNoCachingT = phi::funcs::
MKLDNNHandlerNoCachingT<T, TForward, TBackward, TBackward_params>;
using MKLDNNHandlerT =
phi::funcs::OneDNNHandlerT<T, TForward, TBackward, TBackward_params>;
template <typename T,
typename TForward,
typename TBackward = mkldnn_dummy_primitive,
typename TBackward_params = mkldnn_dummy_primitive>
class MKLDNNHandlerT {
public:
MKLDNNHandlerT(const MKLDNNDeviceContext& dev_ctx,
dnnl::engine engine,
platform::Place cpu_place,
const std::string& base_key)
: dev_ctx_(dev_ctx),
engine_(engine),
place_(cpu_place),
key_common_(base_key),
key_(platform::ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)),
fwd_pd_(nullptr),
bwd_pd_(nullptr) {
platform::MKLDNNDeviceContext::tls().log_lib_version();
}
std::shared_ptr<TForward> AcquireForwardPrimitive() {
const std::string key_p = key_ + "@fwd_p";
auto forward_p =
std::static_pointer_cast<TForward>(dev_ctx_.GetBlob(key_p));
if (forward_p == nullptr) {
forward_p = std::make_shared<TForward>(*fwd_pd_);
dev_ctx_.SetBlob(key_p, forward_p);
}
return forward_p;
}
std::shared_ptr<TBackward> AcquireBackwardPrimitive() {
const std::string key_p = key_ + "@bwd_p";
auto backward_p =
std::static_pointer_cast<TBackward>(dev_ctx_.GetBlob(key_p));
if (backward_p == nullptr) {
backward_p = std::make_shared<TBackward>(*bwd_pd_);
dev_ctx_.SetBlob(key_p, backward_p);
}
return backward_p;
}
std::shared_ptr<TBackward_params> AcquireBackwardWeightsPrimitive() {
const std::string key_p = key_ + "@bwd_w_p";
auto backward_p =
std::static_pointer_cast<TBackward_params>(dev_ctx_.GetBlob(key_p));
if (backward_p == nullptr) {
PADDLE_ENFORCE_NOT_NULL(
bwd_w_pd_,
platform::errors::Unavailable("BWD_PD should be set when "
"getting BWD prim witk key: %s .",
key_p));
backward_p = std::make_shared<TBackward_params>(*bwd_w_pd_);
dev_ctx_.SetBlob(key_p, backward_p);
}
return backward_p;
}
std::shared_ptr<dnnl::memory> AcquireSrcMemory(
const framework::Tensor* input) {
const T* input_data = input->data<T>();
return this->AcquireMemoryFromPrimitive(
fwd_pd_->src_desc(), to_void_cast<T>(input_data), "@src_mem_p");
}
template <typename T_out = T>
std::shared_ptr<dnnl::memory> AcquireDstMemory(framework::Tensor* output) {
T_out* ptr =
output->mutable_data<T_out>(place_, fwd_pd_->dst_desc().get_size());
return this->AcquireMemoryFromPrimitive(
fwd_pd_->dst_desc(), ptr, "@dst_mem_p");
}
template <typename T_out = T>
std::shared_ptr<dnnl::memory> AcquireDstMemory(void) {
return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), "@dstt_mem_p");
}
template <typename T_out = T>
std::shared_ptr<dnnl::memory> AcquireDstMemory(
const framework::Tensor* output) {
const T_out* output_data = output->data<T_out>();
return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(),
to_void_cast<T_out>(output_data),
"@bwd-dst_mem_p");
}
std::shared_ptr<dnnl::memory> AcquireDiffDstMemory(
const framework::Tensor* diffdst) {
const T* ptr = diffdst->data<T>();
return this->AcquireMemoryFromPrimitive(
bwd_pd_->diff_dst_desc(), to_void_cast<T>(ptr), "@diff_dst_mem_p");
}
std::shared_ptr<dnnl::memory> AcquireDiffSrcMemory(
framework::Tensor* diffsrc) {
T* ptr =
diffsrc->mutable_data<T>(place_, bwd_pd_->diff_src_desc().get_size());
return this->AcquireMemoryFromPrimitive(
bwd_pd_->diff_src_desc(), ptr, "@diff_src_mem_p");
}
// Buffer of given Tensor is used for oneDNN computation
std::shared_ptr<dnnl::memory> AcquireDiffWeightsMemory(
framework::Tensor* diff_weights) {
PADDLE_ENFORCE_NOT_NULL(
bwd_w_pd_,
platform::errors::Unavailable(
"BWD_W_PD should be set when getting BWD grad of weights."));
T* ptr = diff_weights->mutable_data<T>(
place_, bwd_w_pd_->diff_weights_desc().get_size());
return this->AcquireMemoryFromPrimitive(
bwd_w_pd_->diff_weights_desc(), ptr, "@diff_wei_mem_p");
}
// Buffer is allocated by oneDNN to store computation results
std::shared_ptr<dnnl::memory> AcquireDiffWeightsMemory(void) {
PADDLE_ENFORCE_NOT_NULL(
bwd_w_pd_,
platform::errors::Unavailable(
"BWD_W_PD should be set when getting BWD grad of weights."));
return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(),
"@diff_wei_mem_p");
}
protected:
bool isCached() {
const std::string key_pd = key_ + "@fwd_pd";
fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
dev_ctx_.GetBlob(key_pd));
return (fwd_pd_ != nullptr);
}
bool isBwdCached() {
const std::string key_pd = key_ + "@bwd_pd";
bwd_pd_ = std::static_pointer_cast<typename TBackward::primitive_desc>(
dev_ctx_.GetBlob(key_pd));
if (bwd_pd_ == nullptr) {
return false;
} else {
if (std::is_same<TBackward_params, mkldnn_dummy_primitive>::value ==
false) {
const std::string key_bw_w_pd = key_ + "@bwd_w_pd";
bwd_w_pd_ =
std::static_pointer_cast<typename TBackward_params::primitive_desc>(
dev_ctx_.GetBlob(key_bw_w_pd));
}
// When BWD is cached we still need to get the FWD PD
const std::string key_fpd = key_ + "@fwd_pd";
fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
dev_ctx_.GetBlob(key_fpd));
PADDLE_ENFORCE_NOT_NULL(
fwd_pd_,
platform::errors::Unavailable(
"Error: FWD PD should be set when BWD PD is cached."));
return true;
}
}
// If your primitive descriptor requires attributes, pass them as the
// first argument, followed by the descriptor constructor's parameters.
// Otherwise, all arguments are forwarded to the descriptor constructor,
// including the first one.
template <typename Arg, typename... Args>
void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) {
// This is used when we can recreate FWD PD in BWD so
// we do not need to pass FWD to BWD
const std::string key_pd = key_ + "@fwd_pd";
fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
dev_ctx_.GetBlob(key_pd));
if (fwd_pd_ == nullptr) {
CreateForwardPrimitiveDescriptor(first_arg, std::forward<Args>(args)...);
dev_ctx_.SetBlob(key_pd, fwd_pd_);
}
}
// Using SFINAE to specialise the variadic function; a workaround for not
// having if constexpr in C++11.
template <class First, class... Args>
typename std::enable_if<std::is_same<typename std::decay<First>::type,
dnnl::primitive_attr>::value>::type
CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
auto fwd_desc = typename TForward::desc(std::forward<Args>(args)...);
fwd_pd_ = std::make_shared<typename TForward::primitive_desc>(
fwd_desc, first, engine_);
}
template <class First, class... Args>
typename std::enable_if<!std::is_same<typename std::decay<First>::type,
dnnl::primitive_attr>::value>::type
CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
auto fwd_desc = typename TForward::desc(std::forward<First>(first),
std::forward<Args>(args)...);
fwd_pd_ =
std::make_shared<typename TForward::primitive_desc>(fwd_desc, engine_);
}
template <typename... Args>
void AcquireBackwardPrimitiveDescriptor(Args&&... args) {
// fwd_pd_ is set during grad by calling
// AcquireForwardPrimitiveDescriptor
PADDLE_ENFORCE_NOT_NULL(
fwd_pd_,
platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.",
key_ + "@fwd_pd"));
const std::string key_pd = key_ + "@bwd_pd";
bwd_pd_ = std::static_pointer_cast<typename TBackward::primitive_desc>(
dev_ctx_.GetBlob(key_pd));
if (bwd_pd_ == nullptr) {
auto bwd_desc = typename TBackward::desc(std::forward<Args>(args)...);
bwd_pd_ = std::make_shared<typename TBackward::primitive_desc>(
bwd_desc, engine_, *fwd_pd_);
dev_ctx_.SetBlob(key_pd, bwd_pd_);
}
}
template <typename... Args>
void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) {
// fwd_pd_ is set during grad by calling
// AcquireForwardPrimitiveDescriptor
PADDLE_ENFORCE_NOT_NULL(
fwd_pd_,
platform::errors::Unavailable("Get MKLDNN Forward primitive %s failed.",
key_ + "@fwd_pd"));
const std::string key_pd = key_ + "@bwd_w_pd";
bwd_w_pd_ =
std::static_pointer_cast<typename TBackward_params::primitive_desc>(
dev_ctx_.GetBlob(key_pd));
if (bwd_w_pd_ == nullptr) {
auto bwd_desc =
typename TBackward_params::desc(std::forward<Args>(args)...);
bwd_w_pd_ = std::make_shared<typename TBackward_params::primitive_desc>(
bwd_desc, engine_, *fwd_pd_);
dev_ctx_.SetBlob(key_pd, bwd_w_pd_);
}
}
std::shared_ptr<dnnl::memory> AcquireMemoryFromPrimitive(
const std::string& suffix) {
return std::static_pointer_cast<dnnl::memory>(
dev_ctx_.GetBlob(key_ + suffix));
}
std::shared_ptr<dnnl::memory> AcquireMemoryFromPrimitive(
dnnl::memory::desc md, void* ptr, const std::string& suffix) {
const auto local_key = key_ + suffix;
auto mem_p =
std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(local_key));
if (mem_p == nullptr) {
mem_p = std::make_shared<dnnl::memory>(md, engine_, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
}
return mem_p;
}
std::shared_ptr<dnnl::memory> AcquireMemoryFromPrimitive(
dnnl::memory::desc md, const std::string& suffix) {
const auto local_key = key_ + suffix;
auto mem_p =
std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(local_key));
if (mem_p == nullptr) {
mem_p = std::make_shared<dnnl::memory>(md, engine_);
dev_ctx_.SetBlob(local_key, mem_p);
}
return mem_p;
}
void AcquireReorder(const std::shared_ptr<dnnl::memory>& user_memory_p,
const std::shared_ptr<dnnl::memory>& target_memory_p) {
auto reorder_p =
std::make_shared<dnnl::reorder>(*user_memory_p, *target_memory_p);
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
platform::RecordEvent record_reorder("int_reorder",
platform::TracerEventType::UserDefined,
2,
platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
astream.wait();
}
template <typename F = T>
std::shared_ptr<dnnl::memory> AcquireMemoryWithReorder(
const dnnl::memory::desc& user_md,
const dnnl::memory::desc& target_md,
void* ptr,
const std::string& suffix,
bool is_persistent = false,
std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {},
const std::vector<float>& scale_data = {1.0f},
int mask = 0) {
const auto target_key = key_ + suffix + "_target";
const auto key_reorder_p = key_ + suffix + "reorder_p";
const auto user_key = key_ + suffix + "_user";
auto target_memory_p =
std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(target_key));
if (target_memory_p == nullptr) {
if (custom_reorder_func) {
auto reordered_data =
custom_reorder_func(reinterpret_cast<const F*>(ptr));
dev_ctx_.SetBlob(key_reorder_p + "-custom_reorder", reordered_data);
ptr = reinterpret_cast<void*>(reordered_data.get());
}
auto user_memory_p =
std::make_shared<dnnl::memory>(user_md, engine_, ptr);
if (user_md != target_md) {
target_memory_p = std::make_shared<dnnl::memory>(target_md, engine_);
dnnl::reorder::primitive_desc reorder_pdesc;
if (is_int8<T>()) {
dnnl::primitive_attr attr;
attr.set_output_scales(mask, scale_data);
reorder_pdesc = dnnl::reorder::primitive_desc(
*user_memory_p, *target_memory_p, attr);
} else {
reorder_pdesc =
dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p);
}
auto reorder_p = std::make_shared<dnnl::reorder>(reorder_pdesc);
dev_ctx_.SetBlob(key_reorder_p, reorder_p);
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
platform::RecordEvent record_reorder(
"int_reorder",
platform::TracerEventType::UserDefined,
2,
platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
astream.wait();
} else {
target_memory_p = user_memory_p;
}
dev_ctx_.SetBlob(user_key, user_memory_p);
dev_ctx_.SetBlob(target_key, target_memory_p);
} else if (!is_persistent) {
auto& astream = platform::MKLDNNDeviceContext::tls().get_stream();
auto user_memory_p =
std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(user_key));
user_memory_p->set_data_handle(ptr);
// TODO(jczaja): If a reorder is cached here, it means it is needed;
// this should be changed to get rid of keys
auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
dev_ctx_.GetBlob(key_reorder_p));
if (reorder_p != nullptr) {
platform::RecordEvent record_reorder(
"int_reorder",
platform::TracerEventType::UserDefined,
2,
platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
astream.wait();
}
}
return target_memory_p;
}
std::shared_ptr<dnnl::memory> AcquireMemory(const std::string& suffix) {
const auto local_key = key_ + suffix;
return std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(local_key));
}
const MKLDNNDeviceContext& dev_ctx_;
dnnl::engine engine_;
platform::Place place_;
std::string key_common_;
std::string key_;
std::shared_ptr<typename TForward::primitive_desc> fwd_pd_;
std::shared_ptr<typename TBackward::primitive_desc> bwd_pd_;
std::shared_ptr<typename TBackward_params::primitive_desc> bwd_w_pd_;
};
using MKLDNNHandlerNoCachingT = phi::funcs::
OneDNNHandlerNoCachingT<T, TForward, TBackward, TBackward_params>;
template <typename T>
class BinaryMKLDNNHandler
: public platform::MKLDNNHandlerNoCachingT<T, dnnl::binary> {
public:
BinaryMKLDNNHandler(const dnnl::algorithm algo,
const int axis,
const dnnl::engine engine,
platform::Place cpu_place,
const Tensor* x,
const Tensor* y,
Tensor* out,
float scale_x,
float scale_y,
float scale_out,
const dnnl::post_ops& post_ops = dnnl::post_ops{})
: platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
const auto src_x_tz = phi::vectorize(x->dims());
const auto src_y_tz = phi::vectorize(y->dims());
// If the output tensor (z) is nullptr, we are computing into an
// oneDNN-managed buffer
auto rankdiff = x->dims().size() - y->dims().size();
const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz)
: phi::vectorize(out->dims());
auto src0_md = x->mem_desc();
auto src1_md = y->mem_desc();
if (rankdiff > 0) { // Second input is of smaller rank than first
std::vector<int64_t> dims1_ex(rankdiff, 1);
dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)),
src_y_tz.begin(),
src_y_tz.end());
// For NHWC broadcasting we need to rotate the extended shape
if (MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() ==
framework::DataLayout::kNHWC) {
std::rotate(dims1_ex.begin() + 1, dims1_ex.end() - 1, dims1_ex.end());
}
src1_md = src1_md.reshape(dims1_ex);
} else if (rankdiff < 0) { // First input is of smaller rank than second
std::vector<int64_t> dims0_ex(-rankdiff, 1);
dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)),
src_x_tz.begin(),
src_x_tz.end());
// For NHWC broadcasting we need to rotate the extended shape
if (MKLDNNDeviceContext::tls().get_cur_paddle_data_layout() ==
framework::DataLayout::kNHWC) {
std::rotate(dims0_ex.begin() + 1, dims0_ex.end() - 1, dims0_ex.end());
}
src0_md = src0_md.reshape(dims0_ex);
}
const auto dst_md = memory::desc(
dst_tz, platform::MKLDNNGetDataType<T>(), MKLDNNMemoryFormat::any);
auto attributes =
CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops);
if (x->numel() < y->numel()) {
this->AcquireForwardPrimitiveDescriptor(
attributes, algo, src1_md, src0_md, dst_md);
} else {
this->AcquireForwardPrimitiveDescriptor(
attributes, algo, src0_md, src1_md, dst_md);
}
}
std::shared_ptr<dnnl::memory> AcquireSecondSrcMemory(
const framework::Tensor* input) {
const T* input_data = input->data<T>();
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(),
to_void_cast<T>(input_data));
}
private:
static inline dnnl::primitive_attr CreateAttributes(
dnnl::algorithm op,
float scale_x,
float scale_y,
float scale_out,
dnnl::post_ops post_ops = dnnl::post_ops{}) {
// Scales set in attributes for inputs contribute to the output equation
// in the following way (assuming no broadcasting takes place):
// output_i = scale_0 * x_i <+ or *> scale_1 * y_i;
// Hence we have to create scales that will:
// 1. Dequantize both values, by multiplying with (1.0 / scale_x_or_y)
// 2. Quantize their result to output scale range, by multiplying with
// (scale_z)
// If we combine these two, we end up with the following equation
// output = scale_out * (1/scale_x * x <* or +> 1/scale_y * y)
// Hence, to mimic such behaviour using the provided interface,
// For add operation the equation is equal to:
// output = (scale_out / scale_x) * x + (scale_out / scale_y) * y
// <scale_0> <scale_1>
// For mul operation on the other hand
// output = (scale_out / scale_x) * x * (1.0 / scale_y) * y
// <scale_0> <scale_1>
float scale_0 = scale_out / scale_x;
float scale_1 =
op == dnnl::algorithm::binary_add ? scale_out / scale_y : 1.0 / scale_y;
dnnl::primitive_attr attributes;
attributes.set_scales(
/* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0, {scale_0});
attributes.set_scales(
/* input_y_id = */ DNNL_ARG_SRC_1, /* mask = */ 0, {scale_1});
if (post_ops.len() > 0) attributes.set_post_ops(post_ops);
return attributes;
}
};
using ReductionMKLDNNHandler = phi::funcs::ReductionOneDNNHandler<T>;
template <typename T>
class BroadcastDataMKLDNNHandler
: public platform::MKLDNNHandlerNoCachingT<T, dnnl::binary> {
public:
BroadcastDataMKLDNNHandler(const dnnl::algorithm algo,
const dnnl::engine engine,
platform::Place cpu_place,
const Tensor* x,
Tensor* out,
float scale_x,
float scale_y,
const std::vector<int64_t>& extended_x_dims)
: platform::MKLDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
const auto src0_tz = phi::vectorize(out->dims());
const auto src0_md =
dnnl::memory::desc(src0_tz,
platform::MKLDNNGetDataType<T>(),
platform::GetPlainMKLDNNFormat(src0_tz.size()));
const auto src1_md = x->mem_desc().reshape(extended_x_dims);
dnnl::primitive_attr attributes;
attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x});
attributes.set_scales(DNNL_ARG_SRC_1, 0, {scale_y});
this->AcquireForwardPrimitiveDescriptor(
attributes, algo, src0_md, src1_md, src0_md);
}
using BroadcastDataMKLDNNHandler = phi::funcs::BroadcastDataOneDNNHandler<T>;
template <typename T_out = T>
std::shared_ptr<dnnl::memory> AcquireZeroedDstMemory(framework::Tensor* out) {
T_out* ptr = out->mutable_data<T_out>(this->place_,
this->fwd_pd_->dst_desc().get_size());
memset(ptr, 0, this->fwd_pd_->dst_desc().get_size());
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr);
}
};
template <typename T>
using BinaryMKLDNNHandler = phi::funcs::BinaryOneDNNHandler<T>;
static void AppendActivation(const framework::ExecutionContext& ctx,
dnnl::post_ops& post_ops, // NOLINT
@@ -624,34 +111,6 @@ static void AppendActivation(const framework::ExecutionContext& ctx,
}
}
template <typename T>
class ReductionMKLDNNHandler
: public platform::MKLDNNHandlerNoCachingT<T, dnnl::reduction> {
public:
ReductionMKLDNNHandler(const dnnl::algorithm algo,
const float p,
const float eps,
const dnnl::engine engine,
platform::Place cpu_place,
const Tensor* x,
const Tensor* out,
std::vector<int64_t> out_tz,
const dnnl::primitive_attr& attrs = NULL)
: platform::MKLDNNHandlerNoCachingT<T, dnnl::reduction>(engine,
cpu_place) {
const auto out_md = memory::desc(out_tz,
platform::MKLDNNGetDataType<T>(),
dnnl::memory::format_tag::any);
if (attrs)
this->AcquireForwardPrimitiveDescriptor(
attrs, algo, x->mem_desc(), out_md, p, eps);
else
this->AcquireForwardPrimitiveDescriptor(
algo, x->mem_desc(), out_md, p, eps);
}
};
template <typename T>
constexpr bool IsInt8() {
return std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
@@ -1071,37 +530,5 @@ class ReorderMKLDNNHandler {
dnnl::memory::data_type dtype_, dtype_dst_;
dnnl::engine engine_;
};
template <typename T>
static void SetDstMemoryQuantized(
const framework::ExecutionContext& ctx,
framework::Tensor* output,
std::vector<int64_t> dst_tz,
const dnnl::engine& engine,
std::shared_ptr<dnnl::memory::desc>& dst_md, // NOLINT
std::shared_ptr<dnnl::memory>& dst_memory, // NOLINT
MKLDNNMemoryFormat output_format) {
T* output_data = output->mutable_data<T>(ctx.GetPlace());
const size_t dst_dims = dst_tz.size();
MKLDNNMemoryFormat dst_fmt;
PADDLE_ENFORCE_LE(dst_dims,
5,
platform::errors::InvalidArgument(
"Dst memory for quantization can not have "
"dims > 5. But received dst_dims is %d.",
dst_dims));
dst_fmt = platform::MKLDNNFormatForSize(dst_dims, output_format);
auto tmp_dst_md =
platform::MKLDNNMemDesc({dst_tz},
paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<T>::DataType()),
dst_fmt);
dst_md.reset(new dnnl::memory::desc(tmp_dst_md));
dst_memory.reset(
new dnnl::memory(*dst_md, engine, to_void_cast<T>(output_data)));
}
} // namespace platform
} // namespace paddle
@@ -56,7 +56,7 @@ BackendSet GetTensorBackendSet(const phi::TensorBase& t) {
if (HasAllocation(t) && t.place().GetType() != AllocationType::UNDEFINED) {
BackendSet backend_set(phi::TransToPhiBackend(t.place()));
switch (t.layout()) {
case DataLayout::MKLDNN:
case DataLayout::ONEDNN:
backend_set = backend_set | BackendSet(Backend::ONEDNN);
break;
default:
......
@@ -14,6 +14,7 @@
#pragma once
#include <thread>
#include "dnnl.hpp" // NOLINT
#include "glog/logging.h"
@@ -94,6 +95,106 @@ inline dnnl::memory::format_tag GetPlainOneDNNFormat(int tensor_rank) {
}
}
template <typename Type>
dnnl::memory::data_type oneDNNGetDataType() {
return dnnl::memory::data_type::undef;
}
template <>
inline dnnl::memory::data_type oneDNNGetDataType<float>() {
return dnnl::memory::data_type::f32;
}
template <>
inline dnnl::memory::data_type oneDNNGetDataType<int32_t>() {
return dnnl::memory::data_type::s32;
}
template <>
inline dnnl::memory::data_type oneDNNGetDataType<int8_t>() {
return dnnl::memory::data_type::s8;
}
template <>
inline dnnl::memory::data_type oneDNNGetDataType<uint8_t>() {
return dnnl::memory::data_type::u8;
}
template <>
inline dnnl::memory::data_type oneDNNGetDataType<dtype::bfloat16>() {
return dnnl::memory::data_type::bf16;
}
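// Quick reference (follows directly from the specializations above):
// oneDNNGetDataType<float>() == dnnl::memory::data_type::f32, while any type
// without a specialization falls back to dnnl::memory::data_type::undef.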
inline std::vector<std::vector<int64_t>> ToOneDNNPadding(
const std::vector<int64_t>& paddings) {
if (paddings.size() == 6) {
int padding_front = paddings[0];
int padding_back = paddings[1];
int padding_top = paddings[2];
int padding_bottom = paddings[3];
int padding_left = paddings[4];
int padding_right = paddings[5];
return {{padding_front, padding_top, padding_left},
{padding_back, padding_bottom, padding_right}};
} else {
int padding_top = paddings[0];
int padding_bottom = paddings[1];
int padding_left = paddings[2];
int padding_right = paddings[3];
return {{padding_top, padding_left}, {padding_bottom, padding_right}};
}
}
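A worked example of the padding split above (hypothetical values):

// 2-D case, paddings = {top, bottom, left, right}:
// ToOneDNNPadding({1, 2, 3, 4}) -> {{1, 3}, {2, 4}}
// 3-D case, paddings = {front, back, top, bottom, left, right}:
// ToOneDNNPadding({1, 2, 3, 4, 5, 6}) -> {{1, 3, 5}, {2, 4, 6}}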
template <typename T>
inline void AppendKey(std::string* key, const T& num) {
key->append(std::to_string(num));
}
template <>
inline void AppendKey(std::string* key,
const dnnl::memory::format_tag& format) {
key->append(std::to_string(static_cast<int>(format)));
}
template <>
inline void AppendKey(std::string* key,
const dnnl::memory::data_type& data_type) {
key->append(std::to_string(static_cast<int>(data_type)));
}
template <>
inline void AppendKey(std::string* key, const dnnl::algorithm& algorithm) {
key->append(std::to_string(static_cast<int>(algorithm)));
}
template <>
inline void AppendKey(std::string* key,
const dnnl::normalization_flags& flags) {
key->append(std::to_string(static_cast<int>(flags)));
}
inline void AppendKey(std::string* key, const std::string& str) {
key->append(str);
}
inline void AppendKey(std::string* key, const char* str) { key->append(str); }
template <typename T>
inline void AppendKey(std::string* key, const std::vector<T>& dims) {
for (size_t i = 0; i < dims.size(); i++) {
AppendKey(key, std::to_string(dims[i]));
}
}
template <typename... ArgTypes>
inline std::string CreateKey(const OneDNNContext& dev_ctx, ArgTypes&&... args) {
std::string key;
key.reserve(64);
using expand_type = int[];
expand_type{0, (AppendKey(&key, std::forward<ArgTypes>(args)), 0)...};
key += OneDNNContext::tls().get_key_suffix();
return key;
}
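The variadic pack expansion above simply chains AppendKey calls; a hedged illustration with assumed arguments:

// std::string key = CreateKey(dev_ctx, "softmax", std::vector<int64_t>{2, 3, 4},
//                             dnnl::memory::data_type::f32);
// -> "softmax234" + std::to_string(static_cast<int>(f32))
//    + OneDNNContext::tls().get_key_suffix()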
inline void MatchShapeToLayout(DenseTensor* tensor_in,
DataLayout from,
DataLayout to) {
@@ -117,28 +218,28 @@ inline void MatchShapeToLayout(DenseTensor* tensor_in,
// at last nhwC, so for dim==2 these layouts are the same and nothing should
// be done. Similarly for dim==1 when you have just one possible combination.
if (tensor_in->dims().size() < 3) {
VLOG(3) << "Keeping MKLDNN/NHWC/NDHWC output_shape"
VLOG(3) << "Keeping ONEDNN/NHWC/NDHWC output_shape"
<< print_dims(phi::vectorize<int>(tensor_in->dims()));
return;
}
switch (from) {
case DataLayout::MKLDNN:
case DataLayout::ONEDNN:
if ((to == DataLayout::NHWC) || (to == DataLayout::NDHWC)) {
auto dims = phi::vectorize<int>(tensor_in->dims());
std::rotate(dims.begin() + 1, dims.begin() + 2, dims.end());
tensor_in->Resize(phi::make_ddim(dims));
VLOG(3) << "Rotating Shape from: MKLDNN to: NHWC/NDHWC output_shape"
VLOG(3) << "Rotating Shape from: ONEDNN to: NHWC/NDHWC output_shape"
<< print_dims(dims);
}
break;
case DataLayout::NHWC:
case DataLayout::NDHWC:
if (to == DataLayout::MKLDNN) {
if (to == DataLayout::ONEDNN) {
auto dims = phi::vectorize<int>(tensor_in->dims());
std::rotate(dims.begin() + 1, dims.end() - 1, dims.end());
tensor_in->Resize(phi::make_ddim(dims));
VLOG(3) << "Rotating Shape from: NHWC/NDHWC to: MKLDNN output_shape"
VLOG(3) << "Rotating Shape from: NHWC/NDHWC to: ONEDNN output_shape"
<< print_dims(dims);
}
break;
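// Worked example of the two rotations above (hypothetical dims):
// ONEDNN -> NHWC/NDHWC moves the channel axis to the back, so {2, 3, 4, 5}
// becomes {2, 4, 5, 3}; NHWC/NDHWC -> ONEDNN moves it to the front again,
// so {2, 4, 5, 3} is restored to {2, 3, 4, 5}.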
@@ -158,5 +259,22 @@ inline dnnl::memory::desc OneDNNMemDesc(const std::vector<int64_t>& dims,
return dnnl::memory::desc({dims}, data_type, format);
}
inline std::string ThreadIDasStr(void) {
return std::to_string(
std::hash<std::thread::id>()(std::this_thread::get_id()));
}
inline std::string ExtendKeyWithThreadInfoIfNeeded(const OneDNNContext& dev_ctx,
const std::string& key) {
return (OneDNNContext::tls().is_tid_used_in_key() == true)
? key + "-t:" + ThreadIDasStr()
: key;
}
template <typename T>
bool constexpr is_int8() {
return std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
}
} // namespace funcs
} // namespace phi
@@ -33,19 +33,402 @@ namespace funcs {
using user_function = std::function<std::shared_ptr<float>(const float*)>;
using memory = dnnl::memory;
using Place = phi::Place;
using MKLDNNMemoryFormat = dnnl::memory::format_tag;
using OneDNNMemoryFormat = dnnl::memory::format_tag;
template <typename T,
typename TForward,
typename TBackward = onednn_dummy_primitive,
typename TBackward_params = onednn_dummy_primitive>
class MKLDNNHandlerNoCachingT {
class OneDNNHandlerT {
public:
MKLDNNHandlerNoCachingT(dnnl::engine engine, Place cpu_place)
OneDNNHandlerT(const OneDNNContext& dev_ctx,
dnnl::engine engine,
Place cpu_place,
const std::string& base_key)
: dev_ctx_(dev_ctx),
engine_(engine),
place_(cpu_place),
key_common_(base_key),
key_(ExtendKeyWithThreadInfoIfNeeded(dev_ctx, base_key)),
fwd_pd_(nullptr),
bwd_pd_(nullptr) {
OneDNNContext::tls().log_lib_version();
}
std::shared_ptr<TForward> AcquireForwardPrimitive() {
const std::string key_p = key_ + "@fwd_p";
auto forward_p =
std::static_pointer_cast<TForward>(dev_ctx_.GetBlob(key_p));
if (forward_p == nullptr) {
forward_p = std::make_shared<TForward>(*fwd_pd_);
dev_ctx_.SetBlob(key_p, forward_p);
}
return forward_p;
}
std::shared_ptr<TBackward> AcquireBackwardPrimitive() {
const std::string key_p = key_ + "@bwd_p";
auto backward_p =
std::static_pointer_cast<TBackward>(dev_ctx_.GetBlob(key_p));
if (backward_p == nullptr) {
backward_p = std::make_shared<TBackward>(*bwd_pd_);
dev_ctx_.SetBlob(key_p, backward_p);
}
return backward_p;
}
std::shared_ptr<TBackward_params> AcquireBackwardWeightsPrimitive() {
const std::string key_p = key_ + "@bwd_w_p";
auto backward_p =
std::static_pointer_cast<TBackward_params>(dev_ctx_.GetBlob(key_p));
if (backward_p == nullptr) {
PADDLE_ENFORCE_NOT_NULL(
bwd_w_pd_,
errors::Unavailable("BWD_PD should be set when "
"getting BWD prim witk key: %s .",
key_p));
backward_p = std::make_shared<TBackward_params>(*bwd_w_pd_);
dev_ctx_.SetBlob(key_p, backward_p);
}
return backward_p;
}
std::shared_ptr<dnnl::memory> AcquireSrcMemory(const DenseTensor* input) {
const T* input_data = input->data<T>();
return this->AcquireMemoryFromPrimitive(
fwd_pd_->src_desc(), to_void_cast<T>(input_data), "@src_mem_p");
}
template <typename T_out = T>
std::shared_ptr<dnnl::memory> AcquireDstMemory(DenseTensor* output) {
T_out* ptr =
output->mutable_data<T_out>(place_, fwd_pd_->dst_desc().get_size());
return this->AcquireMemoryFromPrimitive(
fwd_pd_->dst_desc(), ptr, "@dst_mem_p");
}
template <typename T_out = T>
std::shared_ptr<dnnl::memory> AcquireDstMemory(void) {
return this->AcquireMemoryFromPrimitive(fwd_pd_->dst_desc(), "@dstt_mem_p");
}
template <typename T_out = T>
std::shared_ptr<dnnl::memory> AcquireDstMemory(const DenseTensor* output) {
const T_out* output_data = output->data<T_out>();
return this->AcquireMemoryFromPrimitive(bwd_pd_->dst_desc(),
to_void_cast<T_out>(output_data),
"@bwd-dst_mem_p");
}
std::shared_ptr<dnnl::memory> AcquireDiffDstMemory(
const DenseTensor* diffdst) {
const T* ptr = diffdst->data<T>();
return this->AcquireMemoryFromPrimitive(
bwd_pd_->diff_dst_desc(), to_void_cast<T>(ptr), "@diff_dst_mem_p");
}
std::shared_ptr<dnnl::memory> AcquireDiffSrcMemory(DenseTensor* diffsrc) {
T* ptr =
diffsrc->mutable_data<T>(place_, bwd_pd_->diff_src_desc().get_size());
return this->AcquireMemoryFromPrimitive(
bwd_pd_->diff_src_desc(), ptr, "@diff_src_mem_p");
}
// Buffer of given DenseTensor is used for oneDNN computation
std::shared_ptr<dnnl::memory> AcquireDiffWeightsMemory(
DenseTensor* diff_weights) {
PADDLE_ENFORCE_NOT_NULL(
bwd_w_pd_,
errors::Unavailable(
"BWD_W_PD should be set when getting BWD grad of weights."));
T* ptr = diff_weights->mutable_data<T>(
place_, bwd_w_pd_->diff_weights_desc().get_size());
return this->AcquireMemoryFromPrimitive(
bwd_w_pd_->diff_weights_desc(), ptr, "@diff_wei_mem_p");
}
// Buffer is allocated by oneDNN to store computation results
std::shared_ptr<dnnl::memory> AcquireDiffWeightsMemory(void) {
PADDLE_ENFORCE_NOT_NULL(
bwd_w_pd_,
errors::Unavailable(
"BWD_W_PD should be set when getting BWD grad of weights."));
return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc(),
"@diff_wei_mem_p");
}
protected:
bool isCached() {
const std::string key_pd = key_ + "@fwd_pd";
fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
dev_ctx_.GetBlob(key_pd));
return (fwd_pd_ != nullptr);
}
bool isBwdCached() {
const std::string key_pd = key_ + "@bwd_pd";
bwd_pd_ = std::static_pointer_cast<typename TBackward::primitive_desc>(
dev_ctx_.GetBlob(key_pd));
if (bwd_pd_ == nullptr) {
return false;
} else {
if (std::is_same<TBackward_params, onednn_dummy_primitive>::value ==
false) {
const std::string key_bw_w_pd = key_ + "@bwd_w_pd";
bwd_w_pd_ =
std::static_pointer_cast<typename TBackward_params::primitive_desc>(
dev_ctx_.GetBlob(key_bw_w_pd));
}
// When BWD is cached we still need to get the FWD PD
const std::string key_fpd = key_ + "@fwd_pd";
fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
dev_ctx_.GetBlob(key_fpd));
PADDLE_ENFORCE_NOT_NULL(
fwd_pd_,
errors::Unavailable(
"Error: FWD PD should be set when BWD PD is cached."));
return true;
}
}
// If your primitive descriptor requires attributes, pass them as the
// first argument, followed by the descriptor constructor's parameters.
// Otherwise, all arguments are forwarded to the descriptor constructor,
// including the first one.
template <typename Arg, typename... Args>
void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) {
// This is used when we can recreate FWD PD in BWD so
// we do not need to pass FWD to BWD
const std::string key_pd = key_ + "@fwd_pd";
fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
dev_ctx_.GetBlob(key_pd));
if (fwd_pd_ == nullptr) {
CreateForwardPrimitiveDescriptor(first_arg, std::forward<Args>(args)...);
dev_ctx_.SetBlob(key_pd, fwd_pd_);
}
}
// Using SFINAE to specialise the variadic function; a workaround for not
// having if constexpr in C++11.
template <class First, class... Args>
typename std::enable_if<std::is_same<typename std::decay<First>::type,
dnnl::primitive_attr>::value>::type
CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
auto fwd_desc = typename TForward::desc(std::forward<Args>(args)...);
fwd_pd_ = std::make_shared<typename TForward::primitive_desc>(
fwd_desc, first, engine_);
}
template <class First, class... Args>
typename std::enable_if<!std::is_same<typename std::decay<First>::type,
dnnl::primitive_attr>::value>::type
CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
auto fwd_desc = typename TForward::desc(std::forward<First>(first),
std::forward<Args>(args)...);
fwd_pd_ =
std::make_shared<typename TForward::primitive_desc>(fwd_desc, engine_);
}
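// A minimal standalone sketch of this SFINAE dispatch (C++11, no Paddle or
// oneDNN dependencies; Attr is a hypothetical stand-in for dnnl::primitive_attr):
//   struct Attr {};
//   template <class First, class... Args>
//   typename std::enable_if<std::is_same<typename std::decay<First>::type,
//                                        Attr>::value>::type
//   Create(First&&, Args&&...);  // selected when the first argument is an Attr
//   template <class First, class... Args>
//   typename std::enable_if<!std::is_same<typename std::decay<First>::type,
//                                         Attr>::value>::type
//   Create(First&&, Args&&...);  // selected otherwise
// Exactly one overload is enabled for any First, so overload resolution never
// sees an ambiguity.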
template <typename... Args>
void AcquireBackwardPrimitiveDescriptor(Args&&... args) {
// fwd_pd_ is set during grad by calling
// AcquireForwardPrimitiveDescriptor
PADDLE_ENFORCE_NOT_NULL(
fwd_pd_,
errors::Unavailable("Get OneDNN Forward primitive %s failed.",
key_ + "@fwd_pd"));
const std::string key_pd = key_ + "@bwd_pd";
bwd_pd_ = std::static_pointer_cast<typename TBackward::primitive_desc>(
dev_ctx_.GetBlob(key_pd));
if (bwd_pd_ == nullptr) {
auto bwd_desc = typename TBackward::desc(std::forward<Args>(args)...);
bwd_pd_ = std::make_shared<typename TBackward::primitive_desc>(
bwd_desc, engine_, *fwd_pd_);
dev_ctx_.SetBlob(key_pd, bwd_pd_);
}
}
template <typename... Args>
void AcquireBackwardWeightsPrimitiveDescriptor(Args&&... args) {
// fwd_pd_ is set during grad by calling
// AcquireForwardPrimitiveDescriptor
PADDLE_ENFORCE_NOT_NULL(
fwd_pd_,
errors::Unavailable("Get OneDNN Forward primitive %s failed.",
key_ + "@fwd_pd"));
const std::string key_pd = key_ + "@bwd_w_pd";
bwd_w_pd_ =
std::static_pointer_cast<typename TBackward_params::primitive_desc>(
dev_ctx_.GetBlob(key_pd));
if (bwd_w_pd_ == nullptr) {
auto bwd_desc =
typename TBackward_params::desc(std::forward<Args>(args)...);
bwd_w_pd_ = std::make_shared<typename TBackward_params::primitive_desc>(
bwd_desc, engine_, *fwd_pd_);
dev_ctx_.SetBlob(key_pd, bwd_w_pd_);
}
}
std::shared_ptr<dnnl::memory> AcquireMemoryFromPrimitive(
const std::string& suffix) {
return std::static_pointer_cast<dnnl::memory>(
dev_ctx_.GetBlob(key_ + suffix));
}
std::shared_ptr<dnnl::memory> AcquireMemoryFromPrimitive(
dnnl::memory::desc md, void* ptr, const std::string& suffix) {
const auto local_key = key_ + suffix;
auto mem_p =
std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(local_key));
if (mem_p == nullptr) {
mem_p = std::make_shared<dnnl::memory>(md, engine_, ptr);
dev_ctx_.SetBlob(local_key, mem_p);
} else {
mem_p->set_data_handle(ptr);
}
return mem_p;
}
std::shared_ptr<dnnl::memory> AcquireMemoryFromPrimitive(
dnnl::memory::desc md, const std::string& suffix) {
const auto local_key = key_ + suffix;
auto mem_p =
std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(local_key));
if (mem_p == nullptr) {
mem_p = std::make_shared<dnnl::memory>(md, engine_);
dev_ctx_.SetBlob(local_key, mem_p);
}
return mem_p;
}
void AcquireReorder(const std::shared_ptr<dnnl::memory>& user_memory_p,
const std::shared_ptr<dnnl::memory>& target_memory_p) {
auto reorder_p =
std::make_shared<dnnl::reorder>(*user_memory_p, *target_memory_p);
auto& astream = OneDNNContext::tls().get_stream();
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
2,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
astream.wait();
}
template <typename F = T>
std::shared_ptr<dnnl::memory> AcquireMemoryWithReorder(
const dnnl::memory::desc& user_md,
const dnnl::memory::desc& target_md,
void* ptr,
const std::string& suffix,
bool is_persistent = false,
std::function<std::shared_ptr<F>(const F*)> custom_reorder_func = {},
const std::vector<float>& scale_data = {1.0f},
int mask = 0) {
const auto target_key = key_ + suffix + "_target";
const auto key_reorder_p = key_ + suffix + "reorder_p";
const auto user_key = key_ + suffix + "_user";
auto target_memory_p =
std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(target_key));
if (target_memory_p == nullptr) {
if (custom_reorder_func) {
auto reordered_data =
custom_reorder_func(reinterpret_cast<const F*>(ptr));
dev_ctx_.SetBlob(key_reorder_p + "-custom_reorder", reordered_data);
ptr = reinterpret_cast<void*>(reordered_data.get());
}
auto user_memory_p =
std::make_shared<dnnl::memory>(user_md, engine_, ptr);
if (user_md != target_md) {
target_memory_p = std::make_shared<dnnl::memory>(target_md, engine_);
dnnl::reorder::primitive_desc reorder_pdesc;
if (is_int8<T>()) {
dnnl::primitive_attr attr;
attr.set_output_scales(mask, scale_data);
reorder_pdesc = dnnl::reorder::primitive_desc(
*user_memory_p, *target_memory_p, attr);
} else {
reorder_pdesc =
dnnl::reorder::primitive_desc(*user_memory_p, *target_memory_p);
}
auto reorder_p = std::make_shared<dnnl::reorder>(reorder_pdesc);
dev_ctx_.SetBlob(key_reorder_p, reorder_p);
auto& astream = OneDNNContext::tls().get_stream();
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
2,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
astream.wait();
} else {
target_memory_p = user_memory_p;
}
dev_ctx_.SetBlob(user_key, user_memory_p);
dev_ctx_.SetBlob(target_key, target_memory_p);
} else if (!is_persistent) {
auto& astream = OneDNNContext::tls().get_stream();
auto user_memory_p =
std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(user_key));
user_memory_p->set_data_handle(ptr);
// TODO(jczaja): If a reorder is cached here, it means it is needed;
// this should be changed to get rid of keys
auto reorder_p = std::static_pointer_cast<dnnl::reorder>(
dev_ctx_.GetBlob(key_reorder_p));
if (reorder_p != nullptr) {
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
2,
paddle::platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
astream.wait();
}
}
return target_memory_p;
}
std::shared_ptr<dnnl::memory> AcquireMemory(const std::string& suffix) {
const auto local_key = key_ + suffix;
return std::static_pointer_cast<dnnl::memory>(dev_ctx_.GetBlob(local_key));
}
const OneDNNContext& dev_ctx_;
dnnl::engine engine_;
Place place_;
std::string key_common_;
std::string key_;
std::shared_ptr<typename TForward::primitive_desc> fwd_pd_;
std::shared_ptr<typename TBackward::primitive_desc> bwd_pd_;
std::shared_ptr<typename TBackward_params::primitive_desc> bwd_w_pd_;
};
template <typename T,
typename TForward,
typename TBackward = onednn_dummy_primitive,
typename TBackward_params = onednn_dummy_primitive>
class OneDNNHandlerNoCachingT {
public:
OneDNNHandlerNoCachingT(dnnl::engine engine, Place cpu_place)
: engine_(engine), place_(cpu_place), fwd_pd_(nullptr), bwd_pd_(nullptr) {
phi::OneDNNContext::tls().log_lib_version();
OneDNNContext::tls().log_lib_version();
}
std::shared_ptr<TForward> AcquireForwardPrimitive() {
@@ -57,9 +440,8 @@ class MKLDNNHandlerNoCachingT {
}
std::shared_ptr<TBackward_params> AcquireBackwardWeightsPrimitive() {
PADDLE_ENFORCE_NOT_NULL(
bwd_w_pd_,
phi::errors::Unavailable("BWD_PD should be set when "
PADDLE_ENFORCE_NOT_NULL(bwd_w_pd_,
errors::Unavailable("BWD_PD should be set when "
"getting BWD prim ."));
return std::make_shared<TBackward_params>(*bwd_w_pd_);
}
@@ -102,12 +484,12 @@ class MKLDNNHandlerNoCachingT {
return this->AcquireMemoryFromPrimitive(bwd_pd_->diff_src_desc(), ptr);
}
// Buffer of given Tensor is used for oneDNN computation
// Buffer of given DenseTensor is used for oneDNN computation
std::shared_ptr<dnnl::memory> AcquireDiffWeightsMemory(
DenseTensor* diff_weights) {
PADDLE_ENFORCE_NOT_NULL(
bwd_w_pd_,
phi::errors::Unavailable(
errors::Unavailable(
"BWD_W_PD should be set when getting BWD grad of weights."));
T* ptr = diff_weights->mutable_data<T>(
place_, bwd_w_pd_->diff_weights_desc().get_size());
@@ -119,7 +501,7 @@ class MKLDNNHandlerNoCachingT {
std::shared_ptr<dnnl::memory> AcquireDiffWeightsMemory(void) {
PADDLE_ENFORCE_NOT_NULL(
bwd_w_pd_,
phi::errors::Unavailable(
errors::Unavailable(
"BWD_W_PD should be set when getting BWD grad of weights."));
return this->AcquireMemoryFromPrimitive(bwd_w_pd_->diff_weights_desc());
}
@@ -161,7 +543,7 @@ class MKLDNNHandlerNoCachingT {
// AcquireForwardPrimitiveDescriptor
PADDLE_ENFORCE_NOT_NULL(
fwd_pd_,
phi::errors::Unavailable("Get MKLDNN Forward primitive %s failed."));
errors::Unavailable("Get oneDNN Forward primitive %s failed."));
auto bwd_desc = typename TBackward::desc(std::forward<Args>(args)...);
bwd_pd_ = std::make_shared<typename TBackward::primitive_desc>(
bwd_desc, engine_, *fwd_pd_);
@@ -173,7 +555,7 @@ class MKLDNNHandlerNoCachingT {
// AcquireForwardPrimitiveDescriptor
PADDLE_ENFORCE_NOT_NULL(
fwd_pd_,
phi::errors::Unavailable("Get MKLDNN Forward primitive %s failed."));
errors::Unavailable("Get oneDNN Forward primitive %s failed."));
auto bwd_desc =
typename TBackward_params::desc(std::forward<Args>(args)...);
bwd_w_pd_ = std::make_shared<typename TBackward_params::primitive_desc>(
@@ -195,7 +577,7 @@ class MKLDNNHandlerNoCachingT {
auto reorder_p =
std::make_shared<dnnl::reorder>(*user_memory_p, *target_memory_p);
auto& astream = phi::OneDNNContext::tls().get_stream();
auto& astream = OneDNNContext::tls().get_stream();
paddle::platform::RecordEvent record_reorder(
"int_reorder",
@@ -227,7 +609,7 @@ class MKLDNNHandlerNoCachingT {
auto reorder_p =
std::make_shared<dnnl::reorder>(*user_memory_p, *target_memory_p);
auto& astream = phi::OneDNNContext::tls().get_stream();
auto& astream = OneDNNContext::tls().get_stream();
paddle::platform::RecordEvent record_reorder(
"int_reorder",
paddle::platform::TracerEventType::UserDefined,
@@ -252,7 +634,7 @@ class MKLDNNHandlerNoCachingT {
template <typename T>
class ActivationOneDNNHandler
: public MKLDNNHandlerNoCachingT<T,
: public OneDNNHandlerNoCachingT<T,
dnnl::eltwise_forward,
dnnl::eltwise_backward> {
public:
@@ -262,7 +644,7 @@ class ActivationOneDNNHandler
const dnnl::engine engine,
Place cpu_place,
const DenseTensor* x)
: MKLDNNHandlerNoCachingT<T,
: OneDNNHandlerNoCachingT<T,
dnnl::eltwise_forward,
dnnl::eltwise_backward>(engine, cpu_place) {
this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training,
@@ -279,7 +661,7 @@ class ActivationOneDNNHandler
Place cpu_place,
const DenseTensor* x,
const DenseTensor* dout)
: MKLDNNHandlerNoCachingT<T,
: OneDNNHandlerNoCachingT<T,
dnnl::eltwise_forward,
dnnl::eltwise_backward>(engine, cpu_place) {
this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training,
@@ -330,7 +712,7 @@ class ReorderOneDNNHandler {
return std::make_shared<dnnl::memory>(md, engine_, ptr);
}
std::shared_ptr<dnnl::memory> AcquireSrcMemory(const MKLDNNMemoryFormat& fmt,
std::shared_ptr<dnnl::memory> AcquireSrcMemory(const OneDNNMemoryFormat& fmt,
void* ptr) {
auto md = dnnl::memory::desc(dims_, dtype_, fmt);
return std::make_shared<dnnl::memory>(md, engine_, ptr);
@@ -347,7 +729,7 @@ class ReorderOneDNNHandler {
}
std::shared_ptr<dnnl::memory> AcquireDstMemory(DenseTensor* output,
const MKLDNNMemoryFormat& fmt,
const OneDNNMemoryFormat& fmt,
Place place) {
auto dst_md = OneDNNMemDesc(dims_, dtype_dst_, fmt);
auto dst_data = output->mutable_data(place, ptype_dst_, dst_md.get_size());
@@ -372,7 +754,7 @@ class ReorderOneDNNHandler {
std::shared_ptr<dnnl::memory> AcquireDstMemory(
DenseTensor* output,
const std::vector<int64_t>& dims,
const MKLDNNMemoryFormat& fmt,
const OneDNNMemoryFormat& fmt,
Place place) {
auto dst_md = OneDNNMemDesc(dims, dtype_dst_, fmt);
auto dst_data = output->mutable_data(place, ptype_dst_, dst_md.get_size());
@@ -400,5 +782,170 @@ class ReorderOneDNNHandler {
dnnl::engine engine_;
};
template <typename T>
class BinaryOneDNNHandler : public OneDNNHandlerNoCachingT<T, dnnl::binary> {
public:
BinaryOneDNNHandler(const dnnl::algorithm algo,
const int axis,
const dnnl::engine engine,
Place cpu_place,
const DenseTensor* x,
const DenseTensor* y,
DenseTensor* out,
float scale_x,
float scale_y,
float scale_out,
const dnnl::post_ops& post_ops = dnnl::post_ops{})
: OneDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
const auto src_x_tz = vectorize(x->dims());
const auto src_y_tz = vectorize(y->dims());
// If the output tensor (z) is nullptr, we are computing into an
// oneDNN-managed buffer
auto rankdiff = x->dims().size() - y->dims().size();
const auto dst_tz = (out == nullptr) ? (rankdiff > 0 ? src_x_tz : src_y_tz)
: vectorize(out->dims());
auto src0_md = x->mem_desc();
auto src1_md = y->mem_desc();
if (rankdiff > 0) { // Second input is of smaller rank than first
std::vector<int64_t> dims1_ex(rankdiff, 1);
dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)),
src_y_tz.begin(),
src_y_tz.end());
// For NHWC broadcasting we need to rotate the extended shape
if (OneDNNContext::tls().get_cur_paddle_data_layout() ==
DataLayout::kNHWC) {
std::rotate(dims1_ex.begin() + 1, dims1_ex.end() - 1, dims1_ex.end());
}
src1_md = src1_md.reshape(dims1_ex);
} else if (rankdiff < 0) { // First input is of smaller rank than second
std::vector<int64_t> dims0_ex(-rankdiff, 1);
dims0_ex.insert(next(dims0_ex.begin(), (axis == -1 ? -rankdiff : axis)),
src_x_tz.begin(),
src_x_tz.end());
// For NHWC broadcasting we need to rotate the extended shape
if (OneDNNContext::tls().get_cur_paddle_data_layout() ==
DataLayout::kNHWC) {
std::rotate(dims0_ex.begin() + 1, dims0_ex.end() - 1, dims0_ex.end());
}
src0_md = src0_md.reshape(dims0_ex);
}
const auto dst_md =
memory::desc(dst_tz, oneDNNGetDataType<T>(), OneDNNMemoryFormat::any);
auto attributes =
CreateAttributes(algo, scale_x, scale_y, scale_out, post_ops);
if (x->numel() < y->numel()) {
this->AcquireForwardPrimitiveDescriptor(
attributes, algo, src1_md, src0_md, dst_md);
} else {
this->AcquireForwardPrimitiveDescriptor(
attributes, algo, src0_md, src1_md, dst_md);
}
}
std::shared_ptr<dnnl::memory> AcquireSecondSrcMemory(
const DenseTensor* input) {
const T* input_data = input->data<T>();
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->src1_desc(),
to_void_cast<T>(input_data));
}
private:
static inline dnnl::primitive_attr CreateAttributes(
dnnl::algorithm op,
float scale_x,
float scale_y,
float scale_out,
dnnl::post_ops post_ops = dnnl::post_ops{}) {
// Scales set in attributes for inputs contribute to the output equation
// in the following way (assuming no broadcasting takes place):
// output_i = scale_0 * x_i <+ or *> scale_1 * y_i;
// Hence we have to create scales that will:
// 1. Dequantize both values, by multiplying with (1.0 / scale_x_or_y)
// 2. Quantize their result to output scale range, by multiplying with
// (scale_z)
// If we combine these two, we end up with the following equation
// output = scale_out * (1/scale_x * x <* or +> 1/scale_y * y)
// Hence, to mimic such behaviour using the provided interface,
// For add operation the equation is equal to:
// output = (scale_out / scale_x) * x + (scale_out / scale_y) * y
// <scale_0> <scale_1>
// For mul operation on the other hand
// output = (scale_out / scale_x) * x * (1.0 / scale_y) * y
// <scale_0> <scale_1>
float scale_0 = scale_out / scale_x;
float scale_1 =
op == dnnl::algorithm::binary_add ? scale_out / scale_y : 1.0 / scale_y;
dnnl::primitive_attr attributes;
attributes.set_scales(
/* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0, {scale_0});
attributes.set_scales(
/* input_y_id = */ DNNL_ARG_SRC_1, /* mask = */ 0, {scale_1});
if (post_ops.len() > 0) attributes.set_post_ops(post_ops);
return attributes;
}
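// Worked example (hypothetical scales): scale_x = 2, scale_y = 4, scale_out = 8.
// binary_add: scale_0 = 8 / 2 = 4 and scale_1 = 8 / 4 = 2, so
//             output = 4 * x + 2 * y == 8 * (x / 2 + y / 4).
// binary_mul: scale_0 = 8 / 2 = 4 and scale_1 = 1 / 4, so
//             output = 4 * x * 0.25 * y == 8 * (x / 2) * (y / 4).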
};
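The rank extension performed in the constructor above can be checked in isolation; a minimal sketch of the same insert logic (standalone code, assuming NCHW layout so the NHWC rotation is skipped; ExtendDims is a hypothetical helper name):

#include <cstdint>
#include <iterator>
#include <vector>

// Mirrors the dims1_ex computation in BinaryOneDNNHandler: pad y's shape with
// 1s up to x's rank, splicing y's real dims in at the broadcast axis.
std::vector<int64_t> ExtendDims(const std::vector<int64_t>& y_tz,
                                int rankdiff,
                                int axis) {
  std::vector<int64_t> dims1_ex(rankdiff, 1);
  dims1_ex.insert(std::next(dims1_ex.begin(), axis == -1 ? rankdiff : axis),
                  y_tz.begin(),
                  y_tz.end());
  return dims1_ex;
}

// ExtendDims({3, 4}, 2, 1) -> {1, 3, 4, 1}: y is broadcast along N and W of x.
// ExtendDims({5}, 3, -1)   -> {1, 1, 1, 5}: axis == -1 places y at the back.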
template <typename T>
class BroadcastDataOneDNNHandler
: public OneDNNHandlerNoCachingT<T, dnnl::binary> {
public:
BroadcastDataOneDNNHandler(const dnnl::algorithm algo,
const dnnl::engine engine,
Place cpu_place,
const DenseTensor* x,
DenseTensor* out,
float scale_x,
float scale_y,
const std::vector<int64_t>& extended_x_dims)
: OneDNNHandlerNoCachingT<T, dnnl::binary>(engine, cpu_place) {
const auto src0_tz = vectorize(out->dims());
const auto src0_md = dnnl::memory::desc(
src0_tz, oneDNNGetDataType<T>(), GetPlainOneDNNFormat(src0_tz.size()));
const auto src1_md = x->mem_desc().reshape(extended_x_dims);
dnnl::primitive_attr attributes;
attributes.set_scales(DNNL_ARG_SRC_0, 0, {scale_x});
attributes.set_scales(DNNL_ARG_SRC_1, 0, {scale_y});
this->AcquireForwardPrimitiveDescriptor(
attributes, algo, src0_md, src1_md, src0_md);
}
template <typename T_out = T>
std::shared_ptr<dnnl::memory> AcquireZeroedDstMemory(DenseTensor* out) {
T_out* ptr = out->mutable_data<T_out>(this->place_,
this->fwd_pd_->dst_desc().get_size());
memset(ptr, 0, this->fwd_pd_->dst_desc().get_size());
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->dst_desc(), ptr);
}
};
template <typename T>
class ReductionOneDNNHandler
: public OneDNNHandlerNoCachingT<T, dnnl::reduction> {
public:
ReductionOneDNNHandler(const dnnl::algorithm algo,
const float p,
const float eps,
const dnnl::engine engine,
Place cpu_place,
const DenseTensor* x,
const DenseTensor* out,
std::vector<int64_t> out_tz,
const dnnl::primitive_attr& attrs = NULL)
: OneDNNHandlerNoCachingT<T, dnnl::reduction>(engine, cpu_place) {
const auto out_md = memory::desc(
out_tz, oneDNNGetDataType<T>(), dnnl::memory::format_tag::any);
if (attrs)
this->AcquireForwardPrimitiveDescriptor(
attrs, algo, x->mem_desc(), out_md, p, eps);
else
this->AcquireForwardPrimitiveDescriptor(
algo, x->mem_desc(), out_md, p, eps);
}
};
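A hedged usage sketch of the relocated handler, mirroring how the other oneDNN kernels in this diff drive theirs (the surrounding kernel plumbing and the x, out, out_tz variables are assumed, not part of this commit):

// ReductionOneDNNHandler<float> handler(dnnl::algorithm::reduction_mean,
//                                       /*p=*/0.0f, /*eps=*/0.0f,
//                                       onednn_engine, dev_ctx.GetPlace(),
//                                       &x, out, out_tz);
// auto src_memory_p = handler.AcquireSrcMemory(&x);
// auto dst_memory_p = handler.AcquireDstMemory(out);
// auto reduction_p = handler.AcquireForwardPrimitive();
// auto& astream = OneDNNContext::tls().get_stream();
// reduction_p->execute(astream,
//                      {{DNNL_ARG_SRC, *src_memory_p},
//                       {DNNL_ARG_DST, *dst_memory_p}});
// astream.wait();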
} // namespace funcs
} // namespace phi
@@ -32,7 +32,7 @@ namespace experimental {
* more specific, we need to distinguish the calculation method.
*
* Such as the kernel for CPU device, it can be a native CPU kernel,
* or a kernel implemented by MKLDNN library.
* or a kernel implemented by oneDNN library.
*
* Note(chenweihang): HIP is not needed now, we can added it if needed
* in the future
......
@@ -40,7 +40,7 @@ enum class DataLayout {
NCHW,
NCDHW,
NDHWC,
MKLDNN,
ONEDNN,
SPARSE_COO,
SPARSE_CSR,
PSTRING_UNION,
@@ -62,7 +62,7 @@ enum class DataLayout {
kAnyLayout = ANY,
kNHWC = NHWC,
kNCHW = NCHW,
kMKLDNN = MKLDNN, // all layouts supported by MKLDNN internally
kMKLDNN = ONEDNN, // all layouts supported by ONEDNN internally
kNDHWC = NDHWC,
kNCDHW = NCDHW,
};
......
@@ -14,8 +14,8 @@
#include "paddle/phi/kernels/log_softmax_kernel.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
#include "paddle/phi/backends/onednn/onednn_context.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/phi/common/bfloat16.h"
#include "paddle/phi/common/place.h"
#include "paddle/phi/core/kernel_registry.h"
@@ -23,16 +23,15 @@
namespace phi {
template <typename T>
class LogSoftmaxMKLDNNHandler
: public paddle::platform::
MKLDNNHandlerNoCachingT<T, dnnl::logsoftmax_forward> {
class LogSoftmaxOneDNNHandler
: public funcs::OneDNNHandlerNoCachingT<T, dnnl::logsoftmax_forward> {
public:
LogSoftmaxMKLDNNHandler(const dnnl::engine mkldnn_engine,
LogSoftmaxOneDNNHandler(const dnnl::engine onednn_engine,
Place cpu_place,
const DenseTensor& x,
const int axis)
: paddle::platform::MKLDNNHandlerNoCachingT<T, dnnl::logsoftmax_forward>(
mkldnn_engine, cpu_place) {
: funcs::OneDNNHandlerNoCachingT<T, dnnl::logsoftmax_forward>(
onednn_engine, cpu_place) {
this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_inference, x.mem_desc(), axis);
}
@@ -43,11 +42,11 @@ void LogSoftmaxKernel(const Context& dev_ctx,
const DenseTensor& x,
int axis,
DenseTensor* out) {
const auto& mkldnn_engine = dev_ctx.GetEngine();
const auto& onednn_engine = dev_ctx.GetEngine();
axis = axis >= 0 ? axis : x.dims().size() + axis;
LogSoftmaxMKLDNNHandler<T> handler(
mkldnn_engine, dev_ctx.GetPlace(), x, axis);
LogSoftmaxOneDNNHandler<T> handler(
onednn_engine, dev_ctx.GetPlace(), x, axis);
auto src_memory_p = handler.AcquireSrcMemory(&x);
auto dst_memory_p = handler.AcquireDstMemory(out);
......
@@ -97,7 +97,7 @@ void TransferLayoutMKLDNN(const Context& dev_ctx,
// NOTE(zhiqiu): to handle the special case in ApplyDataTransform() in
// data_transfer.cc
if (!x.IsInitialized() && src_layout == DataLayout::MKLDNN &&
if (!x.IsInitialized() && src_layout == DataLayout::ONEDNN &&
dst_layout == DataLayout::NHWC) {
VLOG(4) << src_layout << "->" << dst_layout << " " << x.layout();
out->Resize(x.dims());
@@ -106,7 +106,7 @@ void TransferLayoutMKLDNN(const Context& dev_ctx,
return;
}
if (src_layout != DataLayout::MKLDNN && dst_layout == DataLayout::MKLDNN) {
if (src_layout != DataLayout::ONEDNN && dst_layout == DataLayout::ONEDNN) {
// Case1 - transform from Non-MKLDNN OPKernel to MKLDNN OPKernel
// Just set layout/format. No real transform occur
auto out_format = funcs::OneDNNFormatForSize(
@@ -121,16 +121,16 @@ void TransferLayoutMKLDNN(const Context& dev_ctx,
OneDNNContext::tls().set_cur_paddle_data_layout(src_layout);
}
out->set_layout(DataLayout::MKLDNN);
out->set_layout(DataLayout::ONEDNN);
out->set_format(out_format);
} else if (src_layout == DataLayout::MKLDNN &&
dst_layout != DataLayout::MKLDNN) {
} else if (src_layout == DataLayout::ONEDNN &&
dst_layout != DataLayout::ONEDNN) {
// Case2 - transform from MKLDNN OPKernel to Non-MKLDNN OPKernel
// Do transform via MKLDNN lib
funcs::innerTransDataLayoutFromOneDNN(
src_layout, dst_layout, x, out, dev_ctx.GetPlace());
} else if (src_layout == DataLayout::MKLDNN &&
dst_layout == DataLayout::MKLDNN) {
} else if (src_layout == DataLayout::ONEDNN &&
dst_layout == DataLayout::ONEDNN) {
PADDLE_ENFORCE_NE(
src_layout,
dst_layout,
......
@@ -37,7 +37,7 @@ TEST(DataLayout, OStream) {
oss << phi::DataLayout::NCHW;
EXPECT_EQ(oss.str(), "NCHW");
oss.str("");
oss << phi::DataLayout::MKLDNN;
oss << phi::DataLayout::ONEDNN;
EXPECT_EQ(oss.str(), "MKLDNN");
oss.str("");
try {
......
@@ -40,7 +40,7 @@ TEST(DEV_API, transfer_layout) {
DenseTensor x;
MetaTensor meta_x(&x);
meta_x.set_dtype(DataType::FLOAT32);
meta_x.set_layout(DataLayout::MKLDNN);
meta_x.set_layout(DataLayout::ONEDNN);
meta_x.set_dims(make_ddim({n, c, h, w}));
DenseTensor out;
......