Unverified commit a7944904, authored by Jacek Czaja, committed by GitHub

[oneDNN]elementwise_add and elementwise_mul int8 support (#24984)

* Start implementing int8 eltwise add

test=develop

* - Fix to Michal PR

* - Fix

test=develop

* - Lint fixes

test=develop

* - Added checking if elementwise_mul can be used

test=develop

* - Added attribs to skip_attrs_set

test=develop

* - Improved broadcasting

test=develop

- fixes to compilation

- fix

- fix

- Lint fixes

test=develop

* - removed redundant condition

test=develop
Co-authored-by: Michal Gallus <michal.gallus@intel.com>
Parent 84358115
@@ -1881,8 +1881,16 @@ PDNode *patterns::MultipleQuantize::operator()() {
PDNode *patterns::MKLDNNInPlace::operator()() {
  const std::unordered_set<std::string> &supported_op_types = {
-      "abs", "elementwise_add", "gelu", "leaky_relu", "relu", "softmax",
-      "sqrt", "swish", "tanh"};
+      "abs",
+      "elementwise_mul",
+      "elementwise_add",
+      "gelu",
+      "leaky_relu",
+      "relu",
+      "softmax",
+      "sqrt",
+      "swish",
+      "tanh"};
  auto possible_inplace_op = pattern->NewNode(inplace_to_be_op_repr())
                                 ->assert_is_ops(supported_op_types);
......
@@ -28,39 +28,31 @@ class ElementwiseMulOp : public ElementwiseOp {
  using Tensor = framework::Tensor;
  using ElementwiseOp::ElementwiseOp;

-#ifdef PADDLE_WITH_MKLDNN
-  static bool AreDimsAndFormatCorrect(const framework::ExecutionContext& ctx,
-                                      int simd_width,
-                                      mkldnn::memory::format_tag x_format) {
-    using Tensor = framework::Tensor;
-    using paddle::framework::vectorize;
-    using mkldnn::memory;
-    auto* x = ctx.Input<Tensor>("X");
-    auto* y = ctx.Input<Tensor>("Y");
-    auto x_dims = vectorize(x->dims());
-    const bool are_dims_divisable = !(x_dims[1] % simd_width);
-    const bool is_x_format_correct = x->format() == x_format;
-    const bool is_y_format_correct = vectorize(y->dims()).size() == 2;
-    return are_dims_divisable && is_x_format_correct && is_y_format_correct;
-  }
-#endif
-
  framework::OpKernelType GetExpectedKernelType(
      const framework::ExecutionContext& ctx) const override {
    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");

#ifdef PADDLE_WITH_MKLDNN
    using mkldnn::memory;
-    if (platform::CanMKLDNNBeUsed(ctx)) {
-      bool can_use_avx512_kernel =
-          platform::MayIUse(platform::avx512f) &&
-          AreDimsAndFormatCorrect(ctx, 16, memory::format_tag::nChw16c);
-      if (can_use_avx512_kernel) {
+    auto CanMKLDNNElementwiseMulBeUsed = [&]() {
+      auto x_dims = ctx.Input<Tensor>("X")->dims();
+      auto y_dims = ctx.Input<Tensor>("Y")->dims();
+      int rankdiff = x_dims.size() - y_dims.size();
+      // TODO(jczaja): Remove this when oneDNN performance for scalar
+      // broadcasting is improved (Ernie large situation)
+      if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) {
+        return false;
+      }
+      return true;
+    };
+
+    if (platform::CanMKLDNNBeUsed(ctx) && CanMKLDNNElementwiseMulBeUsed()) {
      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
                                     framework::DataLayout::kMKLDNN,
                                     framework::LibraryType::kMKLDNN);
    }
-    }
#endif

    return framework::OpKernelType(input_data_type, ctx.GetPlace());
  }
......
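The CanMKLDNNElementwiseMulBeUsed check above only rejects the scalar-broadcast case (a one-element, rank-1 Y broadcast over a higher-rank X). A minimal standalone sketch of that predicate, using plain std::vector shapes instead of Paddle's DDim; the helper name is made up for illustration only:

#include <cstdint>
#include <iostream>
#include <vector>

// Mirrors the lambda above: oneDNN is skipped only for scalar broadcasting,
// i.e. a single-element 1-D Y combined with a higher-rank X.
bool CanUseOneDNNElementwiseMul(const std::vector<int64_t>& x_dims,
                                const std::vector<int64_t>& y_dims) {
  const int rankdiff =
      static_cast<int>(x_dims.size()) - static_cast<int>(y_dims.size());
  if (rankdiff != 0 && y_dims.size() == 1 && y_dims[0] == 1) {
    return false;  // fall back to the native (non-oneDNN) kernel
  }
  return true;
}

int main() {
  std::cout << CanUseOneDNNElementwiseMul({2, 3, 4, 5}, {2, 3, 4, 5})  // 1
            << CanUseOneDNNElementwiseMul({2, 3, 4, 5}, {4, 5})        // 1
            << CanUseOneDNNElementwiseMul({2, 3, 4, 5}, {1})           // 0
            << std::endl;
  return 0;
}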
@@ -100,15 +100,7 @@ class ElementwiseOp : public framework::OperatorWithKernel {
    auto input_data_type = OperatorWithKernel::IndicateVarDataType(ctx, "X");
#ifdef PADDLE_WITH_MKLDNN
-    auto CanMKLDNNElementwiseAddBeUsed = [&]() {
-      int axis = ctx.Attr<int>("axis");
-      int rankdiff = ctx.Input<Tensor>("X")->dims().size() -
-                     ctx.Input<Tensor>("Y")->dims().size();
-      return (rankdiff == 0) || (axis == -1) || (axis == rankdiff);
-    };
-
-    if (platform::CanMKLDNNBeUsed(ctx) &&
-        (ctx.Type() != "elementwise_add" || CanMKLDNNElementwiseAddBeUsed())) {
+    if (platform::CanMKLDNNBeUsed(ctx)) {
      return framework::OpKernelType(input_data_type, ctx.GetPlace(),
                                     framework::DataLayout::kMKLDNN,
                                     framework::LibraryType::kMKLDNN);
@@ -148,6 +140,21 @@ class ElementwiseOpMaker : public framework::OpProtoAndCheckerMaker {
        .SetDefault("");
    AddAttr<std::string>("y_data_format", "This parameter is no longer used.")
        .SetDefault("");
+    /* int8 parameters */
+    AddAttr<bool>("use_quantizer",
+                  "(bool, default false) "
+                  "Set to true for operators that should be quantized and use "
+                  "int8 kernel. Only used on CPU.")
+        .SetDefault(false);
+    AddAttr<float>("Scale_x",
+                   "(float, default 1.0f), The quantize scale of X tensor")
+        .SetDefault(1.0f);
+    AddAttr<float>("Scale_y",
+                   "(float, default 1.0f), The quantize scale of Y tensor")
+        .SetDefault(1.0f);
+    AddAttr<float>("Scale_out",
+                   "(float, default 1.0f), The quantize scale of output data")
+        .SetDefault(1.0f);
    AddOpComment();
  }
......
-/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
-Licensed under the Apache License, Version 2.0 (the "License");
-you may not use this file except in compliance with the License.
-You may obtain a copy of the License at
-    http://www.apache.org/licenses/LICENSE-2.0
-Unless required by applicable law or agreed to in writing, software
-distributed under the License is distributed on an "AS IS" BASIS,
-WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
-See the License for the specific language governing permissions and
-limitations under the License. */
+// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.

+#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
-#include "paddle/fluid/memory/memcpy.h"
-#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/framework/data_layout_transform.h"
-#include "paddle/fluid/platform/mkldnn_reuse.h"

namespace paddle {
namespace operators {
using framework::DataLayout;
using framework::Tensor;
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::reorder;
using mkldnn::stream;
using mkldnn::sum;
template <typename T>
class EltwiseAddMKLDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const auto* x = ctx.Input<Tensor>("X");
const auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
platform::BinaryMKLDNNHandler<T> handler(
dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z, ctx.OutputName("Out"));
const auto src_x_memory = handler.AcquireSrcMemory(x);
const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
// For Inplace src and dst are the same memory object
const auto dst_memory =
x->IsSharedBufferWith(*z) ? src_x_memory : handler.AcquireDstMemory(z);
const auto binary_prim = handler.AcquireForwardPrimitive();
mkldnn::stream astream(mkldnn_engine);
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_x_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_memory}};
binary_prim->execute(astream, args);
astream.wait();
z->set_layout(DataLayout::kMKLDNN);
z->set_format(platform::GetMKLDNNFormat(*dst_memory));
}
};
template <typename T>
class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {
 public:
@@ -106,8 +53,11 @@ class EltwiseAddMKLDNNGradKernel : public ElemwiseGradKernel<T> {

namespace ops = paddle::operators;

-REGISTER_OP_KERNEL(elementwise_add, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::EltwiseAddMKLDNNKernel<float>)
+REGISTER_OP_KERNEL(
+    elementwise_add, MKLDNN, ::paddle::platform::CPUPlace,
+    ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_add>,
+    ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_add>,
+    ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_add>)

REGISTER_OP_KERNEL(elementwise_add_grad, MKLDNN, ::paddle::platform::CPUPlace,
                   ops::EltwiseAddMKLDNNGradKernel<float>)
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <unordered_map>
#include "paddle/fluid/memory/memcpy.h"
#include "paddle/fluid/operators/elementwise/elementwise_add_op.h"
#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
#include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/platform/mkldnn_reuse.h"
namespace paddle {
namespace operators {
using framework::DataLayout;
using framework::Tensor;
using mkldnn::memory;
using mkldnn::primitive;
using mkldnn::stream;
template <typename T, dnnl::algorithm BINARY_OP>
class EltwiseMKLDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
const auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
const auto* x = ctx.Input<Tensor>("X");
const auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
float scale_x = ctx.Attr<float>("Scale_x");
float scale_y = ctx.Attr<float>("Scale_y");
float scale_o = ctx.Attr<float>("Scale_out");
int axis = ctx.Attr<int>("axis");
platform::BinaryMKLDNNHandler<T> handler(
BINARY_OP, axis, dev_ctx, mkldnn_engine, ctx.GetPlace(), x, y, z,
scale_x, scale_y, scale_o, ctx.OutputName("Out"));
const auto src_x_memory = handler.AcquireSrcMemory(x);
const auto src_y_memory = handler.AcquireSecondSrcMemory(y);
// For Inplace src and dst are the same memory object
const auto dst_memory =
x->IsSharedBufferWith(*z) ? src_x_memory : handler.AcquireDstMemory(z);
const auto binary_prim = handler.AcquireForwardPrimitive();
mkldnn::stream astream(mkldnn_engine);
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_x_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_memory}};
binary_prim->execute(astream, args);
astream.wait();
z->set_layout(DataLayout::kMKLDNN);
z->set_format(platform::GetMKLDNNFormat(*dst_memory));
}
};
} // namespace operators
} // namespace paddle
-/* Copyright (c) 2016 PaddlePaddle Authors. All Rights Reserved.
+/* Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.

Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
@@ -12,94 +12,12 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */

+#include "paddle/fluid/operators/elementwise/mkldnn/elementwise_mkldnn_op.h"
-#include <mkldnn/include/mkldnn.hpp>
-#include "paddle/fluid/operators/elementwise/elementwise_op.h"
-#include "paddle/fluid/operators/elementwise/elementwise_op_function.h"
-#include "paddle/fluid/operators/jit/kernels.h"
-#include "paddle/fluid/platform/cpu_info.h"
-#include "paddle/fluid/platform/mkldnn_helper.h"
#ifdef PADDLE_WITH_XBYAK
#include "xbyak/xbyak.h"
#include "xbyak/xbyak_util.h"
#endif
namespace paddle {
namespace operators {
using framework::DataLayout;
using mkldnn::memory;
using platform::StringToMKLDNNFormat;
template <typename T>
static void ComputeBroadcastedMultiply(const T* x_data, const T* y_data,
T* z_data, int64_t n, int64_t c,
int64_t h, int64_t w, int simd_width,
void (*multiply)(const T*, const T*, T*,
int, int)) {
const int64_t C = c / simd_width;
#pragma omp parallel for collapse(2)
for (int ni = 0; ni < n; ni++) {
for (int ci = 0; ci < C; ci++) {
auto ptr_x =
x_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
auto ptr_y = y_data + ni * C * simd_width + ci * simd_width;
auto ptr_z =
z_data + ni * C * h * w * simd_width + ci * h * w * simd_width;
multiply(ptr_x, ptr_y, ptr_z, h, w);
}
}
}
template <typename T>
class ElementwiseMulMKLDNNKernel : public framework::OpKernel<T> {
public:
void Compute(const framework::ExecutionContext& ctx) const override {
using Tensor = framework::Tensor;
int axis = ctx.Attr<int>("axis");
auto* x = ctx.Input<Tensor>("X");
auto* y = ctx.Input<Tensor>("Y");
auto* z = ctx.Output<Tensor>("Out");
const T* x_data = x->data<T>();
const T* y_data = y->data<T>();
T* z_data = z->mutable_data<T>(ctx.GetPlace());
auto x_dims = x->dims();
auto y_dims_untrimmed = y->dims();
auto x_int_dims = paddle::framework::vectorize<int64_t>(x_dims);
int pre, num, post, is_run_common_broadcast;
get_mid_dims(x_dims, y_dims_untrimmed, axis, &pre, &num, &post,
&is_run_common_broadcast);
if (post == 1)
PADDLE_THROW(
platform::errors::Unimplemented("Not implemented when post is 1."));
const int64_t n = x_dims[0];
const int64_t c = x_dims[1];
const int64_t h = x_dims[2];
const int64_t w = x_dims[3];
const int simd_width = 16;
auto multiply =
jit::KernelFuncs<jit::NCHW16CMulNCTuple<T>, platform::CPUPlace>::Cache()
.At(0);
ComputeBroadcastedMultiply(x_data, y_data, z_data, n, c, h, w, simd_width,
multiply);
z->set_layout(DataLayout::kMKLDNN);
z->set_format(x->format());
}
};
} // namespace operators
} // namespace paddle
namespace ops = paddle::operators;

-REGISTER_OP_KERNEL(elementwise_mul, MKLDNN, ::paddle::platform::CPUPlace,
-                   ops::ElementwiseMulMKLDNNKernel<float>)
+REGISTER_OP_KERNEL(
+    elementwise_mul, MKLDNN, ::paddle::platform::CPUPlace,
+    ops::EltwiseMKLDNNKernel<float, dnnl::algorithm::binary_mul>,
+    ops::EltwiseMKLDNNKernel<int8_t, dnnl::algorithm::binary_mul>,
+    ops::EltwiseMKLDNNKernel<uint8_t, dnnl::algorithm::binary_mul>)
@@ -120,8 +120,12 @@ class MKLDNNHandlerT {
    return (dev_ctx_.GetBlob(key_p) != nullptr);
  }

-  template <typename... Args>
-  void AcquireForwardPrimitiveDescriptor(Args&&... args) {
+  // If your primitive descriptor requires attributes, pass them as the
+  // first argument and the parameters for the descriptor constructor in the
+  // following arguments. Otherwise, all arguments will be forwarded to the
+  // descriptor constructor, including the first one.
+  template <typename Arg, typename... Args>
+  void AcquireForwardPrimitiveDescriptor(Arg&& first_arg, Args&&... args) {
    // Forward PD has to be passed to Grad op that
    // may be executed by different thread, hence
    // for that one we use key that does not contain TID
@@ -135,14 +139,34 @@ class MKLDNNHandlerT {
      fwd_pd_ = std::static_pointer_cast<typename TForward::primitive_desc>(
          dev_ctx_.GetBlob(key_pd));
      if (fwd_pd_ == nullptr) {
-        auto fwd_desc = typename TForward::desc(std::forward<Args>(args)...);
-        fwd_pd_ = std::make_shared<typename TForward::primitive_desc>(fwd_desc,
-                                                                      engine_);
+        CreateForwardPrimitiveDescriptor(first_arg,
+                                         std::forward<Args>(args)...);
        dev_ctx_.SetBlob(key_pd, fwd_pd_);
      }
    }
  }
// Using sfinae to specialise variadic function. Workaround for not having
// if constexpr in C++ 11.
template <class First, class... Args>
typename std::enable_if<std::is_same<typename std::decay<First>::type,
dnnl::primitive_attr>::value>::type
CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
auto fwd_desc = typename TForward::desc(std::forward<Args>(args)...);
fwd_pd_ = std::make_shared<typename TForward::primitive_desc>(
fwd_desc, first, engine_);
}
template <class First, class... Args>
typename std::enable_if<!std::is_same<typename std::decay<First>::type,
dnnl::primitive_attr>::value>::type
CreateForwardPrimitiveDescriptor(First&& first, Args&&... args) {
auto fwd_desc = typename TForward::desc(std::forward<First>(first),
std::forward<Args>(args)...);
fwd_pd_ =
std::make_shared<typename TForward::primitive_desc>(fwd_desc, engine_);
}
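The two CreateForwardPrimitiveDescriptor overloads above use enable_if on the decayed type of the first argument as a C++11 stand-in for if constexpr. The same dispatch pattern in a self-contained sketch, with a dummy Attr struct playing the role of dnnl::primitive_attr (all names below are illustrative, not part of the handler):

#include <iostream>
#include <type_traits>
#include <utility>

struct Attr {};  // stand-in for dnnl::primitive_attr

// Selected when the first argument is an Attr: consume it separately.
template <class First, class... Args>
typename std::enable_if<
    std::is_same<typename std::decay<First>::type, Attr>::value>::type
Create(First&& /*attr*/, Args&&... args) {
  std::cout << "with attributes, " << sizeof...(Args) << " descriptor args\n";
}

// Selected otherwise: the first argument belongs to the descriptor arguments.
template <class First, class... Args>
typename std::enable_if<
    !std::is_same<typename std::decay<First>::type, Attr>::value>::type
Create(First&& first, Args&&... args) {
  std::cout << "no attributes, " << (1 + sizeof...(Args))
            << " descriptor args\n";
}

int main() {
  Create(Attr{}, 1, 2.0);  // prints: with attributes, 2 descriptor args
  Create(1, 2.0, 'x');     // prints: no attributes, 3 descriptor args
  return 0;
}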
  template <typename... Args>
  void AcquireBackwardPrimitiveDescriptor(Args&&... args) {
    const std::string key_fwd_pd = key_common_ + "@forward_pd";
@@ -385,18 +409,23 @@ class MKLDNNHandler {
template <typename T>
class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
 public:
-  BinaryMKLDNNHandler(const MKLDNNDeviceContext& dev_ctx,
+  BinaryMKLDNNHandler(const dnnl::algorithm algo, const int axis,
+                      const MKLDNNDeviceContext& dev_ctx,
                      const mkldnn::engine engine, platform::Place cpu_place,
                      const Tensor* x, const Tensor* y, Tensor* z,
+                      float scale_x, float scale_y, float scale_z,
                      const std::string& uniq_name)
      : platform::MKLDNNHandlerT<T, dnnl::binary>(
            dev_ctx, engine, cpu_place,
-            platform::CreateKey(framework::vectorize(x->dims()), uniq_name)) {
-    // broadcasting combined with in-place may require longer key
+            platform::CreateKey(
+                framework::vectorize(x->dims()),
+                uniq_name + (algo == dnnl::algorithm::binary_mul ? "M" : ""))) {
+    // broadcasting combined with in-place may require a longer key
    auto rankdiff = x->dims().size() - y->dims().size();
    if (rankdiff > 0) {
-      this->key_ += std::to_string(rankdiff);
-      this->key_common_ += std::to_string(rankdiff);
+      auto suffix = std::to_string(rankdiff);
+      this->key_ += suffix;
+      this->key_common_ += suffix;
    }

    if (!this->isCached()) {
@@ -423,16 +452,17 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
      auto src1_md = dnnl::memory::desc(
          src_y_tz, platform::MKLDNNGetDataType<T>(), y->format());
      if (rankdiff > 0) {
-        std::vector<int64_t> ones(rankdiff, 1);
-        std::vector<int64_t> dims1_ex(src_y_tz);
-        dims1_ex.insert(dims1_ex.begin(), ones.begin(), ones.end());
+        std::vector<int64_t> dims1_ex(rankdiff, 1);
+        dims1_ex.insert(next(dims1_ex.begin(), (axis == -1 ? rankdiff : axis)),
+                        src_y_tz.begin(), src_y_tz.end());
        src1_md = src1_md.reshape(dims1_ex);
      }

      const auto dst_md = memory::desc(dst_tz, platform::MKLDNNGetDataType<T>(),
                                       MKLDNNMemoryFormat::any);

-      this->AcquireForwardPrimitiveDescriptor(dnnl::algorithm::binary_add,
-                                              src0_md, src1_md, dst_md);
+      auto attributes = CreateAttributes(algo, scale_x, scale_y, scale_z);
+      this->AcquireForwardPrimitiveDescriptor(attributes, algo, src0_md,
+                                              src1_md, dst_md);
    }
  }
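The rankdiff branch above pads Y's shape with ones so that both memory descriptors end up with the same rank, and the axis attribute decides where the real Y extents land inside the padded shape. A standalone sketch of just that shape arithmetic (the function below is illustrative, not the handler code):

#include <cstdint>
#include <iostream>
#include <iterator>
#include <vector>

// Expand y_dims (rank < x_rank) to x_rank by padding with 1s, placing the
// real Y extents at `axis` (axis == -1 means "align to the trailing dims").
std::vector<int64_t> ExpandDimsForBroadcast(const std::vector<int64_t>& y_dims,
                                            int x_rank, int axis) {
  const int rankdiff = x_rank - static_cast<int>(y_dims.size());
  std::vector<int64_t> dims_ex(rankdiff, 1);
  dims_ex.insert(std::next(dims_ex.begin(), axis == -1 ? rankdiff : axis),
                 y_dims.begin(), y_dims.end());
  return dims_ex;
}

int main() {
  // X: [2, 3, 4, 5], Y: [4, 5], axis == -1  ->  Y is described as [1, 1, 4, 5]
  for (int64_t d : ExpandDimsForBroadcast({4, 5}, 4, -1)) std::cout << d << " ";
  std::cout << "\n";
  // X: [2, 10, 12, 3], Y: [10, 12], axis == 1  ->  Y is described as [1, 10, 12, 1]
  for (int64_t d : ExpandDimsForBroadcast({10, 12}, 4, 1)) std::cout << d << " ";
  std::cout << "\n";
  return 0;
}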
@@ -442,6 +472,38 @@ class BinaryMKLDNNHandler : public platform::MKLDNNHandlerT<T, dnnl::binary> {
    return this->AcquireMemoryFromPrimitive(
        this->fwd_pd_->src1_desc(), to_void_cast<T>(input_data), "@src1_mem_p");
  }
private:
static inline dnnl::primitive_attr CreateAttributes(dnnl::algorithm op,
float scale_x,
float scale_y,
float scale_z) {
// Scales set in attributes for inputs contribute to the output equation
// in the following way (assuming no broadcasting takes place):
// output_i = scale_0 * x_i <+ or *> scale_1 * y_i;
// Hence we have to create scales that will:
// 1. Dequantize both values, by multiplying with (1.0 / scale_x_or_y)
// 2. Quantize their result to output scale range, by multiplying with
// (scale_z)
// If we combine these two, we end up with following equation
// output = scale_out * (1/scale_x * x <* or +> 1/scale_y * y)
// Hence, to mimic such behaviour using provided interface,
// For add operation the equation is equal to:
// output = (scale_out / scale_x) * x + (scale_out / scale_y) * y
// <scale_0> <scale_1>
// For mul operation on the other hand
// output = (scale_out / scale_x) * x * (1.0 / scale_y) * y
// <scale_0> <scale_1>
float scale_0 = scale_z / scale_x;
float scale_1 =
op == dnnl::algorithm::binary_add ? scale_z / scale_y : 1.0 / scale_y;
dnnl::primitive_attr attributes;
attributes.set_scales(/* input_x_id = */ DNNL_ARG_SRC_0, /* mask = */ 0,
{scale_0});
attributes.set_scales(/* input_y_id = */ DNNL_ARG_SRC_1, /* mask = */ 0,
{scale_1});
return attributes;
}
};

class SumMKLDNNHandler : public MKLDNNHandler {
......
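A worked example of the scale derivation documented in CreateAttributes, under assumed values (Scale_x = 25.4, Scale_y = 12.7, Scale_out = 6.35, i.e. int8 tensors covering roughly ±5, ±10 and ±20): addition gets scale_0 = 6.35 / 25.4 = 0.25 and scale_1 = 6.35 / 12.7 = 0.5, while multiplication keeps scale_0 = 0.25 but uses scale_1 = 1 / 12.7. A minimal sketch of the same arithmetic, separate from the handler:

#include <iostream>
#include <utility>

// Reproduces the scale_0/scale_1 derivation used in CreateAttributes:
//   add: out = (s_out / s_x) * x + (s_out / s_y) * y
//   mul: out = (s_out / s_x) * x * (1 / s_y)     * y
std::pair<float, float> BinaryScales(bool is_add, float scale_x, float scale_y,
                                     float scale_out) {
  const float scale_0 = scale_out / scale_x;
  const float scale_1 = is_add ? scale_out / scale_y : 1.0f / scale_y;
  return {scale_0, scale_1};
}

int main() {
  auto add_scales = BinaryScales(true, 25.4f, 12.7f, 6.35f);
  auto mul_scales = BinaryScales(false, 25.4f, 12.7f, 6.35f);
  std::cout << "add: " << add_scales.first << ", " << add_scales.second << "\n";
  std::cout << "mul: " << mul_scales.first << ", " << mul_scales.second << "\n";
  return 0;
}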
@@ -11907,7 +11907,9 @@ for func in [
        Default is None. It's used to print debug info for developers. Details: \
        :ref:`api_guide_Name` "
        ],
-        skip_attrs_set={"x_data_format", "y_data_format", "axis"
+        skip_attrs_set={
+            "x_data_format", "y_data_format", "axis", "use_quantizer",
+            "Scale_x", "Scale_y", "Scale_out"
        }) + """\n""" + str(func.__doc__)

for func in []:
......
@@ -15,19 +15,11 @@
from __future__ import print_function
import unittest
import numpy as np
-from paddle.fluid.tests.unittests.test_elementwise_add_op import *
-'''
-MKLDNN does not support tensors of dimensions number equal to 3.
-Such dimensions cause exceptions in MKLDNN reorder primitive.
-The DNNL-based kernel is used only when broadcasting is not required
-(see GetExpectedKernelType() methods in elementwise_add_op.h).
-'''
+from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci
+from paddle.fluid.tests.unittests.test_elementwise_add_op import TestElementwiseAddOp


class TestMKLDNNElementwiseAddOp(TestElementwiseAddOp):
-    def init_data_format(self):
-        self.data_format = 'MKLDNN'
-
    def init_kernel_type(self):
        self.use_mkldnn = True

@@ -66,5 +58,96 @@ class TestMKLDNNElementwiseAddOp4(TestMKLDNNElementwiseAddOp):
    pass
class TestMKLDNNElementwiseAddOp_broadcast_3(TestMKLDNNElementwiseAddOp):
def init_input_output(self):
self.x = np.random.rand(2, 10, 12, 3).astype(self.dtype)
self.y = np.random.rand(10, 12).astype(self.dtype)
self.out = self.x + self.y.reshape(1, 10, 12, 1)
def init_axis(self):
self.axis = 1
''' INT8 Tests '''
@skip_check_grad_ci(
reason="oneDNN's int8 elementwise_ops don't implemend grad kernel.")
class TestInt8(TestElementwiseAddOp):
def init_kernel_type(self):
self.use_mkldnn = True
self._cpu_only = True
def init_dtype(self):
self.dtype = np.int8
def init_input_output(self):
self.x = np.random.randint(0, 3, (12, 9)).astype("int8")
self.y = np.random.randint(0, 3, (12, 9)).astype("int8")
self.out = np.add(self.x, self.y)
def init_scales(self):
self.attrs['Scale_x'] = 1.0
self.attrs['Scale_y'] = 1.0
self.attrs['Scale_out'] = 1.0
def test_check_output(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
self.init_scales()
self.check_output(check_dygraph=(self.use_mkldnn == False))
def test_check_grad_normal(self):
pass
def test_check_grad_ingore_x(self):
pass
def test_check_grad_ingore_y(self):
pass
class TestInt8Scales(TestInt8):
def quantize(self, tensor, dt="int8"):
max_int = 127.0 if dt == "int8" else 255.0
scale = max_int / np.abs(np.amax(tensor))
quantized = np.round(scale * tensor).astype(dt)
return scale, quantized
def init_input_output(self):
self.x_f = np.random.random((100, )).astype("float")
self.y_f = np.random.random((100, )).astype("float")
self.out_f = np.add(self.x_f, self.y_f)
self.scale_x, self.x = self.quantize(self.x_f)
self.scale_y, self.y = self.quantize(self.y_f)
self.scale_o, self.out = self.quantize(self.out_f)
def init_scales(self):
self.attrs['Scale_x'] = self.scale_x
self.attrs['Scale_y'] = self.scale_y
self.attrs['Scale_out'] = self.scale_o
def test_check_output(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
self.init_scales()
int_atol = 1 # different quantization techniques
self.check_output(
check_dygraph=(self.use_mkldnn == False), atol=int_atol)
class TestUint8Scales(TestInt8Scales):
def init_input_output(self):
self.x_f = np.random.random((100, )).astype("float")
self.y_f = np.random.random((100, )).astype("float")
self.out_f = np.add(self.x_f, self.y_f)
self.scale_x, self.x = self.quantize(self.x_f, "uint8")
self.scale_y, self.y = self.quantize(self.y_f, "uint8")
self.scale_o, self.out = self.quantize(self.out_f, "uint8")
def init_dtype(self):
self.dtype = np.uint8
if __name__ == '__main__':
    unittest.main()
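The quantize() helper in the tests above derives a symmetric scale as max_int / amax(tensor) and stores round(scale * value) as int8 (or uint8). A rough C++ equivalent of that scheme, included only to make the scale/value relationship explicit (it is not part of the test suite):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <iostream>
#include <utility>
#include <vector>

// Symmetric int8 quantization as used by the tests:
//   scale = 127 / |max(values)|, quantized = round(scale * value).
std::pair<float, std::vector<int8_t>> QuantizeInt8(const std::vector<float>& v) {
  float vmax = 0.0f;
  for (float x : v) vmax = std::max(vmax, x);
  const float scale = 127.0f / std::abs(vmax);
  std::vector<int8_t> q;
  q.reserve(v.size());
  for (float x : v) q.push_back(static_cast<int8_t>(std::round(scale * x)));
  return {scale, q};
}

int main() {
  auto result = QuantizeInt8({0.1f, 0.25f, 0.5f});
  std::cout << "scale = " << result.first << "\n";                        // 254
  for (int8_t q : result.second) std::cout << static_cast<int>(q) << " ";  // 25 64 127
  std::cout << "\n";
  return 0;
}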
@@ -15,137 +15,76 @@
from __future__ import print_function
import unittest
import numpy as np
-from paddle.fluid.tests.unittests.op_test import OpTest
-import paddle.fluid.core as core
-from paddle.fluid.op import Operator
-from paddle.fluid.tests.unittests.test_elementwise_mul_op import *
-from paddle.fluid.tests.unittests.test_conv2d_op import conv2d_forward_naive
-from paddle.fluid.tests.unittests.mkldnn.mkldnn_op_test import __assert_close
-import paddle.fluid as fluid
from paddle.fluid.tests.unittests.op_test import skip_check_grad_ci
+from paddle.fluid.tests.unittests.test_elementwise_mul_op import ElementwiseMulOp
+
+
+class TestMKLDNNElementwiseMulOp(ElementwiseMulOp):
+    def init_kernel_type(self):
+        self.use_mkldnn = True
+
+    def init_dtype(self):
+        self.dtype = np.float32
+
-# For UT coverage, integrate conv2d + elementwise-mul so that nchw16C could be automatically chosen when mkldnn-kernel is enabled
-@skip_check_grad_ci(
-    reason="TODO: this test cannot use white list to skip check_grad, need to add check_grad."
-)
-class TestElementwiseMulMKLDNNOp_Integrated_With_Convs(ElementwiseMulOp):
-    def setUp(self):
-        self.dtype = np.float32
self.init_dtype()
self.init_kernel_type()
self.init_axis()
self._cpu_only = True
self.pad = [0, 0]
self.stride = [1, 1]
self.groups = 1
self.input_size = [1, 3, 5, 5] # NCHW
self.filter_size = [16, 3, 3, 3]
self.filter_size2 = [1, 16, 2, 2]
self.dilations = [1, 1]
self.use_cudnn = False
self.data_format = "ANYLAYOUT"
self.input = np.random.random(self.input_size).astype(self.dtype)
self.filter = np.random.random(self.filter_size).astype(self.dtype)
self.filter2 = np.random.random(self.filter_size2).astype(self.dtype)
self.elt_mul_y_size = [1, 16]
self.elt_mul_y = np.random.random(self.elt_mul_y_size).astype(
self.dtype)
conv2d_param = {
'stride': self.stride,
'pad': self.pad,
'dilation': self.dilations
}
conv_out, _, _, _, _ = conv2d_forward_naive(
self.input, self.filter, self.groups, conv2d_param) #[1, 16, 2, 2]
self.conv_output = conv_out
self.elt_mul_output = self.conv_output * self.elt_mul_y.reshape(
1, 16, 1, 1) # the result shape is [1, 16, 2, 2]
conv_output2, _, _, _, _ = conv2d_forward_naive(
self.elt_mul_output, self.filter2, self.groups, conv2d_param)
self.conv_output2 = conv_output2
self.fetch_list = ["conv_output2"]
class TestMKLDNNElementwiseMulOp2(TestMKLDNNElementwiseMulOp):
def init_input_output(self):
self.x = np.random.random((100, )).astype(self.dtype)
self.y = np.random.random((100, )).astype(self.dtype)
self.out = np.multiply(self.x, self.y)
class TestMKLDNNElementwiseMulOp3(TestMKLDNNElementwiseMulOp):
def init_input_output(self):
self.x = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
self.y = np.random.uniform(0.1, 1, [2, 3, 4, 5]).astype(self.dtype)
self.out = np.multiply(self.x, self.y)
class TestMKLDNNElementwiseMulOp4(TestMKLDNNElementwiseMulOp):
def init_input_output(self):
self.x = np.random.uniform(1, 2, [2, 3, 4, 32]).astype(self.dtype)
self.y = np.random.uniform(1, 2, [4, 32]).astype(self.dtype)
self.out = np.multiply(self.x, self.y)
# TODO(jczaja): Enable when grad is ready
def test_check_grad_normal(self):
pass
def test_check_grad_ingore_x(self):
pass
def test_check_grad_ingore_y(self):
pass
''' INT8 Tests '''
@skip_check_grad_ci(
reason="oneDNN's int8 elementwise_ops don't implemend grad kernel.")
class TestInt8(ElementwiseMulOp):
    def init_kernel_type(self):
        self.use_mkldnn = True
+        self._cpu_only = True
+
+    def init_dtype(self):
+        self.dtype = np.int8
+
+    def init_input_output(self):
+        self.x = np.random.randint(0, 3, (12, 9)).astype("int8")
+        self.y = np.random.randint(0, 3, (12, 9)).astype("int8")
+        self.out = np.multiply(self.x, self.y)
+
-    def init_axis(self):
-        self.axis = 0
+    def init_scales(self):
+        self.attrs['Scale_x'] = 1.0
+        self.attrs['Scale_y'] = 1.0
+        self.attrs['Scale_out'] = 1.0

    def test_check_output(self):
+        # TODO(wangzhongpu): support mkldnn op in dygraph mode
+        self.init_scales()
+        self.check_output(check_dygraph=(self.use_mkldnn == False))
-        ground_truth = {
-            "input": self.input,
-            "filter": self.filter,
-            "filter2": self.filter2,
-            "conv_output": self.conv_output,
-            "elt_mul_y": self.elt_mul_y,
-            "elt_mul_output": self.elt_mul_output,
-            "conv_output2": self.conv_output2,
-        }
program = fluid.Program()
with fluid.program_guard(program):
block = program.global_block()
for name in ground_truth:
block.create_var(
name=name, dtype="float32", shape=ground_truth[name].shape)
conv2d_op = block.append_op(
type="conv2d",
inputs={
"Input": block.var('input'),
'Filter': block.var('filter')
},
outputs={"Output": block.var('conv_output')},
attrs={
'strides': self.stride,
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'data_format': self.data_format
})
elementwise_mul_op = block.append_op(
type="elementwise_mul",
inputs={
'X': block.var('conv_output'),
'Y': block.var('elt_mul_y'),
},
outputs={"Out": block.var('elt_mul_output')},
attrs={
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'axis': self.axis
})
conv2d_op2 = block.append_op(
type="conv2d",
inputs={
"Input": block.var('elt_mul_output'),
'Filter': block.var('filter2')
},
outputs={"Output": block.var('conv_output2')},
attrs={
'strides': self.stride,
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'data_format': self.data_format
})
place = core.CPUPlace()
exe = fluid.Executor(place)
out = exe.run(
program,
feed={
name: ground_truth[name]
for name in ["input", "filter", "filter2", "elt_mul_y"]
},
fetch_list=self.fetch_list)
for id, name in enumerate(self.fetch_list):
self.assertTrue(
np.allclose(
ground_truth[name], out[id], atol=1e-4), name)
    def test_check_grad_normal(self):
        pass

@@ -157,5 +96,48 @@ class TestElementwiseMulMKLDNNOp_Integrated_With_Convs(ElementwiseMulOp):
        pass
class TestInt8Scales(TestInt8):
def quantize(self, tensor, dt="int8"):
max_int = 127.0 if dt == "int8" else 255.0
scale = max_int / np.abs(np.amax(tensor))
quantized = np.round(scale * tensor).astype(dt)
return scale, quantized
def init_input_output(self):
self.x_f = np.random.random((100, )).astype("float")
self.y_f = np.random.random((100, )).astype("float")
self.out_f = np.multiply(self.x_f, self.y_f)
self.scale_x, self.x = self.quantize(self.x_f)
self.scale_y, self.y = self.quantize(self.y_f)
self.scale_o, self.out = self.quantize(self.out_f)
def init_scales(self):
self.attrs['Scale_x'] = self.scale_x
self.attrs['Scale_y'] = self.scale_y
self.attrs['Scale_out'] = self.scale_o
def test_check_output(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
self.init_scales()
int_atol = 1 # different quantization techniques
self.check_output(
check_dygraph=(self.use_mkldnn == False), atol=int_atol)
class TestUint8Scales(TestInt8Scales):
def init_input_output(self):
self.x_f = np.random.random((100, )).astype("float")
self.y_f = np.random.random((100, )).astype("float")
self.out_f = np.multiply(self.x_f, self.y_f)
self.scale_x, self.x = self.quantize(self.x_f, "uint8")
self.scale_y, self.y = self.quantize(self.y_f, "uint8")
self.scale_o, self.out = self.quantize(self.out_f, "uint8")
def init_dtype(self):
self.dtype = np.uint8
if __name__ == '__main__':
    unittest.main()
@@ -703,10 +703,10 @@ for func in [
    func.__doc__ = _generate_doc_string_(
        op_proto,
        additional_args_lines=additional_args_lines,
-        skip_attrs_set={"x_data_format", "y_data_format", "axis"
+        skip_attrs_set={"x_data_format", "y_data_format", "axis",
+            "use_quantizer", "Scale_x", "Scale_y", "Scale_out"
        }) + """\n""" + str(func.__doc__)


def sum(input, dim=None, dtype=None, keep_dim=False, name=None):
    """
    :alias_main: paddle.sum
......