Commit bbc93368 authored by xiaolil1, committed by Tao Luo

Enable basic MKL-DNN INT8 Conv OP (#15124)

* Enable basic MKL-DNN INT8 Conv OP
test=develop

* Modify test case
test=develop

* Clean unittest code
test=develop

* Fix test
test=develop

* Modify test
test=develop

* Modify basic INT8 Conv
test=develop
Parent a1e60ab1
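The heart of the change is the requantization arithmetic: an INT8 convolution accumulates quantized inputs and weights into an s32 accumulator, which is then rescaled into the output's quantized domain by output_shift_scale = Scale_out / (Scale_in * Scale_weights). A minimal numpy sketch of that scheme, using hypothetical scale values (not taken from the commit):

import numpy as np

# Hypothetical scales, playing the role of the op's Scale_in/Scale_weights/Scale_out attributes.
scale_in, scale_weight, scale_out = 1.0, 10.0, 0.5

x_f32 = np.random.uniform(0.0, 25.0, 8).astype(np.float32)   # fp32 activations
w_f32 = np.random.uniform(-1.0, 1.0, 8).astype(np.float32)   # fp32 weights

x_q = np.round(x_f32 * scale_in).astype(np.int32)            # quantized (u8-range) input
w_q = np.round(w_f32 * scale_weight).astype(np.int32)        # quantized (s8-range) weights

acc_s32 = np.dot(x_q, w_q)  # s32 accumulator, as inside the conv primitive

# Requantize into the output domain; this is output_shift_scale in the kernel.
output_shift_scale = scale_out / (scale_in * scale_weight)
y_q = np.round(acc_s32 * output_shift_scale)

# Dequantizing recovers an approximation of the fp32 result.
print(y_q / scale_out, np.dot(x_f32, w_f32))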
@@ -12,6 +12,7 @@
See the License for the specific language governing permissions and
limitations under the License. */
#include <unordered_map>
#include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/framework/data_layout_transform.h"
#include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/memory/malloc.h"
#include "paddle/fluid/operators/conv_op.h" #include "paddle/fluid/operators/conv_op.h"
@@ -68,13 +69,22 @@ inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format,
}
}
template <typename T, typename K>
class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
public:
void Compute(const paddle::framework::ExecutionContext& ctx) const override {
PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()),
"It must use CPUPlace.");
bool is_INT8 =
std::is_same<T, int8_t>::value || std::is_same<T, uint8_t>::value;
if (!is_INT8) {
ComputeFP32(ctx);
} else {
ComputeINT8(ctx);
}
}
void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const {
const bool is_test = ctx.Attr<bool>("is_test"); const bool is_test = ctx.Attr<bool>("is_test");
auto& dev_ctx = auto& dev_ctx =
@@ -274,6 +284,257 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p));
}
void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const {
const bool is_test = ctx.Attr<bool>("is_test");
auto& dev_ctx =
ctx.template device_context<paddle::platform::MKLDNNDeviceContext>();
const auto& mkldnn_engine = dev_ctx.GetEngine();
auto* input = ctx.Input<Tensor>("Input");
auto* filter = ctx.Input<Tensor>("Filter");
auto* bias = ctx.HasInput("Bias") ? ctx.Input<Tensor>("Bias") : nullptr;
auto* output = ctx.Output<Tensor>("Output");
PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN &&
input->format() != memory::format::format_undef,
"Wrong layout/format set for Input tensor");
PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN &&
filter->format() != memory::format::format_undef,
"Wrong layout/format set for Filter tensor");
PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5,
"Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW");
PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5,
"Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW");
if (bias) {
PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN &&
bias->format() != memory::format::format_undef,
"Wrong layout/format set for Bias tensor");
PADDLE_ENFORCE(bias->dims().size() == 1,
"Bias must only have 1 dimension, i.e. X");
}
std::vector<int> strides = ctx.Attr<std::vector<int>>("strides");
std::vector<int> paddings = ctx.Attr<std::vector<int>>("paddings");
std::vector<int> dilations = ctx.Attr<std::vector<int>>("dilations");
int groups = ctx.Attr<int>("groups");
bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
bool is_conv3d = strides.size() == 3U;
// TODO(tpatejko): add support for dilation
PADDLE_ENFORCE(
is_conv3d
? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 &&
dilations[2] == 1
: dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1,
"dilation in convolution is not implemented yet");
PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently");
const T* input_data = input->data<T>();
std::vector<int> src_tz = paddle::framework::vectorize2int(input->dims());
std::vector<int> weights_tz =
paddle::framework::vectorize2int(filter->dims());
int g = std::max(groups, 1);
GetWeightsTz(weights_tz, g, is_conv3d);
std::vector<int> dst_tz = paddle::framework::vectorize2int(output->dims());
// Get unique name for storing MKLDNN primitives
std::string key;
key.reserve(MaxKeyLength);
mkldnn::memory::data_type src_dt =
paddle::framework::ToMKLDNNDataType(input->type());
platform::ConvMKLDNNHandler::AppendKey(
&key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt,
input->format(), ctx.op().Output("Output"));
const std::string key_conv_pd = key + "@conv_pd";
std::shared_ptr<mkldnn::convolution_forward> conv_p = nullptr;
std::shared_ptr<mkldnn::memory> src_memory_p = nullptr;
std::shared_ptr<mkldnn::memory> user_src_memory_p = nullptr;
std::shared_ptr<mkldnn::memory> dst_memory_p = nullptr;
std::vector<primitive> pipeline;
std::shared_ptr<mkldnn::convolution_forward::primitive_desc> conv_pd =
nullptr;
std::shared_ptr<platform::ConvMKLDNNHandler> handler = nullptr;
auto prim_key = key + "@conv_p";
auto dst_key = key + "@dst_mem_p";
auto src_key = key + "@src_mem_p";
auto user_src_key = key + "@user_src_mem_p";
auto src_reorder_key = key + "@src_mem_preorder_p";
conv_p = std::static_pointer_cast<mkldnn::convolution_forward>(
dev_ctx.GetBlob(prim_key));
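// Build (and cache) the primitives when none are cached yet or when
// training; later inference runs reuse the cached objects and only
// swap the data handles.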
if (conv_p == nullptr || !is_test) {
const K* filter_data = filter->data<K>();
auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
auto scale_out_data =
force_fp32_output ? 1.0f : ctx.Attr<float>("Scale_out");
bool is_multi_channel = scale_weights_data.size() > 1;
int count = is_multi_channel ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0]
: (weights_tz)[0])
: 1;
std::vector<float> output_shift_scale(count);
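// Per-channel requantization factor: Scale_out / (Scale_in * Scale_weights[i])
// maps the s32 accumulator back into the output's quantized domain.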
#pragma omp parallel for if (count > 1)
for (int i = 0; i < count; i++) {
if (scale_weights_data[i] == 0.0)
output_shift_scale[i] =
scale_out_data; // weights data will contain 0
// in some models, then weights
// scale couldn't be calculated
else
output_shift_scale[i] =
scale_out_data / (scale_in_data * scale_weights_data[i]);
}
auto user_src_md =
platform::MKLDNNMemDesc({src_tz}, src_dt, input->format());
auto user_weights_md = platform::MKLDNNMemDesc(
{weights_tz}, platform::MKLDNNGetDataType<K>(),
((g) == 1) ? mkldnn::memory::format::oihw
: mkldnn::memory::format::goihw);
/* create memory descriptor for convolution without specified format
* ('any') which lets a primitive (convolution in this case) choose
* the memory format preferred for best performance
*/
std::string data_format = ctx.Attr<std::string>("data_format");
auto chosen_memory_format =
platform::data_format_to_memory_format(data_format);
std::vector<int> bias_tz;
auto src_md =
platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format);
auto weights_md = platform::MKLDNNMemDesc(
weights_tz, memory::data_type::s8, chosen_memory_format);
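// With force_fp32_output the primitive dequantizes on write, emitting
// f32 destination data instead of s8.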
auto dst_dt = force_fp32_output
? paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<float>::DataType)
: paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<int8_t>::DataType);
auto dst_md =
platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format);
// create a conv primitive descriptor and save it for usage in backward
if (bias) {
bias_tz = paddle::framework::vectorize2int(bias->dims());
auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32,
memory::format::x);
conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md,
strides, paddings, mkldnn_engine,
output_shift_scale, is_test);
} else {
conv_pd =
ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings,
mkldnn_engine, output_shift_scale, is_test);
}
// Save conv_pd/src_memory/weights_memory for backward pass
dev_ctx.SetBlob(key_conv_pd, conv_pd);
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx,
mkldnn_engine, key));
// create mkldnn memory from input tensors (data/weights)
user_src_memory_p =
handler->AcquireSrcMemory(user_src_md, to_void_cast<T>(input_data));
auto user_weights_memory_p = handler->AcquireWeightsMemory(
user_weights_md, to_void_cast<K>(filter_data));
// create reorder primitive if the input format is not the preferred one
src_memory_p =
handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline);
std::shared_ptr<mkldnn::memory> weights_memory_p;
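// Reorder scale mask: with one scale per output channel, bit 0 (oihw
// weights) or bits 0 and 1 (goihw weights, group + output channel) mark
// the dimensions that carry independent scales.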
int mask_reorder =
is_multi_channel ? ((g != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0;
weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive(
user_weights_memory_p, pipeline, is_test, true, scale_weights_data,
mask_reorder);
if (!force_fp32_output) {
dst_memory_p = platform::SetDstMemory<int8_t>(ctx, output, handler);
} else {
dst_memory_p = platform::SetDstMemory<float>(ctx, output, handler);
}
// create convolution op primitive
auto scale_bias_key = key + "@scale_bias";
if (bias) {
const float* bias_data = bias->data<float>();
auto user_bias_md = platform::MKLDNNMemDesc(
{bias_tz}, platform::MKLDNNGetDataType<float>(), memory::format::x);
auto user_bias_memory_p = handler->AcquireBiasMemory(
user_bias_md, to_void_cast<float>(bias_data));
std::shared_ptr<mkldnn::memory> bias_memory_p;
int mask_reorder = is_multi_channel ? 1 << 0 : 1;
int count =
is_multi_channel
? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0])
: 1;
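// Bias is quantized with scale_in * scale_weights so it can be added
// directly to the s32 accumulator before the output scale is applied.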
std::vector<float> scale_bias_data(count);
#pragma omp parallel for if (count > 1)
for (int i = 0; i < count; i++) {
scale_bias_data[i] = scale_in_data * scale_weights_data[i];
}
bias_memory_p = handler->AcquireBiasMemoryFromPrimitive(
user_bias_memory_p, pipeline, is_test, true, scale_bias_data,
mask_reorder);
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
bias_memory_p, dst_memory_p);
} else {
conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p,
dst_memory_p);
}
// push primitive to stream and wait until it's executed
pipeline.push_back(*conv_p);
} else {
auto src_memory_reorder_p = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(src_reorder_key));
src_memory_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(src_key));
if (src_memory_reorder_p) {
user_src_memory_p = std::static_pointer_cast<mkldnn::memory>(
dev_ctx.GetBlob(user_src_key));
user_src_memory_p->set_data_handle(to_void_cast<T>(input_data));
} else if (src_memory_p) {
src_memory_p->set_data_handle(to_void_cast<T>(input_data));
}
dst_memory_p =
std::static_pointer_cast<mkldnn::memory>(dev_ctx.GetBlob(dst_key));
conv_pd =
std::static_pointer_cast<mkldnn::convolution_forward::primitive_desc>(
dev_ctx.GetBlob(key_conv_pd));
if (conv_pd) {
handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx,
mkldnn_engine, key));
}
if (!force_fp32_output) {
dst_memory_p =
platform::SetDstMemoryHandler<int8_t>(ctx, output, handler);
} else {
dst_memory_p =
platform::SetDstMemoryHandler<float>(ctx, output, handler);
}
if (src_memory_reorder_p) {
pipeline.push_back(*src_memory_reorder_p);
}
pipeline.push_back(*conv_p);
}
// push primitive to stream and wait until it's executed
stream(stream::kind::eager).submit(pipeline).wait();
output->set_layout(DataLayout::kMKLDNN);
output->set_format(GetMKLDNNFormat(*dst_memory_p));
}
private:
mkldnn::primitive_attr CreatePostOps(bool fuse_relu,
@@ -301,6 +562,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
return conv_attr;
}
mkldnn::primitive_attr CreatePostOps(
const std::vector<float> output_shift_scale) const {
mkldnn::primitive_attr conv_attr;
mkldnn::post_ops post_operations;
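// Mask 1 << 1 requests one output scale per channel (dim 1 of the nchw
// dst); 0 means a single common scale for the whole tensor.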
int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
conv_attr.set_output_scales(mask, output_shift_scale);
conv_attr.set_post_ops(post_operations);
return conv_attr;
}
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
@@ -325,6 +596,32 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
p_conv_pd);
}
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& dst, const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine,
const std::vector<float> output_shift_scale,
bool is_test) const {
memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]};
auto propagation = is_test ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto conv_desc = mkldnn::convolution_forward::desc(
propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims,
padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& bias, const memory::desc& dst,
@@ -349,6 +646,33 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
std::unique_ptr<mkldnn::convolution_forward::primitive_desc>
ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights,
const memory::desc& bias, const memory::desc& dst,
const std::vector<int>& strides,
const std::vector<int>& paddings,
const mkldnn::engine& engine,
const std::vector<float> output_shift_scale,
bool is_test) const {
memory::dims stride_dims = {strides[0], strides[1]};
memory::dims padding_dims = {paddings[0], paddings[1]};
auto propagation = is_test ? mkldnn::prop_kind::forward_scoring
: mkldnn::prop_kind::forward_training;
auto conv_desc = mkldnn::convolution_forward::desc(
propagation, mkldnn::convolution_direct, src, weights, bias, dst,
stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero);
mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale);
auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc(
conv_desc, conv_attr, engine);
return std::unique_ptr<mkldnn::convolution_forward::primitive_desc>(
p_conv_pd);
}
};
template <typename T>
@@ -555,7 +879,17 @@ namespace ops = paddle::operators;
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<float, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
::paddle::platform::CPUPlace, U8,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<uint8_t, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN,
::paddle::platform::CPUPlace, S8,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<int8_t, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
::paddle::platform::CPUPlace, FP32,
@@ -565,7 +899,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN,
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN,
::paddle::platform::CPUPlace, FP32,
ops::kConvMKLDNNFP32,
ops::ConvMKLDNNOpKernel<float, float>);
REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN,
::paddle::platform::CPUPlace, FP32,
...
@@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType(
#endif
auto input_data_type = ctx.Input<Tensor>("Input")->type();
if (input_data_type != framework::proto::VarType::INT8 &&
input_data_type != framework::proto::VarType::UINT8) {
auto filter_data_type = ctx.Input<Tensor>("Filter")->type();
PADDLE_ENFORCE_EQ(input_data_type, filter_data_type,
"input and filter data type should be consistent");
}
if (input_data_type == framework::proto::VarType::FP16) {
PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN,
"float16 can only be used when CUDNN is used");
@@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() {
"whenever convolution output is as an input to residual " "whenever convolution output is as an input to residual "
"connection.") "connection.")
.SetDefault(false); .SetDefault(false);
AddAttr<float>("Scale_in",
"Scale_in to be used for int8 input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_out",
"Scale_out to be used for int8 output data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<float>("Scale_in_eltwise",
"Scale_in_eltwise to be used for int8 eltwise input data."
"Only used with MKL-DNN INT8.")
.SetDefault(1.0f);
AddAttr<std::vector<float>>("Scale_weights",
"Scale_weights to be used for int8 weights data."
"Only used with MKL-DNN INT8.")
.SetDefault({1.0f});
AddAttr<bool>("force_fp32_output",
"(bool, default false) Force INT8 kernel output FP32, only "
"used in MKL-DNN INT8")
.SetDefault(false);
AddAttr<std::string>(
"data_format",
"(string, default NCHW) Only used in "
@@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() {
"Defaults to \"NHWC\". Specify the data format of the output data, " "Defaults to \"NHWC\". Specify the data format of the output data, "
"the input will be transformed automatically. ") "the input will be transformed automatically. ")
.SetDefault("AnyLayout"); .SetDefault("AnyLayout");
AddAttr<bool>("force_fp32_output",
"(bool, default false) Only used in mkldnn INT8 kernel")
.SetDefault(false);
// TODO(dzhwinter): need to register layout transform function
AddAttr<int>("workspace_size_MB",
"Only used in cudnn kernel. workspace size for cudnn, in MB, "
...
@@ -29,6 +29,7 @@ namespace operators {
using Tensor = framework::Tensor;
constexpr int kConvMKLDNNFP32 = 1;
constexpr int kConvMKLDNNINT8 = 2;
constexpr int MaxKeyLength = 256;
// Base convolution operator definitions for other conv
// like operators to reuse the implementation.
...
@@ -145,7 +145,8 @@ class MKLDNNHandler {
const std::shared_ptr<mkldnn::memory> user_memory_p,
const std::string& suffix,
std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false, bool is_INT8 = false,
std::vector<float> scale_data = {1.0f}, int mask = 0) {
// create reorder primitive if the input format is not the preferred one
auto local_key = key_ + suffix;
auto key_reorder_p = key_ + suffix + "reorder_p";
@@ -159,8 +160,20 @@ class MKLDNNHandler {
std::shared_ptr<mkldnn::primitive> reorder_p;
if (mpd != user_mpd) {
target_memory_p = std::make_shared<mkldnn::memory>(mpd);
std::shared_ptr<mkldnn::reorder> reorder_p;
if (is_INT8) {
mkldnn::primitive_attr
attri; // attribute for int8 weights and bias data reorder.
attri.set_output_scales(mask, scale_data);
auto reorder_pd = std::shared_ptr<mkldnn::reorder::primitive_desc>(
new mkldnn::reorder::primitive_desc(user_mpd, mpd, attri));
reorder_p = std::shared_ptr<mkldnn::reorder>(new mkldnn::reorder(
*reorder_pd, *user_memory_p, *target_memory_p));
} else {
reorder_p = std::make_shared<mkldnn::reorder>(*user_memory_p,
*target_memory_p);
}
dev_ctx_.SetBlob(key_reorder_p, reorder_p);
pipeline.push_back(*reorder_p);
}
@@ -182,22 +195,56 @@ class MKLDNNHandler {
return dims2str(operand_dims) + suffix;
}
template <typename T>
static void SetDstMemory(
const framework::ExecutionContext& ctx, framework::Tensor* output,
std::vector<int> dst_tz, const mkldnn::engine& engine,
std::shared_ptr<mkldnn::memory::primitive_desc>& dst_pd, // NOLINT
std::shared_ptr<mkldnn::memory>& dst_memory) { // NOLINT
T* output_data = output->mutable_data<T>(ctx.GetPlace());
auto dst_md = platform::MKLDNNMemDesc(
{dst_tz}, paddle::framework::ToMKLDNNDataType(
framework::DataTypeTrait<T>::DataType),
mkldnn::memory::format::nhwc);
dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine));
dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast<T>(output_data)));
}
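// AppendKey concatenates shapes, strides, paddings, and attributes into a
// cache key so primitives built for one input configuration can be looked
// up and reused on later runs.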
static void AppendKey(
std::string* key, const mkldnn::memory::dims& input_dims,
const mkldnn::memory::dims& weights_dims, const std::vector<int>& strides,
const std::vector<int>& paddings, const std::vector<int>& dilations,
const int& groups, const mkldnn::memory::data_type& type,
const mkldnn::memory::format& format, const std::string& suffix) {
AppendKeyDims(key, input_dims);
AppendKeyDims(key, weights_dims);
AppendKeyVec(key, strides);
AppendKeyVec(key, paddings);
AppendKeyVec(key, dilations);
AppendKey(key, std::to_string(groups));
AppendKey(key, std::to_string(type));
AppendKey(key, std::to_string(format));
AppendKey(key, suffix);
} }
protected:
static void AppendKeyDims(std::string* key,
const mkldnn::memory::dims& dims) {
for (unsigned int i = 0; i < dims.size(); i++) {
AppendKey(key, std::to_string(dims[i]));
}
}
static void AppendKeyVec(std::string* key, const std::vector<int>& dims) {
for (unsigned int i = 0; i < dims.size(); i++) {
AppendKey(key, std::to_string(dims[i]));
}
}
static void AppendKey(std::string* key, const std::string& s) {
key->append(s);
}
static std::string dims2str(const mkldnn::memory::dims& operand_dims) {
std::string dstr = "";
for (size_t i = 0; i < operand_dims.size(); ++i) {
@@ -215,7 +262,8 @@ class MKLDNNHandler {
class TransposeMKLDNNHandler : public MKLDNNHandler {
public:
TransposeMKLDNNHandler(std::vector<int>& dims, // NOLINT
std::vector<int>& axis, // NOLINT
const platform::MKLDNNDeviceContext& dev_ctx,
mkldnn::engine engine, const std::string& base_key)
: platform::MKLDNNHandler(dev_ctx, engine, base_key),
@@ -303,8 +351,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler {
}
protected:
mkldnn_memory_desc_t Axis2MemoryDesc(std::vector<int>& nchw_tz, // NOLINT
std::vector<int>& axis // NOLINT
) {
mkldnn_memory_desc_t mem_fmt;
mem_fmt.primitive_kind = mkldnn_memory;
@@ -462,21 +511,26 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler {
std::shared_ptr<mkldnn::memory> AcquireWeightsMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_weights_memory_p,
std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false, bool is_INT8 = false,
std::vector<float> scale_data = {1.0f}, int mask = 0) {
auto user_weights_pd = user_weights_memory_p->get_primitive_desc();
auto weights_pd = conv_pd_->weights_primitive_desc();
return this->AcquireMemory(
weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p",
pipeline, is_persistent, is_INT8, scale_data, mask);
}
std::shared_ptr<mkldnn::memory> AcquireBiasMemoryFromPrimitive(
const std::shared_ptr<mkldnn::memory> user_bias_memory_p,
std::vector<mkldnn::primitive>& pipeline, // NOLINT
bool is_persistent = false, bool is_INT8 = false,
std::vector<float> scale_data = {1.0f},
int mask = 0) { // NOLINT
auto user_bias_pd = user_bias_memory_p->get_primitive_desc();
auto bias_pd = conv_pd_->bias_primitive_desc();
return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p,
"@bias_mem_p", pipeline, is_persistent, is_INT8,
scale_data, mask);
}
std::shared_ptr<forward_t> AcquireConvolution(
@@ -594,5 +648,29 @@ using ConvTransposeMKLDNNHandler =
ConvMKLDNNTemplateHandler<mkldnn::deconvolution_forward,
mkldnn::deconvolution_backward_data,
mkldnn::deconvolution_backward_weights>;
template <typename T>
static std::shared_ptr<mkldnn::memory> SetDstMemory(
const framework::ExecutionContext& ctx, framework::Tensor* output,
const std::shared_ptr<ConvMKLDNNHandler>& handler) {
T* output_data = output->mutable_data<T>(
ctx.GetPlace(), ::paddle::memory::Allocator::kDefault,
handler->GetDstMemorySize());
std::shared_ptr<mkldnn::memory> dst_memory_p =
handler->AcquireDstMemoryFromPrimitive(to_void_cast<T>(output_data));
return dst_memory_p;
}
template <typename T>
static std::shared_ptr<mkldnn::memory> SetDstMemoryHandler(
const framework::ExecutionContext& ctx, framework::Tensor* output,
const std::shared_ptr<ConvMKLDNNHandler>& handler) {
T* output_data = output->mutable_data<T>(
ctx.GetPlace(), ::paddle::memory::Allocator::kDefault,
handler->GetDstMemorySize());
std::shared_ptr<mkldnn::memory> dst_memory_p;
dst_memory_p->set_data_handle(to_void_cast<T>(output_data));
return dst_memory_p;
}
} // namespace platform
} // namespace paddle
@@ -51,8 +51,9 @@ class TestConv2dFusionOp(OpTest):
input = np.random.random(self.input_size).astype(self.dtype)
filter = np.random.random(self.filter_size).astype(self.dtype)
self.output, _, _, _, _ = conv2d_forward_naive(
input, filter, self.groups, conv2d_param)
self.output = self.output.astype(self.dtype)
self.inputs = {
'Input': OpTest.np_dtype_to_fluid_dtype(input),
...
# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import print_function
import unittest
import numpy as np
import paddle.fluid.core as core
from op_test import OpTest
from test_conv2d_op import conv2d_forward_naive, TestConv2dOp
def conv2d_forward_refer(input, filter, group, conv_param):
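# The MKL-DNN INT8 kernel emits its output in nhwc physical layout, so the
# naive nchw result is repacked to nhwc and then relabeled with nchw dims
# via reshape (a relabel of the buffer, not a transpose back).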
out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group,
conv_param)
out_tmp = np.zeros((in_n, out_h, out_w, out_c))
for n in range(in_n):
for i in range(out_h):
for j in range(out_w):
for m in range(out_c):
out_tmp[n, i, j, m] = out[n, m, i, j]
return out_tmp.reshape(in_n, out_c, out_h, out_w)
class TestConv2dInt8Op(TestConv2dOp):
def setUp(self):
self.op_type = "conv2d"
self.use_cudnn = False
self.exhaustive_search = False
self.use_cuda = False
self.use_mkldnn = False
self.data_format = "AnyLayout"
self.weighttype = np.float32
self.use_mkldnn = True
self.init_group()
self.init_dilation()
self.init_test_case()
self.init_dtype()
conv2d_param = {
'stride': self.stride,
'pad': self.pad,
'dilation': self.dilations
}
filter = np.random.random(self.filter_size).astype(self.weighttype)
if self.srctype == np.uint8:
input = np.random.randint(0, 10,
self.input_size).astype(self.srctype)
else:
input = np.random.randint(-5, 5,
self.input_size).astype(self.srctype)
input_shift = (np.ones(self.input_size) * 128).astype(np.uint8)
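# For s8 input the kernel first shifts activations into u8 range, so the
# reference exploits conv linearity: conv(x + 128) - conv(128) == conv(x).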
if self.srctype == np.int8:
filter_int = np.round(filter * self.scale_weights[0] *
0.5).astype(np.int32)
scale_output_shift = self.scale_out / (self.scale_in *
self.scale_weights[0] * 0.5)
output1 = conv2d_forward_refer(
np.round((input.astype(np.int32) + input_shift) *
self.scale_in).astype(np.int32), filter_int,
self.groups,
conv2d_param).astype(np.float32) * scale_output_shift
output2 = conv2d_forward_refer(
np.round((input_shift) * self.scale_in).astype(np.int32),
filter_int, self.groups,
conv2d_param).astype(np.float32) * scale_output_shift
output = np.round(output1 - output2).astype(self.dsttype)
else:
filter_int = np.round(filter *
self.scale_weights[0]).astype(np.int32)
scale_output_shift = self.scale_out / (self.scale_in *
self.scale_weights[0])
output1 = conv2d_forward_refer(
input.astype(np.int32), filter_int, self.groups,
conv2d_param).astype(np.float32)
output = np.round(output1 * scale_output_shift).astype(self.dsttype)
self.inputs = {
'Input':
OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)),
'Filter': OpTest.np_dtype_to_fluid_dtype(filter)
}
self.attrs = {
'strides': self.stride,
'paddings': self.pad,
'groups': self.groups,
'dilations': self.dilations,
'use_cudnn': self.use_cudnn,
'use_mkldnn': self.use_mkldnn,
'data_format': self.data_format,
'exhaustive_search': self.exhaustive_search,
'Scale_in': self.scale_in,
'Scale_out': self.scale_out,
'Scale_weights': self.scale_weights,
}
self.outputs = {'Output': output}
def test_check_output(self):
self.check_output_with_place(core.CPUPlace(), atol=0)
def test_check_grad(self):
pass
def test_check_grad_no_filter(self):
pass
def test_check_grad_no_input(self):
pass
def init_test_case(self):
TestConv2dOp.init_test_case(self)
f_c = self.input_size[1] // self.groups
self.filter_size = [1, f_c, 3, 3]
self.scale_in = 1.0
self.scale_out = 0.5
self.scale_weights = [10.0]
def init_dtype(self):
self.srctype = np.uint8
self.dsttype = np.int8
#--------------------test conv2d u8 in and s8 out--------------------
class TestConv2d(TestConv2dInt8Op):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 5, 5] # NCHW
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.scale_in = 1.0
self.scale_out = 0.5
self.scale_weights = [10.0]
class TestWithPad(TestConv2d):
def init_test_case(self):
TestConv2d.init_test_case(self)
self.pad = [1, 1]
class TestWithGroup(TestConv2d):
def init_group(self):
self.groups = 3
class TestWithStride(TestConv2dInt8Op):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [2, 2]
self.input_size = [2, 3, 6, 6]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 3, 3]
self.scale_in = 1.0
self.scale_out = 0.8
self.scale_weights = [10.0]
class TestWith1x1(TestConv2dInt8Op):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [1, 3, 5, 5]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 1, 1]
self.scale_in = 1.0
self.scale_out = 0.5
self.scale_weights = [12.0]
class TestWithInput1x1Filter1x1(TestConv2dInt8Op):
def init_test_case(self):
self.pad = [0, 0]
self.stride = [1, 1]
self.input_size = [2, 3, 1, 1]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [6, f_c, 1, 1]
self.scale_in = 1.0
self.scale_out = 0.5
self.scale_weights = [10.0]
def init_group(self):
self.groups = 3
#--------------------test conv2d s8 in and s8 out--------------------
def create_test_int8_class(parent):
class TestInt8Case(parent):
def init_dtype(self):
self.srctype = np.int8
self.dsttype = np.int8
cls_name = "{0}_{1}".format(parent.__name__, "s8s8")
TestInt8Case.__name__ = cls_name
globals()[cls_name] = TestInt8Case
create_test_int8_class(TestConv2dInt8Op)
create_test_int8_class(TestWithPad)
create_test_int8_class(TestWithStride)
create_test_int8_class(TestWithGroup)
create_test_int8_class(TestWith1x1)
create_test_int8_class(TestWithInput1x1Filter1x1)
if __name__ == '__main__':
unittest.main()
@@ -60,7 +60,7 @@ def conv2d_forward_naive(input, filter, group, conv_param):
np.sum(input_pad_masked * f_sub[k, :, :, :],
axis=(1, 2, 3))
return out, in_n, out_h, out_w, out_c
class TestConv2dOp(OpTest):
@@ -85,8 +85,9 @@ class TestConv2dOp(OpTest):
input = np.random.random(self.input_size).astype(self.dtype)
filter = np.random.random(self.filter_size).astype(self.dtype)
output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups,
conv2d_param)
output = output.astype(self.dtype)
self.inputs = {
'Input': OpTest.np_dtype_to_fluid_dtype(input),
...