From bbc9336878f73026ece222f2b9d85740408852f1 Mon Sep 17 00:00:00 2001 From: xiaolil1 <39753926+xiaolil1@users.noreply.github.com> Date: Fri, 4 Jan 2019 11:34:57 +0800 Subject: [PATCH] Enable basic MKL-DNN INT8 Conv OP (#15124) * Enable basic MKL-DNN INT8 Conv OP test=develop * Modify test case test=develop * Clean unittest code test=develop * Fix test test=develop * Modify test test=develop * Modify basic INT8 Conv test=develop --- paddle/fluid/operators/conv_mkldnn_op.cc | 340 +++++++++++++++++- paddle/fluid/operators/conv_op.cc | 33 +- paddle/fluid/operators/conv_op.h | 1 + paddle/fluid/platform/mkldnn_reuse.h | 110 +++++- .../tests/unittests/test_conv2d_fusion_op.py | 5 +- .../unittests/test_conv2d_int8_mkldnn_op.py | 228 ++++++++++++ .../fluid/tests/unittests/test_conv2d_op.py | 7 +- 7 files changed, 696 insertions(+), 28 deletions(-) create mode 100644 python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py diff --git a/paddle/fluid/operators/conv_mkldnn_op.cc b/paddle/fluid/operators/conv_mkldnn_op.cc index 8c116c4ab..0f2bb8c65 100644 --- a/paddle/fluid/operators/conv_mkldnn_op.cc +++ b/paddle/fluid/operators/conv_mkldnn_op.cc @@ -12,6 +12,7 @@ See the License for the specific language governing permissions and limitations under the License. */ +#include #include "paddle/fluid/framework/data_layout_transform.h" #include "paddle/fluid/memory/malloc.h" #include "paddle/fluid/operators/conv_op.h" @@ -68,13 +69,22 @@ inline mkldnn::memory::format GetWeightsFormat(mkldnn::memory::format format, } } -template +template class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { PADDLE_ENFORCE(paddle::platform::is_cpu_place(ctx.GetPlace()), "It must use CPUPlace."); + bool is_INT8 = + std::is_same::value || std::is_same::value; + if (!is_INT8) { + ComputeFP32(ctx); + } else { + ComputeINT8(ctx); + } + } + void ComputeFP32(const paddle::framework::ExecutionContext& ctx) const { const bool is_test = ctx.Attr("is_test"); auto& dev_ctx = @@ -274,6 +284,257 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { output->set_layout(DataLayout::kMKLDNN); output->set_format(GetMKLDNNFormat(*dst_memory_p)); } + void ComputeINT8(const paddle::framework::ExecutionContext& ctx) const { + const bool is_test = ctx.Attr("is_test"); + + auto& dev_ctx = + ctx.template device_context(); + const auto& mkldnn_engine = dev_ctx.GetEngine(); + + auto* input = ctx.Input("Input"); + auto* filter = ctx.Input("Filter"); + auto* bias = ctx.HasInput("Bias") ? ctx.Input("Bias") : nullptr; + auto* output = ctx.Output("Output"); + + PADDLE_ENFORCE(input->layout() == DataLayout::kMKLDNN && + input->format() != memory::format::format_undef, + "Wrong layout/format set for Input tensor"); + PADDLE_ENFORCE(filter->layout() == DataLayout::kMKLDNN && + filter->format() != memory::format::format_undef, + "Wrong layout/format set for Filter tensor"); + PADDLE_ENFORCE(input->dims().size() == 4 || input->dims().size() == 5, + "Input must be with 4 or 5 dimensions, i.e. NCHW or NCDHW"); + PADDLE_ENFORCE(filter->dims().size() == 4 || filter->dims().size() == 5, + "Filter must be with 4 or 5 dimensions, i.e. OIHW or OIDHW"); + if (bias) { + PADDLE_ENFORCE(bias->layout() == DataLayout::kMKLDNN && + bias->format() != memory::format::format_undef, + "Wrong layout/format set for Bias tensor"); + PADDLE_ENFORCE(bias->dims().size() == 1, + "Bias must only have 1 dimension, i.e. X"); + } + + std::vector strides = ctx.Attr>("strides"); + std::vector paddings = ctx.Attr>("paddings"); + std::vector dilations = ctx.Attr>("dilations"); + int groups = ctx.Attr("groups"); + + bool force_fp32_output = ctx.Attr("force_fp32_output"); + + bool is_conv3d = strides.size() == 3U; + // TODO(tpatejko): add support for dilation + PADDLE_ENFORCE( + is_conv3d + ? dilations.size() == 3 && dilations[0] == 1 && dilations[1] == 1 && + dilations[2] == 1 + : dilations.size() == 2 && dilations[0] == 1 && dilations[1] == 1, + "dilation in convolution is not implemented yet"); + PADDLE_ENFORCE(is_conv3d != true, "int8 does not support conv3d currently"); + + const T* input_data = input->data(); + + std::vector src_tz = paddle::framework::vectorize2int(input->dims()); + std::vector weights_tz = + paddle::framework::vectorize2int(filter->dims()); + int g = std::max(groups, 1); + GetWeightsTz(weights_tz, g, is_conv3d); + std::vector dst_tz = paddle::framework::vectorize2int(output->dims()); + + // Get unique name for storing MKLDNN primitives + std::string key; + key.reserve(MaxKeyLength); + mkldnn::memory::data_type src_dt = + paddle::framework::ToMKLDNNDataType(input->type()); + platform::ConvMKLDNNHandler::AppendKey( + &key, src_tz, weights_tz, strides, paddings, dilations, groups, src_dt, + input->format(), ctx.op().Output("Output")); + + const std::string key_conv_pd = key + "@conv_pd"; + + std::shared_ptr conv_p = nullptr; + std::shared_ptr src_memory_p = nullptr; + std::shared_ptr user_src_memory_p = nullptr; + std::shared_ptr dst_memory_p = nullptr; + std::vector pipeline; + std::shared_ptr conv_pd = + nullptr; + std::shared_ptr handler = nullptr; + + auto prim_key = key + "@conv_p"; + auto dst_key = key + "@dst_mem_p"; + auto src_key = key + "@src_mem_p"; + auto user_src_key = key + "@user_src_mem_p"; + auto src_reorder_key = key + "@src_mem_preorder_p"; + conv_p = std::static_pointer_cast( + dev_ctx.GetBlob(prim_key)); + if (conv_p == nullptr || !is_test) { + const K* filter_data = filter->data(); + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + auto scale_out_data = + force_fp32_output ? 1.0f : ctx.Attr("Scale_out"); + + bool is_multi_channel = scale_weights_data.size() > 1; + + int count = is_multi_channel ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] + : (weights_tz)[0]) + : 1; + std::vector output_shift_scale(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + if (scale_weights_data[i] == 0.0) + output_shift_scale[i] = + scale_out_data; // weights data will contain 0 + // in some models, then weights + // scale couldn't be calculated + else + output_shift_scale[i] = + scale_out_data / (scale_in_data * scale_weights_data[i]); + } + + auto user_src_md = + platform::MKLDNNMemDesc({src_tz}, src_dt, input->format()); + auto user_weights_md = platform::MKLDNNMemDesc( + {weights_tz}, platform::MKLDNNGetDataType(), + ((g) == 1) ? mkldnn::memory::format::oihw + : mkldnn::memory::format::goihw); + + /* create memory descriptor for convolution without specified format + * ('any') which lets a primitive (convolution in this case) choose + * the memory format preferred for best performance + */ + std::string data_format = ctx.Attr("data_format"); + auto chosen_memory_format = + platform::data_format_to_memory_format(data_format); + + std::vector bias_tz; + + auto src_md = + platform::MKLDNNMemDesc(src_tz, src_dt, chosen_memory_format); + auto weights_md = platform::MKLDNNMemDesc( + weights_tz, memory::data_type::s8, chosen_memory_format); + + auto dst_dt = force_fp32_output + ? paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType) + : paddle::framework::ToMKLDNNDataType( + framework::DataTypeTrait::DataType); + + auto dst_md = + platform::MKLDNNMemDesc(dst_tz, dst_dt, chosen_memory_format); + // create a conv primitive descriptor and save it for usage in backward + if (bias) { + bias_tz = paddle::framework::vectorize2int(bias->dims()); + auto bias_md = platform::MKLDNNMemDesc(bias_tz, memory::data_type::s32, + memory::format::x); + conv_pd = ConvFwdPrimitiveDesc(src_md, weights_md, bias_md, dst_md, + strides, paddings, mkldnn_engine, + output_shift_scale, is_test); + } else { + conv_pd = + ConvFwdPrimitiveDesc(src_md, weights_md, dst_md, strides, paddings, + mkldnn_engine, output_shift_scale, is_test); + } + // Save conv_pd/src_memory/weights_memory for backward pass + dev_ctx.SetBlob(key_conv_pd, conv_pd); + + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); + + // create mkldnn memory from input tensors (data/weights) + user_src_memory_p = + handler->AcquireSrcMemory(user_src_md, to_void_cast(input_data)); + auto user_weights_memory_p = handler->AcquireWeightsMemory( + user_weights_md, to_void_cast(filter_data)); + + // create reorder primitive if the input format is not the preferred one + src_memory_p = + handler->AcquireSrcMemoryFromPrimitive(user_src_memory_p, pipeline); + + std::shared_ptr weights_memory_p; + int mask_reorder = + is_multi_channel ? ((g != 1) ? (1 << 1) + (1 << 0) : 1 << 0) : 0; + weights_memory_p = handler->AcquireWeightsMemoryFromPrimitive( + user_weights_memory_p, pipeline, is_test, true, scale_weights_data, + mask_reorder); + + if (!force_fp32_output) { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } else { + dst_memory_p = platform::SetDstMemory(ctx, output, handler); + } + + // create convolution op primitive + auto scale_bias_key = key + "@scale_bias"; + if (bias) { + const float* bias_data = bias->data(); + auto user_bias_md = platform::MKLDNNMemDesc( + {bias_tz}, platform::MKLDNNGetDataType(), memory::format::x); + auto user_bias_memory_p = handler->AcquireBiasMemory( + user_bias_md, to_void_cast(bias_data)); + std::shared_ptr bias_memory_p; + int mask_reorder = is_multi_channel ? 1 << 0 : 1; + int count = + is_multi_channel + ? (g > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0]) + : 1; + std::vector scale_bias_data(count); +#pragma omp parallel for if (count > 1) + for (int i = 0; i < count; i++) { + scale_bias_data[i] = scale_in_data * scale_weights_data[i]; + } + bias_memory_p = handler->AcquireBiasMemoryFromPrimitive( + user_bias_memory_p, pipeline, is_test, true, scale_bias_data, + mask_reorder); + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + bias_memory_p, dst_memory_p); + } else { + conv_p = handler->AcquireConvolution(src_memory_p, weights_memory_p, + dst_memory_p); + } + + // push primitive to stream and wait until it's executed + pipeline.push_back(*conv_p); + } else { + auto src_memory_reorder_p = std::static_pointer_cast( + dev_ctx.GetBlob(src_reorder_key)); + src_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(src_key)); + if (src_memory_reorder_p) { + user_src_memory_p = std::static_pointer_cast( + dev_ctx.GetBlob(user_src_key)); + user_src_memory_p->set_data_handle(to_void_cast(input_data)); + } else if (src_memory_p) { + src_memory_p->set_data_handle(to_void_cast(input_data)); + } + + dst_memory_p = + std::static_pointer_cast(dev_ctx.GetBlob(dst_key)); + conv_pd = + std::static_pointer_cast( + dev_ctx.GetBlob(key_conv_pd)); + if (conv_pd) { + handler.reset(new platform::ConvMKLDNNHandler(conv_pd, dev_ctx, + mkldnn_engine, key)); + } + if (!force_fp32_output) { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } else { + dst_memory_p = + platform::SetDstMemoryHandler(ctx, output, handler); + } + if (src_memory_reorder_p) { + pipeline.push_back(*src_memory_reorder_p); + } + pipeline.push_back(*conv_p); + } + // push primitive to stream and wait until it's executed + stream(stream::kind::eager).submit(pipeline).wait(); + + output->set_layout(DataLayout::kMKLDNN); + output->set_format(GetMKLDNNFormat(*dst_memory_p)); + } private: mkldnn::primitive_attr CreatePostOps(bool fuse_relu, @@ -301,6 +562,16 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return conv_attr; } + mkldnn::primitive_attr CreatePostOps( + const std::vector output_shift_scale) const { + mkldnn::primitive_attr conv_attr; + mkldnn::post_ops post_operations; + int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0; + conv_attr.set_output_scales(mask, output_shift_scale); + conv_attr.set_post_ops(post_operations); + return conv_attr; + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& dst, const std::vector& strides, @@ -325,6 +596,32 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { p_conv_pd); } + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& dst, const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, + const std::vector output_shift_scale, + bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, dst, stride_dims, + padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } + std::unique_ptr ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, const memory::desc& bias, const memory::desc& dst, @@ -349,6 +646,33 @@ class ConvMKLDNNOpKernel : public paddle::framework::OpKernel { return std::unique_ptr( p_conv_pd); } + + std::unique_ptr + ConvFwdPrimitiveDesc(const memory::desc& src, const memory::desc& weights, + const memory::desc& bias, const memory::desc& dst, + const std::vector& strides, + const std::vector& paddings, + const mkldnn::engine& engine, + const std::vector output_shift_scale, + bool is_test) const { + memory::dims stride_dims = {strides[0], strides[1]}; + memory::dims padding_dims = {paddings[0], paddings[1]}; + + auto propagation = is_test ? mkldnn::prop_kind::forward_scoring + : mkldnn::prop_kind::forward_training; + + auto conv_desc = mkldnn::convolution_forward::desc( + propagation, mkldnn::convolution_direct, src, weights, bias, dst, + stride_dims, padding_dims, padding_dims, mkldnn::padding_kind::zero); + + mkldnn::primitive_attr conv_attr = CreatePostOps(output_shift_scale); + + auto p_conv_pd = new mkldnn::convolution_forward::primitive_desc( + conv_desc, conv_attr, engine); + + return std::unique_ptr( + p_conv_pd); + } }; template @@ -555,7 +879,17 @@ namespace ops = paddle::operators; REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, U8, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, + ::paddle::platform::CPUPlace, S8, + ops::kConvMKLDNNFP32, + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, @@ -565,7 +899,7 @@ REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv2d_grad, MKLDNN, REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d, MKLDNN, ::paddle::platform::CPUPlace, FP32, ops::kConvMKLDNNFP32, - ops::ConvMKLDNNOpKernel); + ops::ConvMKLDNNOpKernel); REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(conv3d_grad, MKLDNN, ::paddle::platform::CPUPlace, FP32, diff --git a/paddle/fluid/operators/conv_op.cc b/paddle/fluid/operators/conv_op.cc index 8e0d28249..c8b33b893 100644 --- a/paddle/fluid/operators/conv_op.cc +++ b/paddle/fluid/operators/conv_op.cc @@ -98,10 +98,12 @@ framework::OpKernelType ConvOp::GetExpectedKernelType( #endif auto input_data_type = ctx.Input("Input")->type(); - auto filter_data_type = ctx.Input("Filter")->type(); - PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, - "input and filter data type should be consistent"); - + if (input_data_type != framework::proto::VarType::INT8 && + input_data_type != framework::proto::VarType::UINT8) { + auto filter_data_type = ctx.Input("Filter")->type(); + PADDLE_ENFORCE_EQ(input_data_type, filter_data_type, + "input and filter data type should be consistent"); + } if (input_data_type == framework::proto::VarType::FP16) { PADDLE_ENFORCE_EQ(library, framework::LibraryType::kCUDNN, "float16 can only be used when CUDNN is used"); @@ -179,6 +181,26 @@ void Conv2DOpMaker::Make() { "whenever convolution output is as an input to residual " "connection.") .SetDefault(false); + AddAttr("Scale_in", + "Scale_in to be used for int8 input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_out", + "Scale_out to be used for int8 output data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr("Scale_in_eltwise", + "Scale_in_eltwise to be used for int8 eltwise input data." + "Only used with MKL-DNN INT8.") + .SetDefault(1.0f); + AddAttr>("Scale_weights", + "Scale_weights to be used for int8 weights data." + "Only used with MKL-DNN INT8.") + .SetDefault({1.0f}); + AddAttr("force_fp32_output", + "(bool, default false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); AddAttr( "data_format", "(string, default NCHW) Only used in " @@ -303,6 +325,9 @@ void Conv3DOpMaker::Make() { "Defaults to \"NHWC\". Specify the data format of the output data, " "the input will be transformed automatically. ") .SetDefault("AnyLayout"); + AddAttr("force_fp32_output", + "(bool, default false) Only used in mkldnn INT8 kernel") + .SetDefault(false); // TODO(dzhwinter): need to registered layout transform function AddAttr("workspace_size_MB", "Only used in cudnn kernel. workspace size for cudnn, in MB, " diff --git a/paddle/fluid/operators/conv_op.h b/paddle/fluid/operators/conv_op.h index 24b8e2387..eaa288edc 100644 --- a/paddle/fluid/operators/conv_op.h +++ b/paddle/fluid/operators/conv_op.h @@ -29,6 +29,7 @@ namespace operators { using Tensor = framework::Tensor; constexpr int kConvMKLDNNFP32 = 1; constexpr int kConvMKLDNNINT8 = 2; +constexpr int MaxKeyLength = 256; // Base convolution operator definations for other conv // like operators to reuse the implementation. diff --git a/paddle/fluid/platform/mkldnn_reuse.h b/paddle/fluid/platform/mkldnn_reuse.h index 584df85e8..98d1242a1 100644 --- a/paddle/fluid/platform/mkldnn_reuse.h +++ b/paddle/fluid/platform/mkldnn_reuse.h @@ -145,7 +145,8 @@ class MKLDNNHandler { const std::shared_ptr user_memory_p, const std::string& suffix, std::vector& pipeline, // NOLINT - bool is_persistent = false) { + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, int mask = 0) { // create reorder primitive if the input format is not the preferred one auto local_key = key_ + suffix; auto key_reorder_p = key_ + suffix + "reorder_p"; @@ -159,8 +160,20 @@ class MKLDNNHandler { std::shared_ptr reorder_p; if (mpd != user_mpd) { target_memory_p = std::make_shared(mpd); - auto reorder_p = - std::make_shared(*user_memory_p, *target_memory_p); + std::shared_ptr reorder_p; + if (is_INT8) { + mkldnn::primitive_attr + attri; // attribute for int8 weights and bias data reorder. + attri.set_output_scales(mask, scale_data); + + auto reorder_pd = std::shared_ptr( + new mkldnn::reorder::primitive_desc(user_mpd, mpd, attri)); + reorder_p = std::shared_ptr(new mkldnn::reorder( + *reorder_pd, *user_memory_p, *target_memory_p)); + } else { + reorder_p = std::make_shared(*user_memory_p, + *target_memory_p); + } dev_ctx_.SetBlob(key_reorder_p, reorder_p); pipeline.push_back(*reorder_p); } @@ -182,22 +195,56 @@ class MKLDNNHandler { return dims2str(operand_dims) + suffix; } - template + template static void SetDstMemory( const framework::ExecutionContext& ctx, framework::Tensor* output, std::vector dst_tz, const mkldnn::engine& engine, std::shared_ptr& dst_pd, // NOLINT std::shared_ptr& dst_memory) { // NOLINT - M* output_data = output->mutable_data(ctx.GetPlace()); + T* output_data = output->mutable_data(ctx.GetPlace()); auto dst_md = platform::MKLDNNMemDesc( {dst_tz}, paddle::framework::ToMKLDNNDataType( - framework::DataTypeTrait::DataType), + framework::DataTypeTrait::DataType), mkldnn::memory::format::nhwc); dst_pd.reset(new mkldnn::memory::primitive_desc(dst_md, engine)); - dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + dst_memory.reset(new mkldnn::memory(*dst_pd, to_void_cast(output_data))); + } + + static void AppendKey( + std::string* key, const mkldnn::memory::dims& input_dims, + const mkldnn::memory::dims& weights_dims, const std::vector& strides, + const std::vector& paddings, const std::vector& dilations, + const int& groups, const mkldnn::memory::data_type& type, + const mkldnn::memory::format& format, const std::string& suffix) { + AppendKeyDims(key, input_dims); + AppendKeyDims(key, weights_dims); + AppendKeyVec(key, strides); + AppendKeyVec(key, paddings); + AppendKeyVec(key, dilations); + AppendKey(key, std::to_string(groups)); + AppendKey(key, std::to_string(type)); + AppendKey(key, std::to_string(format)); + AppendKey(key, suffix); } protected: + static void AppendKeyDims(std::string* key, + const mkldnn::memory::dims& dims) { + for (unsigned int i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } + } + + static void AppendKeyVec(std::string* key, const std::vector& dims) { + for (unsigned int i = 0; i < dims.size(); i++) { + AppendKey(key, std::to_string(dims[i])); + } + } + + static void AppendKey(std::string* key, const std::string& s) { + key->append(s); + } + static std::string dims2str(const mkldnn::memory::dims& operand_dims) { std::string dstr = ""; for (size_t i = 0; i < operand_dims.size(); ++i) { @@ -215,7 +262,8 @@ class MKLDNNHandler { class TransposeMKLDNNHandler : public MKLDNNHandler { public: - TransposeMKLDNNHandler(std::vector& dims, std::vector& axis, + TransposeMKLDNNHandler(std::vector& dims, // NOLINT + std::vector& axis, // NOLINT const platform::MKLDNNDeviceContext& dev_ctx, mkldnn::engine engine, const std::string& base_key) : platform::MKLDNNHandler(dev_ctx, engine, base_key), @@ -303,8 +351,9 @@ class TransposeMKLDNNHandler : public MKLDNNHandler { } protected: - mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, - std::vector& axis) { + mkldnn_memory_desc_t Axis2MemoryDesc(std::vector& nchw_tz, // NOLINT + std::vector& axis // NOLINT + ) { mkldnn_memory_desc_t mem_fmt; mem_fmt.primitive_kind = mkldnn_memory; @@ -462,21 +511,26 @@ class ConvMKLDNNTemplateHandler : public MKLDNNHandler { std::shared_ptr AcquireWeightsMemoryFromPrimitive( const std::shared_ptr user_weights_memory_p, std::vector& pipeline, // NOLINT - bool is_persistent = false) { + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, int mask = 0) { auto user_weights_pd = user_weights_memory_p->get_primitive_desc(); auto weights_pd = conv_pd_->weights_primitive_desc(); - return this->AcquireMemory(weights_pd, user_weights_pd, - user_weights_memory_p, "@weights_mem_p", - pipeline, is_persistent); + return this->AcquireMemory( + weights_pd, user_weights_pd, user_weights_memory_p, "@weights_mem_p", + pipeline, is_persistent, is_INT8, scale_data, mask); } std::shared_ptr AcquireBiasMemoryFromPrimitive( const std::shared_ptr user_bias_memory_p, - std::vector& pipeline) { // NOLINT + std::vector& pipeline, // NOLINT + bool is_persistent = false, bool is_INT8 = false, + std::vector scale_data = {1.0f}, + int mask = 0) { // NOLINT auto user_bias_pd = user_bias_memory_p->get_primitive_desc(); auto bias_pd = conv_pd_->bias_primitive_desc(); return this->AcquireMemory(bias_pd, user_bias_pd, user_bias_memory_p, - "@bias_mem_p", pipeline); + "@bias_mem_p", pipeline, is_persistent, is_INT8, + scale_data, mask); } std::shared_ptr AcquireConvolution( @@ -594,5 +648,29 @@ using ConvTransposeMKLDNNHandler = ConvMKLDNNTemplateHandler; + +template +static std::shared_ptr SetDstMemory( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler) { + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); + std::shared_ptr dst_memory_p = + handler->AcquireDstMemoryFromPrimitive(to_void_cast(output_data)); + return dst_memory_p; +} + +template +static std::shared_ptr SetDstMemoryHandler( + const framework::ExecutionContext& ctx, framework::Tensor* output, + const std::shared_ptr& handler) { + T* output_data = output->mutable_data( + ctx.GetPlace(), ::paddle::memory::Allocator::kDefault, + handler->GetDstMemorySize()); + std::shared_ptr dst_memory_p; + dst_memory_p->set_data_handle(to_void_cast(output_data)); + return dst_memory_p; +} } // namespace platform } // namespace paddle diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py index a27212f38..ab34a51dd 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_fusion_op.py @@ -51,8 +51,9 @@ class TestConv2dFusionOp(OpTest): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - self.output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + self.output, _, _, _, _ = conv2d_forward_naive( + input, filter, self.groups, conv2d_param) + self.output = self.output.astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py new file mode 100644 index 000000000..ca35adc1a --- /dev/null +++ b/python/paddle/fluid/tests/unittests/test_conv2d_int8_mkldnn_op.py @@ -0,0 +1,228 @@ +# Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import print_function + +import unittest +import numpy as np + +import paddle.fluid.core as core +from op_test import OpTest +from test_conv2d_op import conv2d_forward_naive, TestConv2dOp + + +def conv2d_forward_refer(input, filter, group, conv_param): + out, in_n, out_h, out_w, out_c = conv2d_forward_naive(input, filter, group, + conv_param) + out_tmp = np.zeros((in_n, out_h, out_w, out_c)) + for n in range(in_n): + for i in range(out_h): + for j in range(out_w): + for m in range(out_c): + out_tmp[n, i, j, m] = out[n, m, i, j] + return out_tmp.reshape(in_n, out_c, out_h, out_w) + + +class TestConv2dInt8Op(TestConv2dOp): + def setUp(self): + self.op_type = "conv2d" + self.use_cudnn = False + self.exhaustive_search = False + self.use_cuda = False + self.use_mkldnn = False + self.data_format = "AnyLayout" + self.weighttype = np.float32 + self.use_mkldnn = True + self.init_group() + self.init_dilation() + self.init_test_case() + self.init_dtype() + + conv2d_param = { + 'stride': self.stride, + 'pad': self.pad, + 'dilation': self.dilations + } + + filter = np.random.random(self.filter_size).astype(self.weighttype) + if self.srctype == np.uint8: + input = np.random.randint(0, 10, + self.input_size).astype(self.srctype) + else: + input = np.random.randint(-5, 5, + self.input_size).astype(self.srctype) + input_shift = (np.ones(self.input_size) * 128).astype(np.uint8) + + if self.srctype == np.int8: + filter_int = np.round(filter * self.scale_weights[0] * + 0.5).astype(np.int32) + scale_output_shift = self.scale_out / (self.scale_in * + self.scale_weights[0] * 0.5) + output1 = conv2d_forward_refer( + np.round((input.astype(np.int32) + input_shift) * + self.scale_in).astype(np.int32), filter_int, + self.groups, + conv2d_param).astype(np.float32) * scale_output_shift + output2 = conv2d_forward_refer( + np.round((input_shift) * self.scale_in).astype(np.int32), + filter_int, self.groups, + conv2d_param).astype(np.float32) * scale_output_shift + output = np.round(output1 - output2).astype(self.dsttype) + else: + filter_int = np.round(filter * + self.scale_weights[0]).astype(np.int32) + scale_output_shift = self.scale_out / (self.scale_in * + self.scale_weights[0]) + output1 = conv2d_forward_refer( + input.astype(np.int32), filter_int, self.groups, + conv2d_param).astype(np.float32) + output = np.round(output1 * scale_output_shift).astype(self.dsttype) + + self.inputs = { + 'Input': + OpTest.np_dtype_to_fluid_dtype(input.astype(self.srctype)), + 'Filter': OpTest.np_dtype_to_fluid_dtype(filter) + } + self.attrs = { + 'strides': self.stride, + 'paddings': self.pad, + 'groups': self.groups, + 'dilations': self.dilations, + 'use_cudnn': self.use_cudnn, + 'use_mkldnn': self.use_mkldnn, + 'data_format': self.data_format, + 'exhaustive_search': self.exhaustive_search, + 'Scale_in': self.scale_in, + 'Scale_out': self.scale_out, + 'Scale_weights': self.scale_weights, + } + self.outputs = {'Output': output} + + def test_check_output(self): + self.check_output_with_place(core.CPUPlace(), atol=0) + + def test_check_grad(self): + pass + + def test_check_grad_no_filter(self): + pass + + def test_check_grad_no_input(self): + pass + + def init_test_case(self): + TestConv2dOp.init_test_case(self) + f_c = self.input_size[1] // self.groups + self.filter_size = [1, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + def init_dtype(self): + self.srctype = np.uint8 + self.dsttype = np.int8 + + +#--------------------test conv2d u8 in and s8 out-------------------- + + +class TestConv2d(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 5, 5] # NCHW + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + +class TestWithPad(TestConv2d): + def init_test_case(self): + TestConv2d.init_test_case(self) + self.pad = [1, 1] + + +class TestWithGroup(TestConv2d): + def init_group(self): + self.groups = 3 + + +class TestWithStride(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [1, 1] + self.stride = [2, 2] + self.input_size = [2, 3, 6, 6] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 3, 3] + self.scale_in = 1.0 + self.scale_out = 0.8 + self.scale_weights = [10.0] + + +class TestWith1x1(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [1, 3, 5, 5] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [12.0] + + +class TestWithInput1x1Filter1x1(TestConv2dInt8Op): + def init_test_case(self): + self.pad = [0, 0] + self.stride = [1, 1] + self.input_size = [2, 3, 1, 1] + assert np.mod(self.input_size[1], self.groups) == 0 + f_c = self.input_size[1] // self.groups + self.filter_size = [6, f_c, 1, 1] + self.scale_in = 1.0 + self.scale_out = 0.5 + self.scale_weights = [10.0] + + def init_group(self): + self.groups = 3 + + +#--------------------test conv2d s8 in and s8 out-------------------- + + +def create_test_int8_class(parent): + class TestInt8Case(parent): + def init_dtype(self): + self.srctype = np.int8 + self.dsttype = np.int8 + + cls_name = "{0}_{1}".format(parent.__name__, "s8s8") + TestInt8Case.__name__ = cls_name + globals()[cls_name] = TestInt8Case + + +create_test_int8_class(TestConv2dInt8Op) +create_test_int8_class(TestWithPad) +create_test_int8_class(TestWithStride) +create_test_int8_class(TestWithGroup) +create_test_int8_class(TestWith1x1) +create_test_int8_class(TestWithInput1x1Filter1x1) + +if __name__ == '__main__': + unittest.main() diff --git a/python/paddle/fluid/tests/unittests/test_conv2d_op.py b/python/paddle/fluid/tests/unittests/test_conv2d_op.py index bcb79f232..25a9e8d46 100644 --- a/python/paddle/fluid/tests/unittests/test_conv2d_op.py +++ b/python/paddle/fluid/tests/unittests/test_conv2d_op.py @@ -60,7 +60,7 @@ def conv2d_forward_naive(input, filter, group, conv_param): np.sum(input_pad_masked * f_sub[k, :, :, :], axis=(1, 2, 3)) - return out + return out, in_n, out_h, out_w, out_c class TestConv2dOp(OpTest): @@ -85,8 +85,9 @@ class TestConv2dOp(OpTest): input = np.random.random(self.input_size).astype(self.dtype) filter = np.random.random(self.filter_size).astype(self.dtype) - output = conv2d_forward_naive(input, filter, self.groups, - conv2d_param).astype(self.dtype) + output, _, _, _, _ = conv2d_forward_naive(input, filter, self.groups, + conv2d_param) + output = output.astype(self.dtype) self.inputs = { 'Input': OpTest.np_dtype_to_fluid_dtype(input), -- GitLab