diff --git a/cmake/operators.cmake b/cmake/operators.cmake index 7d75690f6b791032e6577916bbf008a4a1983950..015e813ecf0cfc0e9da1318237ca4231edb8f0bc 100644 --- a/cmake/operators.cmake +++ b/cmake/operators.cmake @@ -186,10 +186,14 @@ function(op_library TARGET) file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n") - elseif(${MKLDNN_FILE} STREQUAL "transpose_mkldnn_op") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n") - file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n") + elseif(${MKLDNN_FILE} STREQUAL "transpose_mkldnn_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n") file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, U8);\n") + elseif(${MKLDNN_FILE} STREQUAL "fc_mkldnn_op") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, FP32);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, S8);\n") + file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, U8);\n") else() file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n") endif() diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc index 4598cd8c5f7d3ca6ce2d634db125e851b1c95450..bc04c6f5b1259cf80a45d76bc28084360a236da2 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.cc +++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc @@ -905,15 +905,17 @@ PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x, auto *fc_op = pattern->NewNode(fc_repr())->assert_is_op("fc"); // Create variables + // Input + auto *input_var = pattern->NewNode(input_repr()) + ->AsInput() + ->assert_is_op_input("fc", "Input"); // Filter auto *fc_weight_var = pattern->NewNode(weights_repr()) ->AsInput() - ->assert_is_persistable_var() ->assert_is_op_input("fc", "W"); // Bias auto *fc_bias_var = pattern->NewNode(bias_repr()) ->AsInput() - ->assert_is_persistable_var() ->assert_is_op_input("fc", "Bias"); // Output auto *fc_out_var = pattern->NewNode(output_repr()) @@ -921,7 +923,8 @@ PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x, ->assert_is_op_output("fc", "Out") ->assert_is_only_output_of_op("fc"); - fc_op->LinksFrom({x, fc_weight_var, fc_bias_var}).LinksTo({fc_out_var}); + fc_op->LinksFrom({input_var, fc_weight_var, fc_bias_var}) + .LinksTo({fc_out_var}); return fc_out_var; } diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h index e6748cd34cbbe289ad8fd6183f8b3f1e6c8bcb9e..d13af21b415f74fd4c738830f1795d04cc0ca4b7 100644 --- a/paddle/fluid/framework/ir/graph_pattern_detector.h +++ b/paddle/fluid/framework/ir/graph_pattern_detector.h @@ -517,6 +517,7 @@ struct FCMKLDNN : public PatternBase { // declare operator node's name PATTERN_DECL_NODE(fc); // declare variable node's name + PATTERN_DECL_NODE(input); PATTERN_DECL_NODE(weights); PATTERN_DECL_NODE(bias); PATTERN_DECL_NODE(output); diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc index b743dee8b18a9407c96e2c42342fd963eb949078..ca992558955fbce3702e792eb23271d426835e6d 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc @@ -17,6 +17,7 @@ #include #include #include "paddle/fluid/framework/eigen.h" +#include "paddle/fluid/platform/errors.h" #include "paddle/fluid/string/pretty_log.h" namespace paddle { @@ -43,6 +44,13 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name, double scale_to_one, bool is_unsigned, std::string scale_attr_name) const { + auto inputs = op->Op()->InputNames(); + bool name_found = + std::find(inputs.begin(), inputs.end(), input_name) != inputs.end(); + PADDLE_ENFORCE_EQ( + name_found, true, + platform::errors::InvalidArgument("%s isn't the input of the %s operator", + input_name, op->Op()->Type())); unsigned max = is_unsigned ? U8_MAX : S8_MAX; float scale = scale_to_one * max; @@ -122,6 +130,13 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output, std::string output_name, double scale_to_one, bool is_unsigned, std::string scale_attr_name) const { + auto outputs = op->Op()->OutputNames(); + bool name_found = + std::find(outputs.begin(), outputs.end(), output_name) != outputs.end(); + PADDLE_ENFORCE_EQ(name_found, true, + platform::errors::InvalidArgument( + "%s isn't the output of the %s operator", output_name, + op->Op()->Type())); unsigned max = is_unsigned ? U8_MAX : S8_MAX; float scale = scale_to_one * max; @@ -228,6 +243,66 @@ void CPUQuantizePass::QuantizeConv(Graph* graph, PrettyLogDetail(msg_ss.str().c_str()); } +void CPUQuantizePass::QuantizeFc(Graph* graph) const { + GraphPatternDetector gpd; + auto pattern = gpd.mutable_pattern(); + patterns::FCMKLDNN fc_pattern{pattern, name_scope_}; + auto* fc_input = gpd.mutable_pattern() + ->NewNode("fc_quantizer/input") + ->AsInput() + ->assert_is_op_input("fc", "Input"); + fc_pattern(fc_input, false); + + int quantize_fc_count = 0; + auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, + Graph* g) { + VLOG(4) << "Quantize fc op"; + GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern); + auto* fc_op_desc = fc->Op(); + + // skip if should not be quantized + if (fc_op_desc->GetAttrIfExists("use_quantizer") != true || + fc_op_desc->GetAttrIfExists("use_mkldnn") != true) + return; + + GET_IR_NODE_FROM_SUBGRAPH(weights, weights, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input, input, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(output, output, fc_pattern); + + // get scales calculated after warmup, they scale variables to MAX=1.0 + auto scales = Get("quant_var_scales"); + + auto input_scale = scales[input->Name()].second.data()[0]; + bool is_input_unsigned = scales[input->Name()].first; + QuantizeInput(g, fc, input, "Input", input_scale, is_input_unsigned, + "Scale_in"); + + auto weight_scale_tensor = scales[weights->Name()].second; + EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data(), + weight_scale_tensor.numel(), 1}; + eigen_tensor *= static_cast(S8_MAX); + std::vector filter_scale{ + weight_scale_tensor.data(), + weight_scale_tensor.data() + weight_scale_tensor.numel()}; + + fc->Op()->SetAttr("Scale_weights", filter_scale); + + auto output_scale = scales[output->Name()].second.data()[0]; + bool is_output_unsigned = scales[output->Name()].first; + DequantizeOutput(g, fc, output, "Out", output_scale, is_output_unsigned, + "Scale_out"); + + ++quantize_fc_count; + }; + + gpd(graph, handler); + AddStatis(quantize_fc_count); + + std::stringstream msg_ss; + msg_ss << "--- quantized " << quantize_fc_count << " fc ops"; + PrettyLogDetail(msg_ss.str().c_str()); +} + void CPUQuantizePass::QuantizePool(Graph* graph) const { GraphPatternDetector gpd; auto pattern = gpd.mutable_pattern(); @@ -418,6 +493,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const { QuantizeConcat(graph); QuantizePriorBox(graph); QuantizeTranspose(graph); + QuantizeFc(graph); } } // namespace ir diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h index d1b23227b6819ac964112f6aa06d5f986acb6b33..fd213d55be4fc7c6dc2552566f5d3b8091854bd4 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h @@ -46,6 +46,8 @@ class CPUQuantizePass : public FusePassBase { void QuantizeConv(Graph* graph, bool with_residual_data = false) const; + void QuantizeFc(Graph* graph) const; + void QuantizePool(Graph* graph) const; void QuantizeConcat(Graph* graph) const; diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc index 00208c9bba296d6535025b3927a93daf1b459efa..923beb139099fae515eb74fb401d58c24b436117 100644 --- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc +++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc @@ -62,6 +62,10 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, if (inputs.size() > 1) op->SetInput("W", {inputs[1]}); if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]}); op->SetOutput("Out", {outputs[0]}); + op->SetAttr("use_quantizer", use_quantizer); + op->SetAttr("Scale_in", 1.0f); + op->SetAttr("Scale_out", 1.0f); + op->SetAttr("Scale_weights", std::vector{1.0f}); } else if (type == "concat") { op->SetInput("X", inputs); op->SetOutput("Out", outputs); @@ -71,13 +75,13 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name, namespace { static const std::initializer_list variable_names{ - "a", "w1", "c", "d", "w2", "e", "f", "g", - "h", "w3", "b1", "i", "j", "w4", "b2"}; + "a", "w1", "c", "d", "w2", "e", "f", "g", "h", + "w3", "b1", "i", "j", "w4", "b2", "w5", "b3"}; // (a,w1)->Conv1->c and c->Pool1->d // // (d,w2)->Conv2->e and e->Pool2->f // -// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j +// d->Dropout1->g and (g, w5, b3)->Fc1->h and (h,w3,b1,i)->Conv3->j // // (d,w4, b2)->Conv4->i ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) { @@ -98,7 +102,8 @@ ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) { SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer); SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn); - SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn); + SetOp(&prog, "fc", "Fc1", {"g", "w5", "b3"}, {"h"}, use_mkldnn, + use_quantizer); SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn, use_quantizer); @@ -194,13 +199,13 @@ TEST(CpuQuantizePass, quantize) { // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f // - // d->Dropout1->g and g->Fc1->h and + // d->Dropout1->g and (g->QUANT8->IN8,w5,b3)->Fc1->OUT7->DEQUANT7->h and // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j // // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i - // Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT - int added_nodes = 7 + 7 + 6 + 6; - MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes, + // Insert nodes: 8 Quant + 8 IN + 7 OUT + 7 DEQUANT + int added_nodes = 8 + 8 + 7 + 7; + MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 8, 7, added_nodes, 2.0f * 127); } diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc index 9cc2d3da3fceea06a7a753a88577fdff59a4a136..9b71e2abd759230dbebee9bf004d0f968bee7cc2 100644 --- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc +++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc @@ -26,12 +26,11 @@ namespace framework { namespace ir { void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const { - PADDLE_ENFORCE(graph); + PADDLE_ENFORCE_NOT_NULL(graph, + platform::errors::InvalidArgument( + "Pointer to graph argument should not be NULL.")); Init("fc_mkldnn_pass", graph); - auto* scope = param_scope(); - PADDLE_ENFORCE(scope); - GraphPatternDetector gpd; auto* x = gpd.mutable_pattern() ->NewNode("fc_mkldnn_pass/x") @@ -49,18 +48,25 @@ void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const { return; } GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern); + GET_IR_NODE_FROM_SUBGRAPH(input, input, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(weights, weights, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(bias, bias, fc_pattern); GET_IR_NODE_FROM_SUBGRAPH(output, output, fc_pattern); OpDesc* desc = fc->Op(); - auto in_size = fc->inputs[0]->Var()->GetShape().size(); - if (in_size != 2 && in_size != 4) { + auto dims = fc->inputs[0]->Var()->GetShape(); + auto dim_num = dims.size(); + bool are_dims_supported = dim_num == 2 || dim_num == 4; + constexpr size_t height_axis = 2; + constexpr size_t width_axis = 3; + bool is_size_supported = + dim_num == 4 ? (dims[width_axis] == 1 && dims[height_axis] == 1) : true; + if (!are_dims_supported || !is_size_supported) { VLOG(3) << "Do not enable FC MKL-DNN for dimensions different than 2 & 4"; + VLOG(3) << "Or when width and height are different than one"; return; } desc->SetAttr("use_mkldnn", true); - PADDLE_ENFORCE(subgraph.count(x)); found_fc_count++; }; diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc index e990b2c7736ae51a1ac2ba2fd15362012288b9bb..dea448f9b03468eabda16d4375ea60348a09efb2 100644 --- a/paddle/fluid/inference/api/analysis_predictor_tester.cc +++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc @@ -276,7 +276,7 @@ class MkldnnQuantizerTest : public testing::Test { std::pair GetMaxChScalingFactor( const framework::LoDTensor& var_tensor, bool is_unsigned) const { - return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned); + return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned, 0); } std::pair GetKLScalingFactor( diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc index 94c556ce52d61258475e4e9cc497b23b073938fc..dc8559e324e047cdf4addaf8fdc04ec0949742d0 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc @@ -37,6 +37,11 @@ using framework::LoDTensor; using framework::ir::Graph; using ConstEigenVectorArrayMap = Eigen::Map>; +using EigenMatrixDoubleArray = + Eigen::Array; +using EigenMatrixArray = + Eigen::Array; +using ConstEigenMatrixArrayMap = Eigen::Map; using string::PrettyLogH1; static LoDTensor CreateScaleTensor(int64_t channels_num = 1); @@ -66,7 +71,7 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() { bool is_unsigned = false; bool compute_scale = true; if (is_output) { - if (op->Type() == "conv2d") { + if (op->Type() == "conv2d" || op->Type() == "fc") { // output of conv2d with relu must be unsigned std::string fuse_activation = op->GetAttrIfExists("fuse_activation"); @@ -138,7 +143,12 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale( scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned); break; case ScaleAlgo::MAX_CH: - scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned); + scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned, + /*is_transposed*/ false); + break; + case ScaleAlgo::MAX_CH_T: + scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned, + /*is_transposed*/ true); break; case ScaleAlgo::KL: scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned); @@ -319,7 +329,7 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor( std::pair AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( - const LoDTensor& var_tensor, bool is_unsigned) const { + const LoDTensor& var_tensor, bool is_unsigned, bool is_transposed) const { PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty."); ConstEigenVectorArrayMap eigen_tensor{var_tensor.data(), @@ -331,18 +341,23 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor( "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0", min_val); - int channels = var_tensor.dims()[0]; - LoDTensor scale_tensor = CreateScaleTensor(channels); - auto* scale_ptr = scale_tensor.mutable_data(CPUPlace()); - - for (int i = 0; i < channels; ++i) { - const auto tensor = var_tensor.Slice(i, i + 1); + auto dims = var_tensor.dims(); + constexpr int num_col_dims = 1; + auto flattened_dims = framework::flatten_to_2d(dims, num_col_dims); + ConstEigenMatrixArrayMap eigen_tensor_mat{ + var_tensor.data(), flattened_dims[0], flattened_dims[1]}; - ConstEigenVectorArrayMap eigen_tensor{tensor.data(), tensor.numel(), - 1}; - float max_abs = eigen_tensor.abs().maxCoeff(); - scale_ptr[i] = 1.0 / max_abs; + EigenMatrixDoubleArray scales; + if (is_transposed) { + scales = 1.0 / eigen_tensor_mat.cast().abs().colwise().maxCoeff(); + } else { + scales = 1.0 / eigen_tensor_mat.cast().abs().rowwise().maxCoeff(); } + int output_channel_axis = is_transposed; + int channels = dims[output_channel_axis]; + LoDTensor scale_tensor = CreateScaleTensor(channels); + auto* scale_ptr = scale_tensor.mutable_data(CPUPlace()); + std::copy(scales.data(), scales.data() + scales.size(), scale_ptr); return std::make_pair(is_unsigned, scale_tensor); } diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h index 6c438265f0b8e2a65c0475f0b11064042549269e..eeaba7952902b04e83cc5b6c890ecf510b914c65 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer.h +++ b/paddle/fluid/inference/api/mkldnn_quantizer.h @@ -79,7 +79,8 @@ class AnalysisPredictor::MkldnnQuantizer { const framework::LoDTensor& var_tensor, bool is_unsigned) const; std::pair GetMaxChScalingFactor( - const framework::LoDTensor& var_tensor, bool is_unsigned) const; + const framework::LoDTensor& var_tensor, bool is_unsigned, + bool is_transposed) const; std::pair GetMaxScalingFactor( const framework::LoDTensor& var_tensor, bool is_unsigned) const; diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc index b7d6c87fd99890241cff5d3074584f8f74e3143a..4f477071ba616e4a8a36196ef8391367424bae0e 100644 --- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc +++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc @@ -37,6 +37,11 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() { rules_["transpose2"]["X"] = ScaleAlgo::KL; rules_["transpose2"]["Out"] = ScaleAlgo::NONE; + + rules_["fc"]["Input"] = ScaleAlgo::KL; + rules_["fc"]["W"] = ScaleAlgo::MAX_CH_T; + rules_["fc"]["Bias"] = ScaleAlgo::NONE; + rules_["fc"]["Out"] = ScaleAlgo::KL; } ScaleAlgo MkldnnQuantizerConfig::scale_algo( diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h index d46f842de7a2277ee5d00672386b12af7ba28deb..2ac09b82138ecaf3663a3783633c9cbf50da16bd 100644 --- a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h +++ b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h @@ -26,10 +26,12 @@ namespace paddle { // Algorithms for finding scale of quantized Tensors. enum class ScaleAlgo { - NONE, // Do not compute scale - MAX, // Find scale based on the maximum absolute value - MAX_CH, // Find scale based on the maximum absolute value per channel - KL, // Find scale based on KL Divergence + NONE, // Do not compute scale + MAX, // Find scale based on the max absolute value + MAX_CH, // Find scale based on the max absolute value per output channel + MAX_CH_T, // Find scale based on the max absolute value per output channel + // of a transposed tensor + KL, // Find scale based on KL Divergence }; struct MkldnnQuantizerConfig { diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc index 46ea4b6bb84c3cdf97c609230139b7dae98c7873..38af3149868a12bb6bca8842c8192ee672e63180 100644 --- a/paddle/fluid/operators/fc_op.cc +++ b/paddle/fluid/operators/fc_op.cc @@ -93,13 +93,21 @@ class FCOp : public framework::OperatorWithKernel { const framework::ExecutionContext& ctx) const override { framework::LibraryType library = framework::LibraryType::kPlain; framework::DataLayout layout = framework::DataLayout::kAnyLayout; + int customized_type_value = + framework::OpKernelType::kDefaultCustomizedTypeValue; + auto input_data_type = + OperatorWithKernel::IndicateVarDataType(ctx, "Input"); if (ctx.Attr("use_mkldnn")) { library = framework::LibraryType::kMKLDNN; layout = framework::DataLayout::kMKLDNN; + using framework::proto::VarType; + customized_type_value = (input_data_type == VarType::INT8 || + input_data_type == VarType::UINT8) + ? kFCMKLDNNINT8 + : kFCMKLDNNFP32; } - return framework::OpKernelType( - OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace(), - layout, library); + return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout, + library, customized_type_value); } }; @@ -132,6 +140,27 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker { AddAttr(framework::kAllKernelsMustComputeRuntimeShape, "Skip calling InferShape() function in the runtime.") .SetDefault(true); + /* int8 parameters */ + AddAttr("use_quantizer", + "(bool, default false) " + "Set to true for operators that should be quantized and use " + "int8 kernel. " + "Only used on CPU.") + .SetDefault(false); + AddAttr("Scale_in", + "(float, default 1.0f), The quantize scale of input data") + .SetDefault(1.0f); + AddAttr>("Scale_weights", + "(std::vector, default {1.0f}), The " + "quantize scale of weights data") + .SetDefault({1.0f}); + AddAttr("Scale_out", + "(float, default 1.0f), The quantize scale of output data") + .SetDefault(1.0f); + AddAttr("force_fp32_output", + "(bool, default false) Force INT8 kernel output FP32, only " + "used in MKL-DNN INT8") + .SetDefault(false); AddComment(R"DOC( Fully Connected Operator. diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h index 54a81812c25707393619f6aae8e4b26ab6b0b5ef..907f61196d61b12106fc526cbcabf862f870b511 100644 --- a/paddle/fluid/operators/fc_op.h +++ b/paddle/fluid/operators/fc_op.h @@ -21,6 +21,7 @@ limitations under the License. */ namespace paddle { namespace operators { +enum { kFCMKLDNNFP32 = 1, kFCMKLDNNINT8 = 2 }; using Tensor = framework::Tensor; diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc index 1d451915a150c76c21f201c8d8d6f83cde5d11be..a9f8ed74c3da5faf9744e86ba0e349d30eeae980 100644 --- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc @@ -78,7 +78,6 @@ class DeQuantOpKernel : public framework::OpKernel { auto dst_md = platform::MKLDNNMemDesc( {dst_tz}, memory::data_type::f32, platform::MKLDNNFormatForSize(dst_tz.size(), memory::format::nchw)); - auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine); dst_memory = std::make_shared( dst_pd, to_void_cast(output_data)); diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc index dfaf47653fac51c1aa7d2150b80efe0726ef36eb..458d56e40239bfea50f27cd8ebe69f1c12be83fe 100644 --- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc +++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc @@ -37,7 +37,7 @@ using mkldnn::primitive; using mkldnn::stream; using mkldnn::prop_kind; -template +template class FCPrimitiveFactory { public: explicit FCPrimitiveFactory(const mkldnn::engine& engine) : engine_(engine) {} @@ -47,19 +47,29 @@ class FCPrimitiveFactory { const Tensor* bias, LoDTensor* output, const ExecutionContext& ctx) { RecomputeOutputDims(ctx, input, weights, output); + // If primitive has already been created and cached, don't create new one, + // but update input and output data pointers and return it. if (fc_) { UpdateDataPointers(ctx, output, input); return *fc_; } - auto src_desc = CreateMemDescriptor(input, input->format()); - input_ = CreateMemory(src_desc, input); + auto src_desc = CreateMemDescriptor(input, input->format()); + input_ = CreateMemory(src_desc, input); + // Since MKL-DNN doesn't support 4D column-major data formats in + // inner_product + // primitive, transpose the weights to be in row-major format weights_ = TransposeWeights(weights); if (src_desc.data.ndims == 4) { weights_ = CreateFourDimWeightsMemory(input, weights); } + // If int8 data type is desired, weights are quantized to signed int8 + QuantizeWeights(ctx); - auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any); + // Choose MKLDNNMemoryFormat::any so that MKL-DNN can determine itself what + // is the best format for output during the creation of inner product + // primitive descriptor + auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any); fc_ = CreateFcPrimitive(*input_, *weights_, dst_desc, bias, output, ctx); return *fc_; @@ -68,14 +78,18 @@ class FCPrimitiveFactory { private: void UpdateDataPointers(const ExecutionContext& ctx, Tensor* out, const Tensor* in) { - input_->set_data_handle(const_cast(in->data())); - output_->set_data_handle(out->mutable_data(ctx.GetPlace())); + input_->set_data_handle(to_void_cast(in->data())); + output_->set_data_handle(out->mutable_data(ctx.GetPlace())); + // If the primitive exists, but the output tensor has changed its + // variable, update its format to what has been determined in first + // call to CreateFcPrimitive method. if (out->format() == MKLDNNMemoryFormat::format_undef) { auto output_format = platform::GetMKLDNNFormat(*output_); out->set_format((MKLDNNMemoryFormat)output_format); } } + // Choose weight memory format based on input memory format MKLDNNMemoryFormat MatchWeightFormat(MKLDNNMemoryFormat fmt) { using format = MKLDNNMemoryFormat; switch (fmt) { @@ -85,11 +99,14 @@ class FCPrimitiveFactory { return format::oIhw8i; case format::nchw: return format::oihw; + case format::nhwc: + return format::hwio; default: return format::format_undef; } } + // Convert data from one data format to another mkldnn::memory Reorder(const memory::desc& src_desc, const memory::desc& dst_desc, const void* src_data) { auto src_mem = memory({src_desc, engine_}, const_cast(src_data)); @@ -101,18 +118,46 @@ class FCPrimitiveFactory { return dst_mem; } + // Convert data from one data format to another and rescale it. + // If the desired data type is (un)signed int8, quantization occurs here. + mkldnn::memory Reorder(const memory& src_mem, + const memory::primitive_desc& dst_pd, + const std::vector& scale_data) { + mkldnn::memory dst_mem = mkldnn::memory(dst_pd); + mkldnn::primitive_attr attributes; + // According to MKL-DNN's documentation mask determines along which + // dimensions should the scale be applied. + // 0 - Single scale applied to whole tensor + // 1 - Apply Scale along a slice of each dimension which index is 1. + // In case of weights quantization, that dimension is output, + // becuase we perform per-output-channel quantization + int mask = CreateMask(0, scale_data.size() > 1); + attributes.set_output_scales(mask, scale_data); + auto reorder = + mkldnn::reorder(mkldnn::reorder::primitive_desc( + src_mem.get_primitive_desc(), dst_pd, attributes), + src_mem, dst_mem); + + stream(stream::kind::eager).submit({reorder}).wait(); + + return dst_mem; + } + + template static mkldnn::memory::desc CreateMemDescriptor(const std::vector& dims, MKLDNNMemoryFormat format) { return platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType(), format); } + template static mkldnn::memory::desc CreateMemDescriptor(const Tensor* tensor, MKLDNNMemoryFormat format) { auto dims = framework::vectorize(tensor->dims()); - return CreateMemDescriptor(dims, format); + return CreateMemDescriptor(dims, format); } + template mkldnn::memory CreateMemory(const mkldnn::memory::desc& desc, const Tensor* tensor) { return CreateMemory(desc, tensor->data()); @@ -123,12 +168,102 @@ class FCPrimitiveFactory { return memory({desc, engine_}, const_cast(data)); } + // Transpose weights through MKL-DNN's reorder from io to oi format. mkldnn::memory TransposeWeights(const Tensor* weights) { auto dims = framework::vectorize(weights->dims()); std::swap(dims[0], dims[1]); // Correct output dimensions - auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); - auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi); - return Reorder(src_desc, dst_desc, weights->data()); + auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io); + auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi); + return Reorder(src_desc, dst_desc, weights->data()); + } + + // Compute the bias scales so that its values correspond to the + // scale of data being an output of weights and input multiplication + std::vector ComputeBiasScales(const ExecutionContext& ctx) { + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + const size_t weight_scales_num = scale_weights_data.size(); + std::vector bias_scales(weight_scales_num); + +#pragma omp parallel for + for (size_t i = 0; i < weight_scales_num; i++) { + if (scale_weights_data[i] == 0.0) + bias_scales[i] = 1.0f; + else + bias_scales[i] = scale_in_data * scale_weights_data[i]; + } + + return bias_scales; + } + + // Correct output scale, to take into account scaling of input and weights + // Since the data that comes out of input and weight multiplication is + // scaled with its own scales, this data needs to be divided by + // those scales to normalise them back to what their floating-point range + // was. Then we multiply them by desired output scale we want on the output. + std::vector ComputeOutputShiftScale(const ExecutionContext& ctx) { + auto scale_in_data = ctx.Attr("Scale_in"); + auto scale_weights_data = ctx.Attr>("Scale_weights"); + // If the output will be in floats, we don't multiply by scale_out. + auto scale_out_data = ctx.Attr("force_fp32_output") + ? 1.0f + : ctx.Attr("Scale_out"); + const size_t weight_scales_num = scale_weights_data.size(); + std::vector output_shift_scale(weight_scales_num); + +#pragma omp parallel for + for (size_t i = 0; i < weight_scales_num; i++) { + if (scale_weights_data[i] == 0.0) + output_shift_scale[i] = scale_out_data; + else + output_shift_scale[i] = + scale_out_data / (scale_in_data * scale_weights_data[i]); + } + + return output_shift_scale; + } + + // Computing MKL-DNN's scaling mask which determines along which dimension + // slice should the scaling be applied. For more data plase refer to: + // https://intel.github.io/mkl-dnn/group__c__api__attributes.html + // Section dnnl_status_t DNNL_API dnnl_primitive_attr_set_output_scales + int CreateMask(int slice_dimension, bool is_multi_channel_quantizied) { + return is_multi_channel_quantizied ? 1 << slice_dimension : 0; + } + + void QuantizeWeights(const ExecutionContext& ctx) { + auto quantized_desc = weights_->get_primitive_desc().desc(); + quantized_desc.data.data_type = + (mkldnn_data_type_t)platform::MKLDNNGetDataType(); + weights_ = Reorder(*weights_, {quantized_desc, engine_}, + ctx.Attr>("Scale_weights")); + } + + void QuantizeBias(const inner_product_forward::primitive_desc& fc_prim_desc, + const ExecutionContext& ctx) { + auto bias_scales = ComputeBiasScales(ctx); + bias_ = Reorder(*bias_, fc_prim_desc.bias_primitive_desc(), bias_scales); + } + + // Fuse relu into FC with activation type attribute has been set to 'relu' + mkldnn::primitive_attr CreatePostOps(const ExecutionContext& ctx) { + mkldnn::primitive_attr attributes; + mkldnn::post_ops post_operations; + + auto output_shift_scale = ComputeOutputShiftScale(ctx); + int mask = CreateMask(1, output_shift_scale.size() > 1); + attributes.set_output_scales(mask, output_shift_scale); + + if (ctx.Attr("activation_type") == "relu") { + constexpr float scale = 1.0f; + constexpr float negative_slope = 0.0f; + constexpr float placeholder = 1.0f; // beta + post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu, + negative_slope, placeholder); + } + + attributes.set_post_ops(post_operations); + return attributes; } inner_product_forward CreateFcPrimitive(const memory& src_memory, @@ -136,21 +271,34 @@ class FCPrimitiveFactory { const memory::desc& dst_desc, const Tensor* bias, Tensor* output, const ExecutionContext& ctx) { + // Acquire descriptors needed for creation of inner_product primitive + // descriptor const auto weights_desc = weights_memory.get_primitive_desc().desc(); const auto src_desc = src_memory.get_primitive_desc().desc(); + // Based on provided attributes, create attributes used by MKL-DNN to + // enable fused post-op activations such as 'relu' + const auto attrs = CreatePostOps(ctx); + // If bias exists, create inner_product primitive with or without bias if (bias) { - auto bias_desc = CreateMemDescriptor(bias, bias->format()); - bias_ = CreateMemory(bias_desc, bias); + auto bias_desc = CreateMemDescriptor(bias, bias->format()); + bias_ = CreateMemory(bias_desc, bias); + // Create inner_product descriptor. At this point the format of output + // is determined. auto fc_prim_desc = - CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc); + CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs); + // If int8 is desired, quantize bias into 32-bit signed int + QuantizeBias(fc_prim_desc, ctx); + // Based on format determined by inner_product, create output in desired + // memory format output_ = CreateDstMemory(fc_prim_desc, ctx, output); + // Return MKL-DNN primitive ready to be fed into pipeline and executed return inner_product_forward(fc_prim_desc, src_memory, weights_memory, *bias_, *output_); } else { - auto fc_prim_desc = CreateFcPrimDesc(src_desc, weights_desc, dst_desc); - + auto fc_prim_desc = + CreateFcPrimDesc(src_desc, weights_desc, dst_desc, attrs); output_ = CreateDstMemory(fc_prim_desc, ctx, output); return inner_product_forward(fc_prim_desc, src_memory, weights_memory, @@ -162,24 +310,39 @@ class FCPrimitiveFactory { const mkldnn::memory::desc& input_desc, const mkldnn::memory::desc& weights_desc, const mkldnn::memory::desc& bias_desc, - const mkldnn::memory::desc& dst_desc) { + const mkldnn::memory::desc& dst_desc, + const mkldnn::primitive_attr& attrs) { auto fc_desc = inner_product_forward::desc(prop_kind::forward_scoring, input_desc, weights_desc, bias_desc, dst_desc); - return inner_product_forward::primitive_desc(fc_desc, engine_); + return inner_product_forward::primitive_desc(fc_desc, attrs, engine_); } mkldnn::inner_product_forward::primitive_desc CreateFcPrimDesc( const mkldnn::memory::desc& input_desc, const mkldnn::memory::desc& weights_desc, - const mkldnn::memory::desc& dst_desc) { + const mkldnn::memory::desc& dst_desc, + const mkldnn::primitive_attr& attrs) { auto fc_desc = inner_product_forward::desc(prop_kind::forward, input_desc, weights_desc, dst_desc); - return inner_product_forward::primitive_desc(fc_desc, engine_); + return inner_product_forward::primitive_desc(fc_desc, attrs, engine_); } + // Since MKL-DNN requires the number of input dimensions to be + // equal to the number of weight dimensions, we have to convert + // weights to 4D memory if input is 4D. It also requires that + // all dimensions of weights and inputs agree, with an exception + // for the batch size and number of output channels (the first dim). + // In order to perform that we have to prepare the memory descriptor + // by hand, as MKL-DNN's reorder does not support conversion + // from one dimensionality to another. Hence, we set + // the first dimension of weights to resemble number of outputs + // and then we use the sizes of number of input channels as well + // as image width and height for latter dimensions. Then we create + // memories, find a format corresponding with input format and + // perform a converion. mkldnn::memory CreateFourDimWeightsMemory(const Tensor* input, const Tensor* weights) { auto input_dims = framework::vectorize(input->dims()); @@ -187,19 +350,22 @@ class FCPrimitiveFactory { auto dims = {weight_dims[1], input_dims[1], input_dims[2], input_dims[3]}; auto dst_format = MatchWeightFormat(input->format()); - auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oihw); - auto dst_desc = CreateMemDescriptor(dims, dst_format); + auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oihw); + auto dst_desc = CreateMemDescriptor(dims, dst_format); return Reorder(src_desc, dst_desc, weights_->get_data_handle()); } + // Create output memory based on output tensor and inner_product + // primitive descriptor format chosen for output mkldnn::memory CreateDstMemory( const mkldnn::inner_product_forward::primitive_desc& fc_prim_desc, const ExecutionContext& ctx, Tensor* output) { auto dst_prim_desc = fc_prim_desc.dst_primitive_desc(); auto buffer_size = dst_prim_desc.get_size(); - T* output_data = output->mutable_data(ctx.GetPlace(), buffer_size); - memory dst_mem(dst_prim_desc, to_void_cast(output_data)); + T_out* output_data = + output->mutable_data(ctx.GetPlace(), buffer_size); + memory dst_mem(dst_prim_desc, to_void_cast(output_data)); output->set_format(platform::GetMKLDNNFormat(dst_mem)); return dst_mem; } @@ -227,30 +393,63 @@ class FCPrimitiveFactory { boost::optional fc_; }; -template -std::shared_ptr> GetPrimitiveFactory( - const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx, - const Tensor* input, const Tensor* weights, - const mkldnn::engine& mkldnn_engine) { +// Attempt to fetch cached primitive factory based on provided parameters +// of input format, weight dimensions and output name. +// If not cached, create a new one. +template +static std::shared_ptr> +GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx, + const ExecutionContext& ctx, const Tensor* input, + const Tensor* weights, + const mkldnn::engine& mkldnn_engine) { const std::string key = platform::CreateKey( + platform::ThreadIDasStr(), input->format(), framework::vectorize(weights->dims()), ctx.op().Output("Out")); auto prim_creator = - std::static_pointer_cast>(dev_ctx.GetBlob(key)); + std::static_pointer_cast>( + dev_ctx.GetBlob(key)); if (prim_creator == nullptr) { - prim_creator = std::make_shared>(mkldnn_engine); + prim_creator = + std::make_shared>(mkldnn_engine); dev_ctx.SetBlob(key, prim_creator); } return prim_creator; } -template -class FCMKLDNNOpKernel : public framework::OpKernel { +// Choose appropriate primitive factory implementation based on inferred +// output type (uint8, int8 or float). +template +static inner_product_forward GetFcPrimitive( + const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx, + const LoDTensor* input, const Tensor* w, const Tensor* bias, + LoDTensor* output, const mkldnn::engine& mkldnn_engine, bool fuse_relu, + bool force_fp32_output) { + constexpr bool is_int8 = + std::is_same::value || std::is_same::value; + if (!is_int8 || force_fp32_output) { + return GetPrimitiveFactory(dev_ctx, ctx, input, w, + mkldnn_engine) + ->CreateFcPrimitive(input, w, bias, output, ctx); + } else if (fuse_relu) { + return GetPrimitiveFactory(dev_ctx, ctx, input, w, + mkldnn_engine) + ->CreateFcPrimitive(input, w, bias, output, ctx); + } else { + return GetPrimitiveFactory(dev_ctx, ctx, input, w, + mkldnn_engine) + ->CreateFcPrimitive(input, w, bias, output, ctx); + } +} + +template +class FCMKLDNNOpKernel : public framework::OpKernel { public: void Compute(const paddle::framework::ExecutionContext& ctx) const override { - PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()), - "It must use CPUPlace."); + PADDLE_ENFORCE_EQ( + platform::is_cpu_place(ctx.GetPlace()), true, + platform::errors::PreconditionNotMet("FC MKL-DNN must use CPUPlace.")); auto& dev_ctx = ctx.template device_context(); const auto& mkldnn_engine = dev_ctx.GetEngine(); @@ -259,9 +458,12 @@ class FCMKLDNNOpKernel : public framework::OpKernel { auto bias = ctx.Input("Bias"); auto output = ctx.Output("Out"); - auto prim_creator = - GetPrimitiveFactory(dev_ctx, ctx, input, w, mkldnn_engine); - auto fc = prim_creator->CreateFcPrimitive(input, w, bias, output, ctx); + bool fuse_relu = ctx.Attr("activation_type") == "relu"; + bool force_fp32_output = ctx.Attr("force_fp32_output"); + + auto fc = + GetFcPrimitive(dev_ctx, ctx, input, w, bias, output, + mkldnn_engine, fuse_relu, force_fp32_output); stream(stream::kind::eager).submit({fc}).wait(); output->set_layout(DataLayout::kMKLDNN); @@ -270,5 +472,18 @@ class FCMKLDNNOpKernel : public framework::OpKernel { } // namespace operators } // namespace paddle -REGISTER_OP_KERNEL(fc, MKLDNN, ::paddle::platform::CPUPlace, - paddle::operators::FCMKLDNNOpKernel); +// Weights of FC are by default stored using fp32, template argument of weight +// data type implies their destination data type. (What's eventually going to +// be used during computations of kernel). +namespace ops = paddle::operators; +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, ::paddle::platform::CPUPlace, + FP32, ops::kFCMKLDNNFP32, + ops::FCMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, ::paddle::platform::CPUPlace, + U8, ops::kFCMKLDNNINT8, + ops::FCMKLDNNOpKernel); + +REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, ::paddle::platform::CPUPlace, + S8, ops::kFCMKLDNNINT8, + ops::FCMKLDNNOpKernel); diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py index b69633c8cd4398858eee0d9caa2fc5e29942cf0e..fb54bf555434fc2df561f10544cb6376effab44c 100644 --- a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py +++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py @@ -42,6 +42,7 @@ class TestFCMKLDNNOp(OpTest): def setUp(self): self.op_type = "fc" + self._cpu_only = True self.use_mkldnn = True self.create_data() self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}