diff --git a/cmake/operators.cmake b/cmake/operators.cmake
index 7d75690f6b791032e6577916bbf008a4a1983950..015e813ecf0cfc0e9da1318237ca4231edb8f0bc 100644
--- a/cmake/operators.cmake
+++ b/cmake/operators.cmake
@@ -186,10 +186,14 @@ function(op_library TARGET)
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, FP32);\n")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, S8);\n")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(conv2d, MKLDNN, U8);\n")
-        elseif(${MKLDNN_FILE} STREQUAL "transpose_mkldnn_op")
-          file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n")
-          file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n")
+      elseif(${MKLDNN_FILE} STREQUAL "transpose_mkldnn_op")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, FP32);\n")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, S8);\n")
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(transpose2, MKLDNN, U8);\n")
+      elseif(${MKLDNN_FILE} STREQUAL "fc_mkldnn_op")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, FP32);\n")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, S8);\n")
+        file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, U8);\n")
       else()
         file(APPEND ${pybind_file} "USE_OP_DEVICE_KERNEL(${TARGET}, MKLDNN);\n")
       endif()
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 4598cd8c5f7d3ca6ce2d634db125e851b1c95450..bc04c6f5b1259cf80a45d76bc28084360a236da2 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -905,15 +905,17 @@ PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x,
 
   auto *fc_op = pattern->NewNode(fc_repr())->assert_is_op("fc");
   // Create variables
+  // Input
+  auto *input_var = pattern->NewNode(input_repr())
+                        ->AsInput()
+                        ->assert_is_op_input("fc", "Input");
   // Filter
   auto *fc_weight_var = pattern->NewNode(weights_repr())
                             ->AsInput()
-                            ->assert_is_persistable_var()
                             ->assert_is_op_input("fc", "W");
   // Bias
   auto *fc_bias_var = pattern->NewNode(bias_repr())
                           ->AsInput()
-                          ->assert_is_persistable_var()
                           ->assert_is_op_input("fc", "Bias");
   // Output
   auto *fc_out_var = pattern->NewNode(output_repr())
@@ -921,7 +923,8 @@ PDNode *patterns::FCMKLDNN::operator()(paddle::framework::ir::PDNode *x,
                          ->assert_is_op_output("fc", "Out")
                          ->assert_is_only_output_of_op("fc");
 
-  fc_op->LinksFrom({x, fc_weight_var, fc_bias_var}).LinksTo({fc_out_var});
+  fc_op->LinksFrom({input_var, fc_weight_var, fc_bias_var})
+      .LinksTo({fc_out_var});
   return fc_out_var;
 }
 
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index e6748cd34cbbe289ad8fd6183f8b3f1e6c8bcb9e..d13af21b415f74fd4c738830f1795d04cc0ca4b7 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -517,6 +517,7 @@ struct FCMKLDNN : public PatternBase {
   // declare operator node's name
   PATTERN_DECL_NODE(fc);
   // declare variable node's name
+  PATTERN_DECL_NODE(input);
   PATTERN_DECL_NODE(weights);
   PATTERN_DECL_NODE(bias);
   PATTERN_DECL_NODE(output);
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
index b743dee8b18a9407c96e2c42342fd963eb949078..ca992558955fbce3702e792eb23271d426835e6d 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.cc
@@ -17,6 +17,7 @@
 #include <utility>
 #include <vector>
 #include "paddle/fluid/framework/eigen.h"
+#include "paddle/fluid/platform/errors.h"
 #include "paddle/fluid/string/pretty_log.h"
 
 namespace paddle {
@@ -43,6 +44,13 @@ void CPUQuantizePass::QuantizeInput(Graph* g, Node* op, Node* input,
                                     std::string input_name, double scale_to_one,
                                     bool is_unsigned,
                                     std::string scale_attr_name) const {
+  auto inputs = op->Op()->InputNames();
+  bool name_found =
+      std::find(inputs.begin(), inputs.end(), input_name) != inputs.end();
+  PADDLE_ENFORCE_EQ(
+      name_found, true,
+      platform::errors::InvalidArgument("%s isn't the input of the %s operator",
+                                        input_name, op->Op()->Type()));
   unsigned max = is_unsigned ? U8_MAX : S8_MAX;
   float scale = scale_to_one * max;
 
@@ -122,6 +130,13 @@ void CPUQuantizePass::DequantizeOutput(Graph* g, Node* op, Node* output,
                                        std::string output_name,
                                        double scale_to_one, bool is_unsigned,
                                        std::string scale_attr_name) const {
+  auto outputs = op->Op()->OutputNames();
+  bool name_found =
+      std::find(outputs.begin(), outputs.end(), output_name) != outputs.end();
+  PADDLE_ENFORCE_EQ(name_found, true,
+                    platform::errors::InvalidArgument(
+                        "%s isn't the output of the %s operator", output_name,
+                        op->Op()->Type()));
   unsigned max = is_unsigned ? U8_MAX : S8_MAX;
   float scale = scale_to_one * max;
 
@@ -228,6 +243,66 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
   PrettyLogDetail(msg_ss.str().c_str());
 }
 
+void CPUQuantizePass::QuantizeFc(Graph* graph) const {
+  GraphPatternDetector gpd;
+  auto pattern = gpd.mutable_pattern();
+  patterns::FCMKLDNN fc_pattern{pattern, name_scope_};
+  auto* fc_input = gpd.mutable_pattern()
+                       ->NewNode("fc_quantizer/input")
+                       ->AsInput()
+                       ->assert_is_op_input("fc", "Input");
+  fc_pattern(fc_input, false);
+
+  int quantize_fc_count = 0;
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    VLOG(4) << "Quantize fc op";
+    GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern);
+    auto* fc_op_desc = fc->Op();
+
+    // skip if should not be quantized
+    if (fc_op_desc->GetAttrIfExists<bool>("use_quantizer") != true ||
+        fc_op_desc->GetAttrIfExists<bool>("use_mkldnn") != true)
+      return;
+
+    GET_IR_NODE_FROM_SUBGRAPH(weights, weights, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(input, input, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(output, output, fc_pattern);
+
+    // get scales calculated after warmup, they scale variables to MAX=1.0
+    auto scales = Get<VarQuantScale>("quant_var_scales");
+
+    auto input_scale = scales[input->Name()].second.data<double>()[0];
+    bool is_input_unsigned = scales[input->Name()].first;
+    QuantizeInput(g, fc, input, "Input", input_scale, is_input_unsigned,
+                  "Scale_in");
+
+    auto weight_scale_tensor = scales[weights->Name()].second;
+    EigenVectorArrayMap eigen_tensor{weight_scale_tensor.data<double>(),
+                                     weight_scale_tensor.numel(), 1};
+    eigen_tensor *= static_cast<double>(S8_MAX);
+    std::vector<float> filter_scale{
+        weight_scale_tensor.data<double>(),
+        weight_scale_tensor.data<double>() + weight_scale_tensor.numel()};
+
+    fc->Op()->SetAttr("Scale_weights", filter_scale);
+
+    auto output_scale = scales[output->Name()].second.data<double>()[0];
+    bool is_output_unsigned = scales[output->Name()].first;
+    DequantizeOutput(g, fc, output, "Out", output_scale, is_output_unsigned,
+                     "Scale_out");
+
+    ++quantize_fc_count;
+  };
+
+  gpd(graph, handler);
+  AddStatis(quantize_fc_count);
+
+  std::stringstream msg_ss;
+  msg_ss << "---    quantized " << quantize_fc_count << " fc ops";
+  PrettyLogDetail(msg_ss.str().c_str());
+}
+
 void CPUQuantizePass::QuantizePool(Graph* graph) const {
   GraphPatternDetector gpd;
   auto pattern = gpd.mutable_pattern();
@@ -418,6 +493,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
   QuantizeConcat(graph);
   QuantizePriorBox(graph);
   QuantizeTranspose(graph);
+  QuantizeFc(graph);
 }
 
 }  // namespace ir
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
index d1b23227b6819ac964112f6aa06d5f986acb6b33..fd213d55be4fc7c6dc2552566f5d3b8091854bd4 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass.h
@@ -46,6 +46,8 @@ class CPUQuantizePass : public FusePassBase {
 
   void QuantizeConv(Graph* graph, bool with_residual_data = false) const;
 
+  void QuantizeFc(Graph* graph) const;
+
   void QuantizePool(Graph* graph) const;
 
   void QuantizeConcat(Graph* graph) const;
diff --git a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
index 00208c9bba296d6535025b3927a93daf1b459efa..923beb139099fae515eb74fb401d58c24b436117 100644
--- a/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
+++ b/paddle/fluid/framework/ir/mkldnn/cpu_quantize_pass_tester.cc
@@ -62,6 +62,10 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
     if (inputs.size() > 1) op->SetInput("W", {inputs[1]});
     if (inputs.size() > 2) op->SetInput("Bias", {inputs[2]});
     op->SetOutput("Out", {outputs[0]});
+    op->SetAttr("use_quantizer", use_quantizer);
+    op->SetAttr("Scale_in", 1.0f);
+    op->SetAttr("Scale_out", 1.0f);
+    op->SetAttr("Scale_weights", std::vector<float>{1.0f});
   } else if (type == "concat") {
     op->SetInput("X", inputs);
     op->SetOutput("Out", outputs);
@@ -71,13 +75,13 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
 
 namespace {
 static const std::initializer_list<std::string> variable_names{
-    "a", "w1", "c",  "d", "w2", "e",  "f", "g",
-    "h", "w3", "b1", "i", "j",  "w4", "b2"};
+    "a",  "w1", "c", "d", "w2", "e",  "f",  "g", "h",
+    "w3", "b1", "i", "j", "w4", "b2", "w5", "b3"};
 // (a,w1)->Conv1->c and c->Pool1->d
 //
 // (d,w2)->Conv2->e and e->Pool2->f
 //
-// d->Dropout1->g and g->Fc1->h and (h,w3,b1,i)->Conv3->j
+// d->Dropout1->g and (g, w5, b3)->Fc1->h and (h,w3,b1,i)->Conv3->j
 //
 // (d,w4, b2)->Conv4->i
 ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
@@ -98,7 +102,8 @@ ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
   SetOp(&prog, "pool2d", "Pool2", {"e"}, {"f"}, use_mkldnn, use_quantizer);
 
   SetOp(&prog, "dropout", "Dropout1", {"d"}, {"g"}, use_mkldnn);
-  SetOp(&prog, "fc", "Fc1", {"g"}, {"h"}, use_mkldnn);
+  SetOp(&prog, "fc", "Fc1", {"g", "w5", "b3"}, {"h"}, use_mkldnn,
+        use_quantizer);
   SetOp(&prog, "conv2d", "Conv3", {"h", "w3", "b1", "i"}, {"j"}, use_mkldnn,
         use_quantizer);
 
@@ -194,13 +199,13 @@ TEST(CpuQuantizePass, quantize) {
   // (d->QUANT3->IN3,w2)->Conv2->OUT3->DEQUANT3->e and
   // e->QUANT4->IN4->Pool2->OUT4->DEQUANT4->f
   //
-  // d->Dropout1->g and g->Fc1->h and
+  // d->Dropout1->g and (g->QUANT8->IN8,w5,b3)->Fc1->OUT7->DEQUANT7->h and
   // (h->QUANT5->IN5,w3,b1,i->QUANT6->IN6)->Conv3->OUT5->DEQUANT5->j
   //
   // (d->QUANT7->IN7,w4, b2)->Conv4->DEQUANT6->OUT6->i
-  // Insert nodes: 7 Quant + 7 IN + 6 OUT + 6 DEQUANT
-  int added_nodes = 7 + 7 + 6 + 6;
-  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 7, 6, added_nodes,
+  // Insert nodes: 8 Quant + 8 IN + 7 OUT + 7 DEQUANT
+  int added_nodes = 8 + 8 + 7 + 7;
+  MainTest(BuildProgramDesc(use_mkldnn, use_quantizer), 4, 2, 8, 7, added_nodes,
            2.0f * 127);
 }
 
diff --git a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
index 9cc2d3da3fceea06a7a753a88577fdff59a4a136..9b71e2abd759230dbebee9bf004d0f968bee7cc2 100644
--- a/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
+++ b/paddle/fluid/framework/ir/mkldnn/fc_mkldnn_pass.cc
@@ -26,12 +26,11 @@ namespace framework {
 namespace ir {
 
 void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
-  PADDLE_ENFORCE(graph);
+  PADDLE_ENFORCE_NOT_NULL(graph,
+                          platform::errors::InvalidArgument(
+                              "Pointer to graph argument should not be NULL."));
   Init("fc_mkldnn_pass", graph);
 
-  auto* scope = param_scope();
-  PADDLE_ENFORCE(scope);
-
   GraphPatternDetector gpd;
   auto* x = gpd.mutable_pattern()
                 ->NewNode("fc_mkldnn_pass/x")
@@ -49,18 +48,25 @@ void FCMKLDNNPass::ApplyImpl(ir::Graph* graph) const {
       return;
     }
     GET_IR_NODE_FROM_SUBGRAPH(fc, fc, fc_pattern);
+    GET_IR_NODE_FROM_SUBGRAPH(input, input, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(weights, weights, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(bias, bias, fc_pattern);
     GET_IR_NODE_FROM_SUBGRAPH(output, output, fc_pattern);
 
     OpDesc* desc = fc->Op();
-    auto in_size = fc->inputs[0]->Var()->GetShape().size();
-    if (in_size != 2 && in_size != 4) {
+    auto dims = fc->inputs[0]->Var()->GetShape();
+    auto dim_num = dims.size();
+    bool are_dims_supported = dim_num == 2 || dim_num == 4;
+    constexpr size_t height_axis = 2;
+    constexpr size_t width_axis = 3;
+    bool is_size_supported =
+        dim_num == 4 ? (dims[width_axis] == 1 && dims[height_axis] == 1) : true;
+    if (!are_dims_supported || !is_size_supported) {
       VLOG(3) << "Do not enable FC MKL-DNN for dimensions different than 2 & 4";
+      VLOG(3) << "Or when width and height are different than one";
       return;
     }
     desc->SetAttr("use_mkldnn", true);
-    PADDLE_ENFORCE(subgraph.count(x));
 
     found_fc_count++;
   };
diff --git a/paddle/fluid/inference/api/analysis_predictor_tester.cc b/paddle/fluid/inference/api/analysis_predictor_tester.cc
index e990b2c7736ae51a1ac2ba2fd15362012288b9bb..dea448f9b03468eabda16d4375ea60348a09efb2 100644
--- a/paddle/fluid/inference/api/analysis_predictor_tester.cc
+++ b/paddle/fluid/inference/api/analysis_predictor_tester.cc
@@ -276,7 +276,7 @@ class MkldnnQuantizerTest : public testing::Test {
 
   std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
       const framework::LoDTensor& var_tensor, bool is_unsigned) const {
-    return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned);
+    return mkldnn_quantizer->GetMaxChScalingFactor(var_tensor, is_unsigned, 0);
   }
 
   std::pair<bool, framework::LoDTensor> GetKLScalingFactor(
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.cc b/paddle/fluid/inference/api/mkldnn_quantizer.cc
index 94c556ce52d61258475e4e9cc497b23b073938fc..dc8559e324e047cdf4addaf8fdc04ec0949742d0 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.cc
@@ -37,6 +37,11 @@ using framework::LoDTensor;
 using framework::ir::Graph;
 using ConstEigenVectorArrayMap =
     Eigen::Map<const Eigen::Array<float, Eigen::Dynamic, 1>>;
+using EigenMatrixDoubleArray =
+    Eigen::Array<double, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+using EigenMatrixArray =
+    Eigen::Array<float, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
+using ConstEigenMatrixArrayMap = Eigen::Map<const EigenMatrixArray>;
 using string::PrettyLogH1;
 static LoDTensor CreateScaleTensor(int64_t channels_num = 1);
 
@@ -66,7 +71,7 @@ bool AnalysisPredictor::MkldnnQuantizer::CalculateScales() {
             bool is_unsigned = false;
             bool compute_scale = true;
             if (is_output) {
-              if (op->Type() == "conv2d") {
+              if (op->Type() == "conv2d" || op->Type() == "fc") {
                 // output of conv2d with relu must be unsigned
                 std::string fuse_activation =
                     op->GetAttrIfExists<std::string>("fuse_activation");
@@ -138,7 +143,12 @@ void AnalysisPredictor::MkldnnQuantizer::CalculateSingleScale(
       scales_[var_name] = GetMaxScalingFactor(var_tensor, is_unsigned);
       break;
     case ScaleAlgo::MAX_CH:
-      scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned);
+      scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned,
+                                                /*is_transposed*/ false);
+      break;
+    case ScaleAlgo::MAX_CH_T:
+      scales_[var_name] = GetMaxChScalingFactor(var_tensor, is_unsigned,
+                                                /*is_transposed*/ true);
       break;
     case ScaleAlgo::KL:
       scales_[var_name] = GetKLScalingFactor(var_tensor, is_unsigned);
@@ -319,7 +329,7 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxScalingFactor(
 
 std::pair<bool, LoDTensor>
 AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
-    const LoDTensor& var_tensor, bool is_unsigned) const {
+    const LoDTensor& var_tensor, bool is_unsigned, bool is_transposed) const {
   PADDLE_ENFORCE(var_tensor.dims().size() > 0, "Tensor dimension is empty.");
 
   ConstEigenVectorArrayMap eigen_tensor{var_tensor.data<float>(),
@@ -331,18 +341,23 @@ AnalysisPredictor::MkldnnQuantizer::GetMaxChScalingFactor(
         "Tensor is claimed to be unsigned, but its min value (%f) is < 0.0",
         min_val);
 
-  int channels = var_tensor.dims()[0];
-  LoDTensor scale_tensor = CreateScaleTensor(channels);
-  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
-
-  for (int i = 0; i < channels; ++i) {
-    const auto tensor = var_tensor.Slice(i, i + 1);
+  auto dims = var_tensor.dims();
+  constexpr int num_col_dims = 1;
+  auto flattened_dims = framework::flatten_to_2d(dims, num_col_dims);
+  ConstEigenMatrixArrayMap eigen_tensor_mat{
+      var_tensor.data<float>(), flattened_dims[0], flattened_dims[1]};
 
-    ConstEigenVectorArrayMap eigen_tensor{tensor.data<float>(), tensor.numel(),
-                                          1};
-    float max_abs = eigen_tensor.abs().maxCoeff();
-    scale_ptr[i] = 1.0 / max_abs;
+  EigenMatrixDoubleArray scales;
+  if (is_transposed) {
+    scales = 1.0 / eigen_tensor_mat.cast<double>().abs().colwise().maxCoeff();
+  } else {
+    scales = 1.0 / eigen_tensor_mat.cast<double>().abs().rowwise().maxCoeff();
   }
+  int output_channel_axis = is_transposed;
+  int channels = dims[output_channel_axis];
+  LoDTensor scale_tensor = CreateScaleTensor(channels);
+  auto* scale_ptr = scale_tensor.mutable_data<double>(CPUPlace());
+  std::copy(scales.data(), scales.data() + scales.size(), scale_ptr);
 
   return std::make_pair(is_unsigned, scale_tensor);
 }
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer.h b/paddle/fluid/inference/api/mkldnn_quantizer.h
index 6c438265f0b8e2a65c0475f0b11064042549269e..eeaba7952902b04e83cc5b6c890ecf510b914c65 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer.h
+++ b/paddle/fluid/inference/api/mkldnn_quantizer.h
@@ -79,7 +79,8 @@ class AnalysisPredictor::MkldnnQuantizer {
       const framework::LoDTensor& var_tensor, bool is_unsigned) const;
 
   std::pair<bool, framework::LoDTensor> GetMaxChScalingFactor(
-      const framework::LoDTensor& var_tensor, bool is_unsigned) const;
+      const framework::LoDTensor& var_tensor, bool is_unsigned,
+      bool is_transposed) const;
 
   std::pair<bool, framework::LoDTensor> GetMaxScalingFactor(
       const framework::LoDTensor& var_tensor, bool is_unsigned) const;
diff --git a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
index b7d6c87fd99890241cff5d3074584f8f74e3143a..4f477071ba616e4a8a36196ef8391367424bae0e 100644
--- a/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
+++ b/paddle/fluid/inference/api/mkldnn_quantizer_config.cc
@@ -37,6 +37,11 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
 
   rules_["transpose2"]["X"] = ScaleAlgo::KL;
   rules_["transpose2"]["Out"] = ScaleAlgo::NONE;
+
+  rules_["fc"]["Input"] = ScaleAlgo::KL;
+  rules_["fc"]["W"] = ScaleAlgo::MAX_CH_T;
+  rules_["fc"]["Bias"] = ScaleAlgo::NONE;
+  rules_["fc"]["Out"] = ScaleAlgo::KL;
 }
 
 ScaleAlgo MkldnnQuantizerConfig::scale_algo(
diff --git a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h
index d46f842de7a2277ee5d00672386b12af7ba28deb..2ac09b82138ecaf3663a3783633c9cbf50da16bd 100644
--- a/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h
+++ b/paddle/fluid/inference/api/paddle_mkldnn_quantizer_config.h
@@ -26,10 +26,12 @@ namespace paddle {
 
 // Algorithms for finding scale of quantized Tensors.
 enum class ScaleAlgo {
-  NONE,    // Do not compute scale
-  MAX,     // Find scale based on the maximum absolute value
-  MAX_CH,  // Find scale based on the maximum absolute value per channel
-  KL,      // Find scale based on KL Divergence
+  NONE,      // Do not compute scale
+  MAX,       // Find scale based on the max absolute value
+  MAX_CH,    // Find scale based on the max absolute value per output channel
+  MAX_CH_T,  // Find scale based on the max absolute value per output channel
+             // of a transposed tensor
+  KL,        // Find scale based on KL Divergence
 };
 
 struct MkldnnQuantizerConfig {
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 46ea4b6bb84c3cdf97c609230139b7dae98c7873..38af3149868a12bb6bca8842c8192ee672e63180 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -93,13 +93,21 @@ class FCOp : public framework::OperatorWithKernel {
       const framework::ExecutionContext& ctx) const override {
     framework::LibraryType library = framework::LibraryType::kPlain;
     framework::DataLayout layout = framework::DataLayout::kAnyLayout;
+    int customized_type_value =
+        framework::OpKernelType::kDefaultCustomizedTypeValue;
+    auto input_data_type =
+        OperatorWithKernel::IndicateVarDataType(ctx, "Input");
     if (ctx.Attr<bool>("use_mkldnn")) {
       library = framework::LibraryType::kMKLDNN;
       layout = framework::DataLayout::kMKLDNN;
+      using framework::proto::VarType;
+      customized_type_value = (input_data_type == VarType::INT8 ||
+                               input_data_type == VarType::UINT8)
+                                  ? kFCMKLDNNINT8
+                                  : kFCMKLDNNFP32;
     }
-    return framework::OpKernelType(
-        OperatorWithKernel::IndicateVarDataType(ctx, "Input"), ctx.GetPlace(),
-        layout, library);
+    return framework::OpKernelType(input_data_type, ctx.GetPlace(), layout,
+                                   library, customized_type_value);
   }
 };
 
@@ -132,6 +140,27 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
                   "Skip calling InferShape() function in the runtime.")
         .SetDefault(true);
+    /* int8 parameters */
+    AddAttr<bool>("use_quantizer",
+                  "(bool, default false) "
+                  "Set to true for operators that should be quantized and use "
+                  "int8 kernel. "
+                  "Only used on CPU.")
+        .SetDefault(false);
+    AddAttr<float>("Scale_in",
+                   "(float, default 1.0f), The quantize scale of input data")
+        .SetDefault(1.0f);
+    AddAttr<std::vector<float>>("Scale_weights",
+                                "(std::vector<float>, default {1.0f}), The "
+                                "quantize scale of weights data")
+        .SetDefault({1.0f});
+    AddAttr<float>("Scale_out",
+                   "(float, default 1.0f), The quantize scale of output data")
+        .SetDefault(1.0f);
+    AddAttr<bool>("force_fp32_output",
+                  "(bool, default false) Force INT8 kernel output FP32, only "
+                  "used in MKL-DNN INT8")
+        .SetDefault(false);
     AddComment(R"DOC(
 Fully Connected Operator.
 
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
index 54a81812c25707393619f6aae8e4b26ab6b0b5ef..907f61196d61b12106fc526cbcabf862f870b511 100644
--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
@@ -21,6 +21,7 @@ limitations under the License. */
 
 namespace paddle {
 namespace operators {
+enum { kFCMKLDNNFP32 = 1, kFCMKLDNNINT8 = 2 };
 
 using Tensor = framework::Tensor;
 
diff --git a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
index 1d451915a150c76c21f201c8d8d6f83cde5d11be..a9f8ed74c3da5faf9744e86ba0e349d30eeae980 100644
--- a/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/dequantize_mkldnn_op.cc
@@ -78,7 +78,6 @@ class DeQuantOpKernel : public framework::OpKernel<T> {
       auto dst_md = platform::MKLDNNMemDesc(
           {dst_tz}, memory::data_type::f32,
           platform::MKLDNNFormatForSize(dst_tz.size(), memory::format::nchw));
-
       auto dst_pd = mkldnn::memory::primitive_desc(dst_md, engine);
       dst_memory = std::make_shared<mkldnn::memory>(
           dst_pd, to_void_cast<float>(output_data));
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index dfaf47653fac51c1aa7d2150b80efe0726ef36eb..458d56e40239bfea50f27cd8ebe69f1c12be83fe 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -37,7 +37,7 @@ using mkldnn::primitive;
 using mkldnn::stream;
 using mkldnn::prop_kind;
 
-template <typename T>
+template <typename T_in, typename T_w, typename T_out>
 class FCPrimitiveFactory {
  public:
   explicit FCPrimitiveFactory(const mkldnn::engine& engine) : engine_(engine) {}
@@ -47,19 +47,29 @@ class FCPrimitiveFactory {
                                           const Tensor* bias, LoDTensor* output,
                                           const ExecutionContext& ctx) {
     RecomputeOutputDims(ctx, input, weights, output);
+    // If primitive has already been created and cached, don't create new one,
+    // but update input and output data pointers and return it.
     if (fc_) {
       UpdateDataPointers(ctx, output, input);
       return *fc_;
     }
-    auto src_desc = CreateMemDescriptor(input, input->format());
-    input_ = CreateMemory(src_desc, input);
+    auto src_desc = CreateMemDescriptor<T_in>(input, input->format());
+    input_ = CreateMemory<T_in>(src_desc, input);
 
+    // Since MKL-DNN doesn't support 4D column-major data formats in
+    // inner_product
+    // primitive, transpose the weights to be in row-major format
     weights_ = TransposeWeights(weights);
     if (src_desc.data.ndims == 4) {
       weights_ = CreateFourDimWeightsMemory(input, weights);
     }
+    // If int8 data type is desired, weights are quantized to signed int8
+    QuantizeWeights(ctx);
 
-    auto dst_desc = CreateMemDescriptor(output, MKLDNNMemoryFormat::any);
+    // Choose MKLDNNMemoryFormat::any so that MKL-DNN can determine itself what
+    // is the best format for output during the creation of inner product
+    // primitive descriptor
+    auto dst_desc = CreateMemDescriptor<T_out>(output, MKLDNNMemoryFormat::any);
 
     fc_ = CreateFcPrimitive(*input_, *weights_, dst_desc, bias, output, ctx);
     return *fc_;
@@ -68,14 +78,18 @@ class FCPrimitiveFactory {
  private:
   void UpdateDataPointers(const ExecutionContext& ctx, Tensor* out,
                           const Tensor* in) {
-    input_->set_data_handle(const_cast<T*>(in->data<T>()));
-    output_->set_data_handle(out->mutable_data<T>(ctx.GetPlace()));
+    input_->set_data_handle(to_void_cast(in->data<T_in>()));
+    output_->set_data_handle(out->mutable_data<T_out>(ctx.GetPlace()));
+    // If the primitive exists, but the output tensor has changed its
+    // variable, update its format to what has been determined in first
+    // call to CreateFcPrimitive method.
     if (out->format() == MKLDNNMemoryFormat::format_undef) {
       auto output_format = platform::GetMKLDNNFormat(*output_);
       out->set_format((MKLDNNMemoryFormat)output_format);
     }
   }
 
+  // Choose weight memory format based on input memory format
   MKLDNNMemoryFormat MatchWeightFormat(MKLDNNMemoryFormat fmt) {
     using format = MKLDNNMemoryFormat;
     switch (fmt) {
@@ -85,11 +99,14 @@ class FCPrimitiveFactory {
         return format::oIhw8i;
       case format::nchw:
         return format::oihw;
+      case format::nhwc:
+        return format::hwio;
       default:
         return format::format_undef;
     }
   }
 
+  // Convert data from one data format to another
   mkldnn::memory Reorder(const memory::desc& src_desc,
                          const memory::desc& dst_desc, const void* src_data) {
     auto src_mem = memory({src_desc, engine_}, const_cast<void*>(src_data));
@@ -101,18 +118,46 @@ class FCPrimitiveFactory {
     return dst_mem;
   }
 
+  // Convert data from one data format to another and rescale it.
+  // If the desired data type is (un)signed int8, quantization occurs here.
+  mkldnn::memory Reorder(const memory& src_mem,
+                         const memory::primitive_desc& dst_pd,
+                         const std::vector<float>& scale_data) {
+    mkldnn::memory dst_mem = mkldnn::memory(dst_pd);
+    mkldnn::primitive_attr attributes;
+    // According to MKL-DNN's documentation mask determines along which
+    // dimensions should the scale be applied.
+    // 0 - Single scale applied to whole tensor
+    // 1 - Apply Scale along a slice of each dimension which index is 1.
+    //     In case of weights quantization, that dimension is output,
+    //     becuase we perform per-output-channel quantization
+    int mask = CreateMask(0, scale_data.size() > 1);
+    attributes.set_output_scales(mask, scale_data);
+    auto reorder =
+        mkldnn::reorder(mkldnn::reorder::primitive_desc(
+                            src_mem.get_primitive_desc(), dst_pd, attributes),
+                        src_mem, dst_mem);
+
+    stream(stream::kind::eager).submit({reorder}).wait();
+
+    return dst_mem;
+  }
+
+  template <typename T>
   static mkldnn::memory::desc CreateMemDescriptor(const std::vector<int>& dims,
                                                   MKLDNNMemoryFormat format) {
     return platform::MKLDNNMemDesc(dims, platform::MKLDNNGetDataType<T>(),
                                    format);
   }
 
+  template <typename T>
   static mkldnn::memory::desc CreateMemDescriptor(const Tensor* tensor,
                                                   MKLDNNMemoryFormat format) {
     auto dims = framework::vectorize<int>(tensor->dims());
-    return CreateMemDescriptor(dims, format);
+    return CreateMemDescriptor<T>(dims, format);
   }
 
+  template <typename T>
   mkldnn::memory CreateMemory(const mkldnn::memory::desc& desc,
                               const Tensor* tensor) {
     return CreateMemory(desc, tensor->data<T>());
@@ -123,12 +168,102 @@ class FCPrimitiveFactory {
     return memory({desc, engine_}, const_cast<void*>(data));
   }
 
+  // Transpose weights through MKL-DNN's reorder from io to oi format.
   mkldnn::memory TransposeWeights(const Tensor* weights) {
     auto dims = framework::vectorize<int>(weights->dims());
     std::swap(dims[0], dims[1]);  // Correct output dimensions
-    auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::io);
-    auto dst_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oi);
-    return Reorder(src_desc, dst_desc, weights->data<T>());
+    auto src_desc = CreateMemDescriptor<float>(dims, MKLDNNMemoryFormat::io);
+    auto dst_desc = CreateMemDescriptor<float>(dims, MKLDNNMemoryFormat::oi);
+    return Reorder(src_desc, dst_desc, weights->data<float>());
+  }
+
+  // Compute the bias scales so that its values correspond to the
+  // scale of data being an output of weights and input multiplication
+  std::vector<float> ComputeBiasScales(const ExecutionContext& ctx) {
+    auto scale_in_data = ctx.Attr<float>("Scale_in");
+    auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
+    const size_t weight_scales_num = scale_weights_data.size();
+    std::vector<float> bias_scales(weight_scales_num);
+
+#pragma omp parallel for
+    for (size_t i = 0; i < weight_scales_num; i++) {
+      if (scale_weights_data[i] == 0.0)
+        bias_scales[i] = 1.0f;
+      else
+        bias_scales[i] = scale_in_data * scale_weights_data[i];
+    }
+
+    return bias_scales;
+  }
+
+  // Correct output scale, to take into account scaling of input and weights
+  // Since the data that comes out of input and weight multiplication is
+  // scaled with its own scales, this data needs to be divided by
+  // those scales to normalise them back to what their floating-point range
+  // was. Then we multiply them by desired output scale we want on the output.
+  std::vector<float> ComputeOutputShiftScale(const ExecutionContext& ctx) {
+    auto scale_in_data = ctx.Attr<float>("Scale_in");
+    auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
+    // If the output will be in floats, we don't multiply by scale_out.
+    auto scale_out_data = ctx.Attr<bool>("force_fp32_output")
+                              ? 1.0f
+                              : ctx.Attr<float>("Scale_out");
+    const size_t weight_scales_num = scale_weights_data.size();
+    std::vector<float> output_shift_scale(weight_scales_num);
+
+#pragma omp parallel for
+    for (size_t i = 0; i < weight_scales_num; i++) {
+      if (scale_weights_data[i] == 0.0)
+        output_shift_scale[i] = scale_out_data;
+      else
+        output_shift_scale[i] =
+            scale_out_data / (scale_in_data * scale_weights_data[i]);
+    }
+
+    return output_shift_scale;
+  }
+
+  // Computing MKL-DNN's scaling mask which determines along which dimension
+  // slice should the scaling be applied. For more data plase refer to:
+  // https://intel.github.io/mkl-dnn/group__c__api__attributes.html
+  // Section dnnl_status_t DNNL_API dnnl_primitive_attr_set_output_scales
+  int CreateMask(int slice_dimension, bool is_multi_channel_quantizied) {
+    return is_multi_channel_quantizied ? 1 << slice_dimension : 0;
+  }
+
+  void QuantizeWeights(const ExecutionContext& ctx) {
+    auto quantized_desc = weights_->get_primitive_desc().desc();
+    quantized_desc.data.data_type =
+        (mkldnn_data_type_t)platform::MKLDNNGetDataType<T_w>();
+    weights_ = Reorder(*weights_, {quantized_desc, engine_},
+                       ctx.Attr<std::vector<float>>("Scale_weights"));
+  }
+
+  void QuantizeBias(const inner_product_forward::primitive_desc& fc_prim_desc,
+                    const ExecutionContext& ctx) {
+    auto bias_scales = ComputeBiasScales(ctx);
+    bias_ = Reorder(*bias_, fc_prim_desc.bias_primitive_desc(), bias_scales);
+  }
+
+  // Fuse relu into FC with activation type attribute has been set to 'relu'
+  mkldnn::primitive_attr CreatePostOps(const ExecutionContext& ctx) {
+    mkldnn::primitive_attr attributes;
+    mkldnn::post_ops post_operations;
+
+    auto output_shift_scale = ComputeOutputShiftScale(ctx);
+    int mask = CreateMask(1, output_shift_scale.size() > 1);
+    attributes.set_output_scales(mask, output_shift_scale);
+
+    if (ctx.Attr<std::string>("activation_type") == "relu") {
+      constexpr float scale = 1.0f;
+      constexpr float negative_slope = 0.0f;
+      constexpr float placeholder = 1.0f;  // beta
+      post_operations.append_eltwise(scale, mkldnn::algorithm::eltwise_relu,
+                                     negative_slope, placeholder);
+    }
+
+    attributes.set_post_ops(post_operations);
+    return attributes;
   }
 
   inner_product_forward CreateFcPrimitive(const memory& src_memory,
@@ -136,21 +271,34 @@ class FCPrimitiveFactory {
                                           const memory::desc& dst_desc,
                                           const Tensor* bias, Tensor* output,
                                           const ExecutionContext& ctx) {
+    // Acquire descriptors needed for creation of inner_product primitive
+    // descriptor
     const auto weights_desc = weights_memory.get_primitive_desc().desc();
     const auto src_desc = src_memory.get_primitive_desc().desc();
+    // Based on provided attributes, create attributes used by MKL-DNN to
+    // enable fused post-op activations such as 'relu'
+    const auto attrs = CreatePostOps(ctx);
+    // If bias exists, create inner_product primitive with or without bias
     if (bias) {
-      auto bias_desc = CreateMemDescriptor(bias, bias->format());
-      bias_ = CreateMemory(bias_desc, bias);
+      auto bias_desc = CreateMemDescriptor<float>(bias, bias->format());
+      bias_ = CreateMemory<float>(bias_desc, bias);
+      // Create inner_product descriptor. At this point the format of output
+      // is determined.
       auto fc_prim_desc =
-          CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc);
+          CreateFcPrimDesc(src_desc, weights_desc, bias_desc, dst_desc, attrs);
+      // If int8 is desired, quantize bias into 32-bit signed int
+      QuantizeBias(fc_prim_desc, ctx);
 
+      // Based on format determined by inner_product, create output in desired
+      // memory format
       output_ = CreateDstMemory(fc_prim_desc, ctx, output);
 
+      // Return MKL-DNN primitive ready to be fed into pipeline and executed
       return inner_product_forward(fc_prim_desc, src_memory, weights_memory,
                                    *bias_, *output_);
     } else {
-      auto fc_prim_desc = CreateFcPrimDesc(src_desc, weights_desc, dst_desc);
-
+      auto fc_prim_desc =
+          CreateFcPrimDesc(src_desc, weights_desc, dst_desc, attrs);
       output_ = CreateDstMemory(fc_prim_desc, ctx, output);
 
       return inner_product_forward(fc_prim_desc, src_memory, weights_memory,
@@ -162,24 +310,39 @@ class FCPrimitiveFactory {
       const mkldnn::memory::desc& input_desc,
       const mkldnn::memory::desc& weights_desc,
       const mkldnn::memory::desc& bias_desc,
-      const mkldnn::memory::desc& dst_desc) {
+      const mkldnn::memory::desc& dst_desc,
+      const mkldnn::primitive_attr& attrs) {
     auto fc_desc =
         inner_product_forward::desc(prop_kind::forward_scoring, input_desc,
                                     weights_desc, bias_desc, dst_desc);
 
-    return inner_product_forward::primitive_desc(fc_desc, engine_);
+    return inner_product_forward::primitive_desc(fc_desc, attrs, engine_);
   }
 
   mkldnn::inner_product_forward::primitive_desc CreateFcPrimDesc(
       const mkldnn::memory::desc& input_desc,
       const mkldnn::memory::desc& weights_desc,
-      const mkldnn::memory::desc& dst_desc) {
+      const mkldnn::memory::desc& dst_desc,
+      const mkldnn::primitive_attr& attrs) {
     auto fc_desc = inner_product_forward::desc(prop_kind::forward, input_desc,
                                                weights_desc, dst_desc);
 
-    return inner_product_forward::primitive_desc(fc_desc, engine_);
+    return inner_product_forward::primitive_desc(fc_desc, attrs, engine_);
   }
 
+  // Since MKL-DNN requires the number of input dimensions to be
+  // equal to the number of weight dimensions, we have to convert
+  // weights to 4D memory if input is 4D. It also requires that
+  // all dimensions of weights and inputs agree, with an exception
+  // for the batch size and number of output channels (the first dim).
+  // In order to perform that we have to prepare the memory descriptor
+  // by hand, as MKL-DNN's reorder does not support conversion
+  // from one dimensionality to another. Hence, we set
+  // the first dimension of weights to resemble number of outputs
+  // and then we use the sizes of number of input channels as well
+  // as image width and height for latter dimensions. Then we create
+  // memories, find a format corresponding with input format and
+  // perform a converion.
   mkldnn::memory CreateFourDimWeightsMemory(const Tensor* input,
                                             const Tensor* weights) {
     auto input_dims = framework::vectorize<int>(input->dims());
@@ -187,19 +350,22 @@ class FCPrimitiveFactory {
     auto dims = {weight_dims[1], input_dims[1], input_dims[2], input_dims[3]};
 
     auto dst_format = MatchWeightFormat(input->format());
-    auto src_desc = CreateMemDescriptor(dims, MKLDNNMemoryFormat::oihw);
-    auto dst_desc = CreateMemDescriptor(dims, dst_format);
+    auto src_desc = CreateMemDescriptor<float>(dims, MKLDNNMemoryFormat::oihw);
+    auto dst_desc = CreateMemDescriptor<float>(dims, dst_format);
 
     return Reorder(src_desc, dst_desc, weights_->get_data_handle());
   }
 
+  // Create output memory based on output tensor and inner_product
+  // primitive descriptor format chosen for output
   mkldnn::memory CreateDstMemory(
       const mkldnn::inner_product_forward::primitive_desc& fc_prim_desc,
       const ExecutionContext& ctx, Tensor* output) {
     auto dst_prim_desc = fc_prim_desc.dst_primitive_desc();
     auto buffer_size = dst_prim_desc.get_size();
-    T* output_data = output->mutable_data<T>(ctx.GetPlace(), buffer_size);
-    memory dst_mem(dst_prim_desc, to_void_cast<T>(output_data));
+    T_out* output_data =
+        output->mutable_data<T_out>(ctx.GetPlace(), buffer_size);
+    memory dst_mem(dst_prim_desc, to_void_cast<T_out>(output_data));
     output->set_format(platform::GetMKLDNNFormat(dst_mem));
     return dst_mem;
   }
@@ -227,30 +393,63 @@ class FCPrimitiveFactory {
   boost::optional<inner_product_forward> fc_;
 };
 
-template <typename T>
-std::shared_ptr<FCPrimitiveFactory<T>> GetPrimitiveFactory(
-    const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx,
-    const Tensor* input, const Tensor* weights,
-    const mkldnn::engine& mkldnn_engine) {
+// Attempt to fetch cached primitive factory based on provided parameters
+// of input format, weight dimensions and output name.
+// If not cached, create a new one.
+template <typename T_in, typename T_w, typename T_out>
+static std::shared_ptr<FCPrimitiveFactory<T_in, T_w, T_out>>
+GetPrimitiveFactory(const MKLDNNDeviceContext& dev_ctx,
+                    const ExecutionContext& ctx, const Tensor* input,
+                    const Tensor* weights,
+                    const mkldnn::engine& mkldnn_engine) {
   const std::string key = platform::CreateKey(
+      platform::ThreadIDasStr(), input->format(),
       framework::vectorize<int>(weights->dims()), ctx.op().Output("Out"));
 
   auto prim_creator =
-      std::static_pointer_cast<FCPrimitiveFactory<T>>(dev_ctx.GetBlob(key));
+      std::static_pointer_cast<FCPrimitiveFactory<T_in, T_w, T_out>>(
+          dev_ctx.GetBlob(key));
   if (prim_creator == nullptr) {
-    prim_creator = std::make_shared<FCPrimitiveFactory<T>>(mkldnn_engine);
+    prim_creator =
+        std::make_shared<FCPrimitiveFactory<T_in, T_w, T_out>>(mkldnn_engine);
     dev_ctx.SetBlob(key, prim_creator);
   }
 
   return prim_creator;
 }
 
-template <typename T>
-class FCMKLDNNOpKernel : public framework::OpKernel<T> {
+// Choose appropriate primitive factory implementation based on inferred
+// output type (uint8, int8 or float).
+template <typename T_in, typename T_w>
+static inner_product_forward GetFcPrimitive(
+    const MKLDNNDeviceContext& dev_ctx, const ExecutionContext& ctx,
+    const LoDTensor* input, const Tensor* w, const Tensor* bias,
+    LoDTensor* output, const mkldnn::engine& mkldnn_engine, bool fuse_relu,
+    bool force_fp32_output) {
+  constexpr bool is_int8 =
+      std::is_same<T_in, int8_t>::value || std::is_same<T_in, uint8_t>::value;
+  if (!is_int8 || force_fp32_output) {
+    return GetPrimitiveFactory<T_in, T_w, float>(dev_ctx, ctx, input, w,
+                                                 mkldnn_engine)
+        ->CreateFcPrimitive(input, w, bias, output, ctx);
+  } else if (fuse_relu) {
+    return GetPrimitiveFactory<T_in, T_w, uint8_t>(dev_ctx, ctx, input, w,
+                                                   mkldnn_engine)
+        ->CreateFcPrimitive(input, w, bias, output, ctx);
+  } else {
+    return GetPrimitiveFactory<T_in, T_w, int8_t>(dev_ctx, ctx, input, w,
+                                                  mkldnn_engine)
+        ->CreateFcPrimitive(input, w, bias, output, ctx);
+  }
+}
+
+template <typename T_in, typename T_w>
+class FCMKLDNNOpKernel : public framework::OpKernel<T_in> {
  public:
   void Compute(const paddle::framework::ExecutionContext& ctx) const override {
-    PADDLE_ENFORCE(platform::is_cpu_place(ctx.GetPlace()),
-                   "It must use CPUPlace.");
+    PADDLE_ENFORCE_EQ(
+        platform::is_cpu_place(ctx.GetPlace()), true,
+        platform::errors::PreconditionNotMet("FC MKL-DNN must use CPUPlace."));
     auto& dev_ctx = ctx.template device_context<MKLDNNDeviceContext>();
     const auto& mkldnn_engine = dev_ctx.GetEngine();
 
@@ -259,9 +458,12 @@ class FCMKLDNNOpKernel : public framework::OpKernel<T> {
     auto bias = ctx.Input<Tensor>("Bias");
     auto output = ctx.Output<LoDTensor>("Out");
 
-    auto prim_creator =
-        GetPrimitiveFactory<T>(dev_ctx, ctx, input, w, mkldnn_engine);
-    auto fc = prim_creator->CreateFcPrimitive(input, w, bias, output, ctx);
+    bool fuse_relu = ctx.Attr<std::string>("activation_type") == "relu";
+    bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
+
+    auto fc =
+        GetFcPrimitive<T_in, T_w>(dev_ctx, ctx, input, w, bias, output,
+                                  mkldnn_engine, fuse_relu, force_fp32_output);
     stream(stream::kind::eager).submit({fc}).wait();
 
     output->set_layout(DataLayout::kMKLDNN);
@@ -270,5 +472,18 @@ class FCMKLDNNOpKernel : public framework::OpKernel<T> {
 }  // namespace operators
 }  // namespace paddle
 
-REGISTER_OP_KERNEL(fc, MKLDNN, ::paddle::platform::CPUPlace,
-                   paddle::operators::FCMKLDNNOpKernel<float>);
+// Weights of FC are by default stored using fp32, template argument of weight
+// data type implies their destination data type. (What's eventually going to
+// be used during computations of kernel).
+namespace ops = paddle::operators;
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, ::paddle::platform::CPUPlace,
+                                    FP32, ops::kFCMKLDNNFP32,
+                                    ops::FCMKLDNNOpKernel<float, float>);
+
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, ::paddle::platform::CPUPlace,
+                                    U8, ops::kFCMKLDNNINT8,
+                                    ops::FCMKLDNNOpKernel<uint8_t, int8_t>);
+
+REGISTER_OP_KERNEL_WITH_CUSTOM_TYPE(fc, MKLDNN, ::paddle::platform::CPUPlace,
+                                    S8, ops::kFCMKLDNNINT8,
+                                    ops::FCMKLDNNOpKernel<int8_t, int8_t>);
diff --git a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
index b69633c8cd4398858eee0d9caa2fc5e29942cf0e..fb54bf555434fc2df561f10544cb6376effab44c 100644
--- a/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
+++ b/python/paddle/fluid/tests/unittests/mkldnn/test_fc_mkldnn_op.py
@@ -42,6 +42,7 @@ class TestFCMKLDNNOp(OpTest):
 
     def setUp(self):
         self.op_type = "fc"
+        self._cpu_only = True
         self.use_mkldnn = True
         self.create_data()
         self.inputs = {'Input': self.matrix.input, 'W': self.matrix.weights}