diff --git a/paddle/fluid/framework/ir/fc_fuse_pass.cc b/paddle/fluid/framework/ir/fc_fuse_pass.cc
index 1cb42fbe43d750acb2c9f4911ddec0ffde4815ac..ed8128c33078906f9ba6616660ac4966525096d7 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass.cc
@@ -89,6 +89,35 @@ int FCFusePass::ApplyFCPattern(Graph* graph, bool with_relu) const {
     std::string activation_type = with_relu ? "relu" : "";
     desc.SetAttr("activation_type", activation_type);
 
+    // This is to add padding for dimension 128 on concern of MKL performance
+    auto* scope = param_scope();
+    auto* weight = scope->FindVar(w->Name())->GetMutable<LoDTensor>();
+    auto place = weight->place();
+    bool use_gpu = Get<bool>("use_gpu");
+    auto weight_data = weight->data<float>();
+    auto weight_dims = weight->dims();
+    int weight_num = product(weight_dims);
+    int w_h = weight_dims[0];
+    int w_w = weight_dims[1];
+    if (!use_gpu) {
+      if (w_h % 128 == 0 && w_w % 128 == 0) {
+        float* weight_data_tmp = new float[weight_num];
+        for (int i = 0; i < w_h; i++) {
+          memcpy(weight_data_tmp + i * w_w, weight_data + i * w_w,
+                 w_w * sizeof(float));
+        }
+        weight->Resize(DDim{weight_dims[0] + 4, weight_dims[1] + 4});
+        auto weight_data_new =
+            weight->mutable_data<float>(platform::CPUPlace());
+        for (int i = 0; i < w_h; i++) {
+          memcpy(weight_data_new + i * (w_w + 4), weight_data_tmp + i * w_w,
+                 w_w * sizeof(float));
+        }
+        delete[] weight_data_tmp;
+        desc.SetAttr("padding_weights", true);
+      }
+    }
+
     // For anakin subgraph int8
     // When in anakin subgraph int8 mode, the pattern like "fake_quant + mul +
     // fake_dequant" can be detected by the quant_dequant_fuse_pass. This pass
diff --git a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
index 320d28f131f03ed118614b5f97baa4397db0fcaa..dfae572d4634e43fb288f5cc21bf53efc3834f5e 100644
--- a/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
+++ b/paddle/fluid/framework/ir/fc_fuse_pass_tester.cc
@@ -21,6 +21,24 @@ namespace paddle {
 namespace framework {
 namespace ir {
 
+void AddVarToScope(Scope* param_scope, const std::string& name,
+                   const DDim& dims) {
+  auto* tensor = param_scope->Var(name)->GetMutable<LoDTensor>();
+  tensor->Resize(dims);
+  tensor->mutable_data<float>(platform::CPUPlace());
+}
+
+Scope* CreateParamScope() {
+  auto param_scope = new Scope();
+  AddVarToScope(param_scope, "conv2d_filters_0", {});
+  AddVarToScope(param_scope, "conv2d_bias_0", {});
+  AddVarToScope(param_scope, "weights_0", {});
+  AddVarToScope(param_scope, "weights_1", {});
+  AddVarToScope(param_scope, "bias_1", {});
+  AddVarToScope(param_scope, "bias_2", {});
+  return param_scope;
+}
+
 TEST(FCFusePass, basic) {
   // inputs                     operator            output
   // --------------------------------------------------------
@@ -50,6 +68,8 @@ TEST(FCFusePass, basic) {
 
   std::unique_ptr<ir::Graph> graph(new ir::Graph(layers.main_program()));
   auto pass = PassRegistry::Instance().Get("fc_fuse_pass");
+  pass->Set("use_gpu", new bool(true));
+  graph->Set("__param_scope__", CreateParamScope());
   int num_nodes_before = graph->Nodes().size();
   int num_mul_nodes_before = GetNumOpNodes(graph, "mul");
   VLOG(3) << DebugString(graph);
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 3fa907b418cfc6982ac6eb6c5c7077b32c050676..80b68ec155b7d603b12cffb42dccdaa8d6303a9c 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -147,6 +147,9 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("auto_config_layout",
                 new bool(argument->anakin_auto_config_layout()));
     }
+    if (pass_name == "fc_fuse_pass") {
+      pass->Set("use_gpu", new bool(argument->use_gpu()));
+    }
 
     pre_pass = pass_name;
 
diff --git a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
index f679e1221821a3ef32989127e01e6af67240fab8..5035f9b358718c4b3da445f82863c5d66e2dfbe6 100644
--- a/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
+++ b/paddle/fluid/inference/tests/api/analyzer_bert_tester.cc
@@ -153,7 +153,6 @@ void profile(bool use_mkldnn = false, bool use_ngraph = false) {
 
   if (use_mkldnn) {
     config.EnableMKLDNN();
-    config.pass_builder()->AppendPass("fc_mkldnn_pass");
   }
 
   if (use_ngraph) {
@@ -193,7 +192,6 @@ void compare(bool use_mkldnn = false, bool use_ngraph = false) {
   SetConfig(&cfg);
   if (use_mkldnn) {
     cfg.EnableMKLDNN();
-    cfg.pass_builder()->AppendPass("fc_mkldnn_pass");
   }
 
   if (use_ngraph) {
diff --git a/paddle/fluid/operators/fc_op.cc b/paddle/fluid/operators/fc_op.cc
index 5a3e1bb7fdabc6995e547264bd11f5f8cf0b7c25..46ea4b6bb84c3cdf97c609230139b7dae98c7873 100644
--- a/paddle/fluid/operators/fc_op.cc
+++ b/paddle/fluid/operators/fc_op.cc
@@ -32,17 +32,33 @@ class FCOp : public framework::OperatorWithKernel {
 
     auto in_dims = ctx->GetInputDim("Input");
     auto w_dims = ctx->GetInputDim("W");
+    bool padding_weights = ctx->Attrs().Get<bool>("padding_weights");
 
     if (ctx->HasInput("Bias")) {
       auto bias_dims = ctx->GetInputDim("Bias");
+      auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1];
       if (bias_dims.size() == 2) {
         PADDLE_ENFORCE_EQ(bias_dims[0], 1,
-                          "The shape of Bias must be [1, dim].");
-        PADDLE_ENFORCE_EQ(bias_dims[1], w_dims[1],
-                          "The shape of Bias must be [1, dim].");
+                          platform::errors::InvalidArgument(
+                              "The shape of Bias is invalid."
+                              "The height of Bias should be 1."
+                              "But received height of Bias is %d.",
+                              bias_dims[0]));
+        PADDLE_ENFORCE_EQ(
+            bias_dims[1], w_dims1,
+            platform::errors::InvalidArgument(
+                "The shape of Bias is invalid."
+                "The width of Bias should be equal to width of Weight."
+                "But received width of Bias is %d and width of Weight is %d.",
+                bias_dims[1], w_dims1));
       } else if (bias_dims.size() == 1) {
-        PADDLE_ENFORCE_EQ(bias_dims[0], w_dims[1],
-                          "The shape of Bias must be [1, dim].");
+        PADDLE_ENFORCE_EQ(
+            bias_dims[0], w_dims1,
+            platform::errors::InvalidArgument(
+                "The shape of Bias is invalid."
+                "The height of Bias should be equal to the width of weight."
+                "But received height of Bias is %d and width of Weight is %d.",
+                bias_dims[0], w_dims1));
       }
     }
 
@@ -65,7 +81,8 @@ class FCOp : public framework::OperatorWithKernel {
         "in_num_col_dims.");
 
     std::vector<int64_t> output_dims;
-    FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims);
+    FCOutputSize(in_dims, w_dims, output_dims, in_num_col_dims,
+                 padding_weights);
 
     ctx->SetOutputDim("Out", framework::make_ddim(output_dims));
     ctx->ShareLoD("Input", "Out");
@@ -107,6 +124,11 @@ class FCOpMaker : public framework::OpProtoAndCheckerMaker {
     AddAttr<bool>("use_mkldnn",
                   "(bool, default false) Only used in mkldnn kernel")
         .SetDefault(false);
+    AddAttr<bool>(
+        "padding_weights",
+        "(bool, default false) When padding weights in the fc fuse pass, "
+        "the 'padding_weights' attribute is set as true.")
+        .SetDefault(false);
     AddAttr<bool>(framework::kAllKernelsMustComputeRuntimeShape,
                   "Skip calling InferShape() function in the runtime.")
         .SetDefault(true);
diff --git a/paddle/fluid/operators/fc_op.h b/paddle/fluid/operators/fc_op.h
index cc133af7315eafaa2fa75d2cbfddb2def736a3f2..54a81812c25707393619f6aae8e4b26ab6b0b5ef 100644
--- a/paddle/fluid/operators/fc_op.h
+++ b/paddle/fluid/operators/fc_op.h
@@ -27,17 +27,21 @@ using Tensor = framework::Tensor;
 inline void FCOutputSize(const framework::DDim& in_dims,
                          const framework::DDim& w_dims,
                          std::vector<int64_t>& out_dims,  // NOLINT
-                         int in_num_col_dims) {
+                         int in_num_col_dims, bool padding_weights) {
   auto in_mat_dims = framework::flatten_to_2d(in_dims, in_num_col_dims);
-  PADDLE_ENFORCE_EQ(
-      in_mat_dims[1], w_dims[0],
-      "Fully Connected input and weigth size do not match. %s, %s");
+  auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0];
+  auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1];
+  PADDLE_ENFORCE_EQ(in_mat_dims[1], w_dims0,
+                    platform::errors::InvalidArgument(
+                        "Fully Connected input and weigth size do not match. "
+                        "input width: %d,weight height: %d",
+                        in_mat_dims[1], w_dims0));
 
   out_dims.reserve(static_cast<size_t>(in_num_col_dims + 1));
   for (int i = 0; i < in_num_col_dims; ++i) {
     out_dims.push_back(in_dims[i]);
   }
-  out_dims.push_back(w_dims[1]);
+  out_dims.push_back(w_dims1);
 }
 
 template <typename DeviceContext, typename T>
@@ -53,14 +57,18 @@ class FCOpKernel : public framework::OpKernel<T> {
         (ctx.Attr<std::string>("activation_type") == "relu") ? true : false;
 
     auto w_dims = w->dims();
+    bool padding_weights = ctx.Attr<bool>("padding_weights");
 
     std::vector<int64_t> output_dims;
-    FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims);
+    FCOutputSize(input->dims(), w_dims, output_dims, in_num_col_dims,
+                 padding_weights);
     output->Resize(framework::make_ddim(output_dims));
     output->set_lod(input->lod());
 
     auto out_dims = output->dims();
-    int M = framework::product(out_dims) / w_dims[1];
+    auto w_dims0 = padding_weights ? w_dims[0] - 4 : w_dims[0];
+    auto w_dims1 = padding_weights ? w_dims[1] - 4 : w_dims[1];
+    int M = framework::product(out_dims) / w_dims1;
 
     const T* input_data = input->data<T>();
     const T* w_data = w->data<T>();
@@ -68,8 +76,8 @@ class FCOpKernel : public framework::OpKernel<T> {
 
     auto& dev_ctx = ctx.template device_context<DeviceContext>();
     math::FCFunctor<DeviceContext, T> fc;
-    fc(dev_ctx, M, w_dims[1], w_dims[0], input_data, w_data, output_data,
-       bias ? bias->data<T>() : NULL, with_relu);
+    fc(dev_ctx, M, w_dims1, w_dims0, input_data, w_data, output_data,
+       bias ? bias->data<T>() : NULL, with_relu, padding_weights);
   }
 };
 
diff --git a/paddle/fluid/operators/math/fc.cc b/paddle/fluid/operators/math/fc.cc
index b5479a1b435682384e555c6607a097c9e0c82bd8..38acd7ba94817932ed4ea628a36703dbb8495de2 100644
--- a/paddle/fluid/operators/math/fc.cc
+++ b/paddle/fluid/operators/math/fc.cc
@@ -25,10 +25,53 @@ class FCFunctor<platform::CPUDeviceContext, T> {
  public:
   void operator()(const platform::CPUDeviceContext& context, const int M,
                   const int N, const int K, const T* X, const T* W, T* Y,
-                  const T* B = nullptr, bool relu = false) {
+                  const T* B = nullptr, bool relu = false,
+                  bool padding_weights = false) {
     auto blas = math::GetBlas<platform::CPUDeviceContext, T>(context);
-    blas.MatMul(M, N, K, X, W, Y);
+    framework::Tensor Y1;
+    T* Y1_data = nullptr;
+    if (N % 128 == 0 && K % 128 == 0) {
+      const int NN = N + 4;
+      const int KK = K + 4;
+      framework::Tensor X1;
+      T* X1_data = X1.Resize({M * KK}).mutable_data<T>(platform::CPUPlace());
+      Y1_data = Y1.Resize({M * (N + 4)}).mutable_data<T>(platform::CPUPlace());
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+      for (int i = 0; i < M; i++) {
+        memcpy(X1_data + i * KK, X + i * K, K * sizeof(X[0]));
+      }
+      framework::Tensor W1;
+      T* W1_data = nullptr;
+      if (!padding_weights) {
+        W1_data = W1.Resize({(K + 4) * (N + 4)})
+                      .mutable_data<T>(platform::CPUPlace());
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+        for (int i = 0; i < K; i++) {
+          memcpy(W1_data + i * NN, W + i * N, N * sizeof(W[0]));
+        }
+      }
+      blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), X1_data, KK,
+                (padding_weights ? W : W1_data), NN, static_cast<T>(0.0),
+                Y1_data, NN);
+    } else {
+      blas.MatMul(M, N, K, X, W, Y);
+    }
     if (B == NULL) {
+      if (N % 128 == 0 && K % 128 == 0) {
+#ifdef PADDLE_WITH_MKLML
+#pragma omp parallel for
+#endif
+        for (int i = 0; i < M; i++) {
+          memcpy(Y + i * N, Y1_data + i * (N + 4), N * sizeof(Y[0]));
+        }
+      }
+      PADDLE_ENFORCE_EQ(relu, false,
+                        platform::errors::PermissionDenied(
+                            "When bias is NULL, relu can not be true."));
       return;
     }
     if (relu) {
@@ -37,7 +80,8 @@ class FCFunctor<platform::CPUDeviceContext, T> {
               .At(N);
       for (int i = 0; i < M; i++) {
         T* dst = Y + i * N;
-        compute(B, dst, dst, N);
+        T* src = (N % 128 == 0 && K % 128 == 0) ? Y1_data + i * (N + 4) : dst;
+        compute(B, src, dst, N);
       }
     } else {
       auto compute =
@@ -48,7 +92,8 @@ class FCFunctor<platform::CPUDeviceContext, T> {
 #endif
       for (int i = 0; i < M; i++) {
         T* dst = Y + i * N;
-        compute(B, dst, dst, N);
+        T* src = (N % 128 == 0 && K % 128 == 0) ? Y1_data + i * (N + 4) : dst;
+        compute(B, src, dst, N);
       }
     }
   }
diff --git a/paddle/fluid/operators/math/fc.cu b/paddle/fluid/operators/math/fc.cu
index 1b22b81039954bfcf8ea0f6819d778d3fa126cab..82da2dd805aef99d603944ef0a3acc5ab0e64c19 100644
--- a/paddle/fluid/operators/math/fc.cu
+++ b/paddle/fluid/operators/math/fc.cu
@@ -41,7 +41,12 @@ class FCFunctor<platform::CUDADeviceContext, T> {
  public:
   void operator()(const platform::CUDADeviceContext& context, const int M,
                   const int N, const int K, const T* X, const T* W, T* Y,
-                  const T* B = nullptr, bool relu = false) {
+                  const T* B = nullptr, bool relu = false,
+                  bool padding_weights = false) {
+    PADDLE_ENFORCE_EQ(
+        padding_weights, false,
+        platform::errors::PermissionDenied(
+            "Weight padding in fc can not be used in GPU scope."));
     auto blas = math::GetBlas<platform::CUDADeviceContext, T>(context);
     blas.GEMM(false, false, M, N, K, static_cast<T>(1.0), X, K, W, N,
               static_cast<T>(0.0), Y, N);
diff --git a/paddle/fluid/operators/math/fc.h b/paddle/fluid/operators/math/fc.h
index 9bef496fb9d3977b286338a79f641fde514d8303..02f81587c739f2b47ef70a92f01d083c932deae3 100644
--- a/paddle/fluid/operators/math/fc.h
+++ b/paddle/fluid/operators/math/fc.h
@@ -26,7 +26,8 @@ class FCFunctor {
  public:
   void operator()(const DeviceContext& context, const int M, const int N,
                   const int K, const T* X, const T* W, T* Y,
-                  const T* B = nullptr, bool relu = false);
+                  const T* B = nullptr, bool relu = false,
+                  bool weight_pass = false);
 };
 
 }  // namespace math
diff --git a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
index 349dbffb386f84662b0ce8e6668126263d95b88d..dfaf47653fac51c1aa7d2150b80efe0726ef36eb 100644
--- a/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
+++ b/paddle/fluid/operators/mkldnn/fc_mkldnn_op.cc
@@ -207,8 +207,13 @@ class FCPrimitiveFactory {
   void RecomputeOutputDims(const ExecutionContext& ctx, const LoDTensor* input,
                            const Tensor* w, LoDTensor* output) {
     int in_num_col_dims = ctx.Attr<int>("in_num_col_dims");
+    bool padding_weights = ctx.Attr<bool>("padding_weights");
+    PADDLE_ENFORCE_EQ(padding_weights, false,
+                      platform::errors::PermissionDenied(
+                          "Weight padding in fc can not be used in MKLDNN."));
     std::vector<int64_t> output_dims;
-    FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims);
+    FCOutputSize(input->dims(), w->dims(), output_dims, in_num_col_dims,
+                 padding_weights);
     output->Resize(framework::make_ddim(output_dims));
     output->set_lod(input->lod());
   }
diff --git a/python/paddle/fluid/tests/unittests/test_fc_op.py b/python/paddle/fluid/tests/unittests/test_fc_op.py
index 0da0fd0789a770db20dd219273d5c21ec7572e29..9028210b8fe9cc7ee4b6af3598ea0bc79fa2cde0 100644
--- a/python/paddle/fluid/tests/unittests/test_fc_op.py
+++ b/python/paddle/fluid/tests/unittests/test_fc_op.py
@@ -124,6 +124,13 @@ class TestFCOpWithBias3(TestFCOp):
         self.matrix = MatrixGenerate(1, 64, 32, 3, 3, 1)
 
 
+class TestFCOpWithPadding(TestFCOp):
+    def config(self):
+        self.with_bias = True
+        self.with_relu = True
+        self.matrix = MatrixGenerate(1, 4, 3, 128, 128, 2)
+
+
 class TestFCOpError(OpTest):
     def test_errors(self):
         with program_guard(Program(), Program()):