Unverified commit cfa513f7, authored by YangQun and committed by GitHub

[ONEDNN] Upgrade oneDNN version to v3.1 (#52463)

* squash pick the poc code
* fix build after rebase
* fix int8 conv and fc uts
* Fix and clean-up Get_SRC_Scale_Memory
* fix floating point fc uts
* fix test_analyzer_int8_googlenet
* fix test_analyzer_int8_mobilenetv1
* fix int8 mobilenet v2 and v3
* fix build error after rebase
* [oneDNN] rename library version
* fix conv bias datatype
* try to fix import error
* fix rebase error
* [oneDNN] pack library into python wheel
* add MKLDNN_SHARED_LIB_3 to env_dict
* fix test_analyzer_bert
* fix fill_constant op kernel
* fix ernie and matmul op ut
* fix softplus ut
* fix conv+relu6 fusion ut
* fix hardswish fusion
* fix quant+transpose fusion ut
* fix sgd ut
* fix int8 matmul with flatten
* fix fc+scale fusion
* fix conv/matmul+gelu fusion uts
* fix rebase error
* Revert "fix conv/matmul+gelu fusion uts"
This reverts commit 47eb5e49972bd8f7271a233def9bfb3e98ce78e1.
* upgrade to onednn v3.1
* remove older version onednn
* use densetensor::data() for achieving mean and var in layernorm impl
* comments for atol of integer tests
* fix clang-format
* Revert "remove older version onednn"
This reverts commit 783e57ddfd4401254596eae7d47adb9b03590c09.
* improve binary handle
* fix expand kernel
* Revert "use densetensor::data() for achieving mean and var in layernorm impl"
* always use forward_inference for conv
* remove activation scales
* rollback changes to mkldnn.cmake
* address comments
* port changes to dequantize kernel
* fix merge error
* fix fused_elementwise_kernel
* upgrade onednn version to v3.1.1
* fix some approval error
* fix error msg format
* remove old onednn libs
* try to fix symbolic link issue
* fix cinn test case segfault
* do not explicit link test with onednn
* remove unnecessary changes
* integrate CINN with onednn v3
* link with mkldnn project
* fix cinn build file

---------
Co-authored-by: Tomasz Socha <tomasz.socha@intel.com>
Co-authored-by: Chen, Xinyu1 <xinyu1.chen@intel.com>
Co-authored-by: tianshuo78520a <707759223@qq.com>
Parent be3a6fa7
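The recurring change in this patch is oneDNN v3's quantization API: the v2 `set_output_scales` attribute is gone, scales are now declared per argument with `set_scales_mask`, and the actual values are supplied as runtime memories at execution time. A minimal, hedged sketch of that pattern follows; the helper name and scale value are illustrative and not taken from the patch.

#include <unordered_map>
#include "dnnl.hpp"

// Illustrative sketch of the v2 -> v3 scale-attribute migration used
// throughout this patch (helper name and value are made up here).
void set_runtime_src_scale(const dnnl::engine &engine,
                           dnnl::primitive_attr &attrs,
                           std::unordered_map<int, dnnl::memory> &exec_args,
                           float src_scale) {
  // v2 (removed everywhere below): attrs.set_output_scales(/*mask=*/0, {src_scale});
  // v3: only the mask is fixed before the primitive descriptor is created...
  attrs.set_scales_mask(DNNL_ARG_SRC, 0);
  // ...and the value travels as an extra f32 memory at execute() time.
  dnnl::memory::desc scale_md({1}, dnnl::memory::data_type::f32,
                              dnnl::memory::format_tag::x);
  dnnl::memory scale_mem(scale_md, engine);
  *static_cast<float *>(scale_mem.get_data_handle()) = src_scale;
  exec_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scale_mem});
}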
......@@ -174,8 +174,8 @@ if(WITH_MKL)
target_link_libraries(cinnapi cinn_mklml)
add_dependencies(cinnapi cinn_mklml)
if(WITH_MKLDNN)
target_link_libraries(cinnapi mkldnn)
add_dependencies(cinnapi mkldnn)
target_link_libraries(cinnapi ${MKLDNN_LIB})
add_dependencies(cinnapi ${MKLDNN_PROJECT})
endif()
endif()
......@@ -224,8 +224,8 @@ function(gen_cinncore LINKTYPE)
target_link_libraries(${CINNCORE_TARGET} cinn_mklml)
add_dependencies(${CINNCORE_TARGET} cinn_mklml)
if(WITH_MKLDNN)
target_link_libraries(${CINNCORE_TARGET} mkldnn)
add_dependencies(${CINNCORE_TARGET} mkldnn)
target_link_libraries(${CINNCORE_TARGET} ${MKLDNN_LIB})
add_dependencies(${CINNCORE_TARGET} ${MKLDNN_PROJECT})
endif()
endif()
......
# Copyright (c) 2017 PaddlePaddle Authors. All Rights Reserved.
# Copyright (c) 2017-2023 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
......@@ -21,7 +21,6 @@ set(MKLDNN_INC_DIR
"${MKLDNN_INSTALL_DIR}/include"
CACHE PATH "mkldnn include directory." FORCE)
set(SOURCE_DIR ${PADDLE_SOURCE_DIR}/third_party/mkldnn)
set(MKLDNN_TAG 2089770c4818be8933c5e9d1dd3cbaeba1457667)
# Introduce variables:
# * CMAKE_INSTALL_LIBDIR
......@@ -128,16 +127,12 @@ if(WIN32)
VERBATIM)
add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_LIB})
else()
set(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libmkldnn.so.0)
set(MKLDNN_SHARED_LIB_1 ${MKLDNN_INSTALL_DIR}/libdnnl.so.1)
set(MKLDNN_SHARED_LIB_2 ${MKLDNN_INSTALL_DIR}/libdnnl.so.2)
set(MKLDNN_SHARED_LIB ${MKLDNN_INSTALL_DIR}/libdnnl.so.3)
add_custom_command(
OUTPUT ${MKLDNN_SHARED_LIB_2}
OUTPUT ${MKLDNN_SHARED_LIB}
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB}
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_1}
COMMAND ${CMAKE_COMMAND} -E copy ${MKLDNN_LIB} ${MKLDNN_SHARED_LIB_2}
DEPENDS ${MKLDNN_PROJECT})
add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB_2})
add_custom_target(mkldnn_cmd ALL DEPENDS ${MKLDNN_SHARED_LIB})
endif()
# generate a static dummy target to track mkldnn dependencies
......
......@@ -130,17 +130,9 @@ function(copy_part_of_thrid_party TARGET DST)
add_custom_command(
TARGET ${TARGET}
POST_BUILD
COMMAND strip -s ${dst_dir}/lib/libmkldnn.so.0
COMMENT "striping libmkldnn.so.0")
COMMAND strip -s ${dst_dir}/lib/libdnnl.so.3
COMMENT "striping libdnnl.so.3")
endif()
add_custom_command(
TARGET ${TARGET}
POST_BUILD
COMMAND ${CMAKE_COMMAND} -E create_symlink libmkldnn.so.0
${dst_dir}/lib/libdnnl.so.1
COMMAND ${CMAKE_COMMAND} -E create_symlink libmkldnn.so.0
${dst_dir}/lib/libdnnl.so.2
COMMENT "Make a symbol link of libmkldnn.so.0")
endif()
endif()
......
......@@ -58,9 +58,13 @@ void cinn_cpu_mkldnn_softmax_fp32(int batch,
auto src_mem =
memory(src_md, engine, reinterpret_cast<float*>(inputs->memory));
auto dst_mem = memory(src_md, engine, reinterpret_cast<float*>(out->memory));
auto softmax_d = dnnl::softmax_forward::desc(
dnnl::prop_kind::forward_inference, src_md, axis);
auto softmax_pd = dnnl::softmax_forward::primitive_desc(softmax_d, engine);
auto softmax_pd =
dnnl::softmax_forward::primitive_desc(engine,
dnnl::prop_kind::forward_inference,
dnnl::algorithm::softmax_accurate,
src_md,
src_md,
axis);
auto softmax_prim = dnnl::softmax_forward(softmax_pd);
softmax_prim.execute(engine_stream,
......@@ -117,19 +121,17 @@ void cinn_cpu_mkldnn_conv2d_nchw_fp32(int batch_size,
auto conv_weights_md = memory::desc({conv_weights_tz}, dt::f32, tag::any);
auto conv_dst_md = memory::desc({conv_dst_tz}, dt::f32, tag::nchw);
auto conv_desc =
dnnl::convolution_forward::desc(dnnl::prop_kind::forward_inference,
dnnl::algorithm::convolution_direct,
conv_src_md,
conv_weights_md,
conv_dst_md,
conv_strides,
conv_dilations,
conv_paddings,
conv_paddings);
auto conv_prim_desc =
dnnl::convolution_forward::primitive_desc(conv_desc, cpu_engine);
auto conv_prim_desc = dnnl::convolution_forward::primitive_desc(
cpu_engine,
dnnl::prop_kind::forward_inference,
dnnl::algorithm::convolution_direct,
conv_src_md,
conv_weights_md,
conv_dst_md,
conv_strides,
conv_dilations,
conv_paddings,
conv_paddings);
auto conv_src_memory = conv_user_src_memory;
auto conv_weights_memory = conv_user_weights_memory;
......
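The softmax and convolution changes above show the other recurring v3 pattern: the separate operation descriptors are gone and the primitive descriptor is built directly from the engine, with an explicit algorithm and separate src/dst descriptors. A minimal sketch, assuming an existing engine and src_md (the helper name is illustrative):

#include "dnnl.hpp"

// Illustrative only: oneDNN v3 builds the primitive descriptor in one call.
dnnl::softmax_forward make_inference_softmax(const dnnl::engine &engine,
                                             const dnnl::memory::desc &src_md,
                                             int axis) {
  // v2 (removed): softmax_forward::desc(prop_kind, src_md, axis) + pd(desc, engine)
  auto pd = dnnl::softmax_forward::primitive_desc(
      engine,
      dnnl::prop_kind::forward_inference,
      dnnl::algorithm::softmax_accurate,  // algorithm is now explicit
      src_md,                             // source layout
      src_md,                             // destination keeps the same layout here
      axis);
  return dnnl::softmax_forward(pd);
}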
......@@ -185,7 +185,6 @@ if(WITH_MKLDNN)
pass_library(elementwise_act_onednn_fuse_pass inference DIR mkldnn)
pass_library(matmul_elementwise_add_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(matmul_activation_mkldnn_fuse_pass inference DIR mkldnn)
pass_library(layer_norm_onednn_optimization_pass inference DIR mkldnn)
pass_library(operator_scale_onednn_fuse_pass inference DIR mkldnn)
pass_library(quant_transpose2_dequant_onednn_fuse_pass inference DIR mkldnn)
pass_library(squeeze2_transpose2_onednn_fuse_pass inference DIR mkldnn)
......
......@@ -942,29 +942,6 @@ PDNode *patterns::ConvBN::operator()(paddle::framework::ir::PDNode *conv_input,
return bn_out_var;
}
PDNode *patterns::LayerNormShiftScale::operator()() {
auto layer_norm_in = pattern->NewNode(layer_norm_in_repr())
->AsInput()
->assert_is_op_input("layer_norm", "X");
auto layer_norm_bias = pattern->NewNode(layer_norm_bias_repr())
->AsInput()
->assert_is_op_input("layer_norm", "Bias");
auto layer_norm_scale = pattern->NewNode(layer_norm_scale_repr())
->AsInput()
->assert_is_op_input("layer_norm", "Scale");
auto layer_norm_op =
pattern->NewNode(layer_norm_op_repr())->assert_is_op("layer_norm");
auto layer_norm_out = pattern->NewNode(layer_norm_out_repr())
->assert_is_op_output("layer_norm", "Y")
->AsOutput();
layer_norm_op->LinksFrom({layer_norm_in, layer_norm_bias, layer_norm_scale})
.LinksTo({layer_norm_out});
return layer_norm_out;
}
PDNode *patterns::OperatorActivation::operator()(
const std::string &operator_type, const std::string &activation_type) {
auto *preceding_op =
......
......@@ -526,19 +526,6 @@ struct ConvBN : public PatternBase {
PATTERN_DECL_NODE(bn_saved_variance);
};
struct LayerNormShiftScale : public PatternBase {
LayerNormShiftScale(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "layer_norm_shift_scale") {}
PDNode* operator()();
PATTERN_DECL_NODE(layer_norm_in);
PATTERN_DECL_NODE(layer_norm_op);
PATTERN_DECL_NODE(layer_norm_bias);
PATTERN_DECL_NODE(layer_norm_scale);
PATTERN_DECL_NODE(layer_norm_out);
};
struct OperatorActivation : public PatternBase {
OperatorActivation(PDPattern* pattern, const std::string& name_scope)
: PatternBase(pattern, name_scope, "operator_activation") {}
......
......@@ -42,7 +42,7 @@ inline std::unordered_map<std::string, std::string> GetAttributeMap(
if (act_type == "swish") {
attr_map.emplace("beta", "fuse_alpha");
} else if (act_type == "relu6") {
attr_map.emplace("threshold", "fuse_alpha");
attr_map.emplace("threshold", "fuse_beta");
} else if (act_type == "hard_sigmoid") {
attr_map.emplace("slope", "fuse_alpha");
attr_map.emplace("offset", "fuse_beta");
......@@ -73,6 +73,11 @@ inline void SetActivationAttrs(paddle::framework::OpDesc* fused_op,
}
}
if (act_type == "hard_swish") {
fused_op->SetAttr("fuse_alpha", 1.f / 6.f);
fused_op->SetAttr("fuse_beta", 1.f / 2.f);
}
if (act_type == "gelu" && act_op->HasAttr("approximate")) {
std::string gelu_act_type =
PADDLE_GET_CONST(bool, act_op->GetAttr("approximate")) ? "gelu_tanh"
......
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/framework/ir/mkldnn/layer_norm_onednn_optimization_pass.h"
#include "paddle/fluid/framework/convert_utils.h"
#include "paddle/fluid/framework/op_version_registry.h"
#include "paddle/phi/backends/onednn/onednn_reuse.h"
#include "paddle/utils/string/pretty_log.h"
namespace paddle {
namespace framework {
namespace ir {
using string::PrettyLogDetail;
void LayerNormOneDNNOptimizationPass::ApplyImpl(Graph *graph) const {
PADDLE_ENFORCE_NOT_NULL(
graph, platform::errors::InvalidArgument("Graph cannot be nullptr."));
FusePassBase::Init("layer_norm_onednn_optimization_pass", graph);
GraphPatternDetector gpd;
patterns::LayerNormShiftScale layer_norm_shift_scale_pattern(
gpd.mutable_pattern(), "layer_norm_onednn_optimization_pass");
layer_norm_shift_scale_pattern();
int found_layer_norm = 0;
auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
Graph *g) {
GET_IR_NODE_FROM_SUBGRAPH(
layer_norm_op, layer_norm_op, layer_norm_shift_scale_pattern);
GET_IR_NODE_FROM_SUBGRAPH(
layer_norm_bias, layer_norm_bias, layer_norm_shift_scale_pattern);
GET_IR_NODE_FROM_SUBGRAPH(
layer_norm_scale, layer_norm_scale, layer_norm_shift_scale_pattern);
if (layer_norm_op->Op()->HasAttr("use_mkldnn") &&
!(PADDLE_GET_CONST(bool, layer_norm_op->Op()->GetAttr("use_mkldnn")))) {
VLOG(4) << "Only oneDNN version of layer_norm can be optimized to "
"include Bias and Shift in a single tensor.";
return;
}
auto *scope = param_scope();
auto ln_bias_name = layer_norm_op->Op()->Input("Bias");
auto ln_scale_name = layer_norm_op->Op()->Input("Scale");
auto *ln_bias_tensor =
scope->FindVar(ln_bias_name[0])->GetMutable<phi::DenseTensor>();
auto *ln_scale_tensor =
scope->FindVar(ln_scale_name[0])->GetMutable<phi::DenseTensor>();
const int channels = ln_bias_tensor->dims()[0];
VarDesc scale_shift_desc(patterns::PDNodeName(
"layer_norm_onednn_optimization_pass", "ScaleShift"));
scale_shift_desc.SetShape({channels * 2});
scale_shift_desc.SetDataType(
framework::TransToProtoVarType(ln_bias_tensor->dtype()));
scale_shift_desc.SetPersistable(true);
auto scale_shift_node = g->CreateVarNode(&scale_shift_desc);
auto *scale_shift_tensor =
scope->Var(scale_shift_node->Name())->GetMutable<phi::DenseTensor>();
scale_shift_tensor->Resize(phi::make_ddim({channels * 2}));
memcpy(scale_shift_tensor->mutable_data<float>(phi::CPUPlace()),
ln_scale_tensor->data<float>(),
channels * sizeof(float));
memcpy(scale_shift_tensor->data<float>() + channels,
ln_bias_tensor->data<float>(),
channels * sizeof(float));
layer_norm_op->Op()->SetInput("ScaleShift", {scale_shift_node->Name()});
IR_NODE_LINK_TO(scale_shift_node, layer_norm_op);
found_layer_norm++;
};
gpd(graph, handler);
AddStatis(found_layer_norm);
if ((!Has("disable_logs") || !Get<bool>("disable_logs")) &&
found_layer_norm > 0)
PrettyLogDetail("--- optimized %d layer_norms by merging Scale and Bias",
found_layer_norm);
}
} // namespace ir
} // namespace framework
} // namespace paddle
REGISTER_PASS(layer_norm_onednn_optimization_pass,
paddle::framework::ir::LayerNormOneDNNOptimizationPass);
REGISTER_PASS_CAPABILITY(layer_norm_onednn_optimization_pass)
.AddCombination(
paddle::framework::compatible::OpVersionComparatorCombination().GE(
"layer_norm", 0));
// Copyright (c) 2022 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/framework/ir/fuse_pass_base.h"
#include "paddle/fluid/framework/ir/graph.h"
namespace paddle {
namespace framework {
namespace ir {
class LayerNormOneDNNOptimizationPass : public FusePassBase {
public:
virtual ~LayerNormOneDNNOptimizationPass() {}
protected:
void ApplyImpl(Graph *graph) const override;
};
} // namespace ir
} // namespace framework
} // namespace paddle
......@@ -158,7 +158,7 @@ if(WITH_MKL)
if(WIN32)
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
else()
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libdnnl.so.3)
endif()
endif()
else()
......
......@@ -601,10 +601,7 @@ void AnalysisPredictor::MkldnnQuantizer::PrepareArgument() const {
arg->main_graph().SetNotOwned(framework::ir::kParamScopeAttr, scope_ptr);
auto* builder = predictor_.config_.pass_builder();
builder->SetPasses({"cpu_quantize_pass",
"cpu_quantize_squash_pass",
"int8_scale_calculation_mkldnn_pass",
"params_quantization_mkldnn_pass"});
builder->SetPasses({"cpu_quantize_pass", "cpu_quantize_squash_pass"});
if (predictor_.config_.ir_debug_) builder->TurnOnDebug();
auto passes = builder->AllPasses();
predictor_.argument_->SetIrAnalysisPasses(passes);
......
......@@ -371,7 +371,6 @@ void CpuPassStrategy::EnableMKLDNN() {
"softplus_activation_onednn_fuse_pass", //
"shuffle_channel_mkldnn_detect_pass", //
"elementwise_act_onednn_fuse_pass", //
"layer_norm_onednn_optimization_pass", //
"operator_scale_onednn_fuse_pass", //
"operator_unsqueeze2_onednn_fuse_pass", //
"operator_reshape2_onednn_fuse_pass", //
......@@ -465,7 +464,6 @@ void CpuPassStrategy::EnableMkldnnInt8() {
passes_.push_back("scale_matmul_fuse_pass");
passes_.push_back("reshape_transpose_matmul_mkldnn_fuse_pass");
passes_.push_back("matmul_elementwise_add_mkldnn_fuse_pass");
passes_.push_back("layer_norm_onednn_optimization_pass");
passes_.push_back("operator_scale_onednn_fuse_pass");
passes_.push_back("operator_unsqueeze2_onednn_fuse_pass");
passes_.push_back("operator_reshape2_onednn_fuse_pass");
......@@ -473,8 +471,6 @@ void CpuPassStrategy::EnableMkldnnInt8() {
passes_.push_back("cpu_quantize_pass");
passes_.push_back("cpu_quantize_squash_pass");
passes_.push_back("quant_transpose2_dequant_onednn_fuse_pass");
passes_.push_back("int8_scale_calculation_mkldnn_pass");
passes_.push_back("params_quantization_mkldnn_pass");
}
use_mkldnn_int8_ = true;
#else
......
......@@ -81,7 +81,7 @@ class RNNMKLDNNHandler : public phi::funcs::OneDNNHandlerT<T, T_alg> {
bool is_NTC(const dnnl::memory::desc& md) {
auto ntc_md = dnnl::memory::desc(
md.dims(), md.data_type(), dnnl::memory::format_tag::ntc);
md.get_dims(), md.get_data_type(), dnnl::memory::format_tag::ntc);
return md == ntc_md;
}
......
......@@ -198,7 +198,8 @@ class MultiGRUHandler {
: OneDNNGetDataType<T>(),
OneDNNMemoryFormat::ntc);
auto desc = std::make_shared<dnnl::gru_forward::desc>(
pd = std::make_shared<dnnl::gru_forward::primitive_desc>(
engine_,
dnnl::prop_kind::forward_inference,
dir,
x_md,
......@@ -207,9 +208,8 @@ class MultiGRUHandler {
wh_md,
b_md,
h_md,
dnnl::memory::desc());
pd = std::make_shared<dnnl::gru_forward::primitive_desc>(
*desc, attrs_[2 * layer + (dir == R2L)], engine_);
dnnl::memory::desc(),
attrs_[2 * layer + (dir == R2L)]);
PADDLE_ENFORCE_NOT_NULL(
pd,
platform::errors::InvalidArgument(
......@@ -234,7 +234,7 @@ class MultiGRUHandler {
std::vector<dnnl::memory::desc> src_mds{in_md, in_md};
pd = std::make_shared<dnnl::concat::primitive_desc>(
axis, src_mds, engine_);
engine_, axis, src_mds);
dev_ctx_.SetBlob(pd_key, pd);
}
concat_pds_[layer] = pd;
......@@ -612,7 +612,7 @@ class MultiGRUHandler {
bool isNTC(const dnnl::memory::desc& md) {
auto ntc_md = dnnl::memory::desc(
md.dims(), md.data_type(), dnnl::memory::format_tag::ntc);
md.get_dims(), md.get_data_type(), dnnl::memory::format_tag::ntc);
return md == ntc_md;
}
......
......@@ -323,9 +323,7 @@ phi::KernelKey GetPad3dExpectedKernelType(
// only constant mode and non-blocked layouts are supported for oneDNN
if (op_ptr->CanMKLDNNBeUsed(ctx, input_data_type) &&
ctx.Attr<std::string>("mode") == "constant" &&
ctx.Input<phi::DenseTensor>("X")
->mem_desc()
.data.format_desc.blocking.inner_nblks == 0) {
ctx.Input<phi::DenseTensor>("X")->mem_desc().get_inner_nblks() == 0) {
return phi::KernelKey(phi::Backend::ONEDNN,
phi::DataLayout::ONEDNN,
phi::TransToPhiDataType(input_data_type));
......
......@@ -33,7 +33,37 @@ struct InnerProductCache {
dnnl::memory weights_mem;
dnnl::memory bias_mem;
dnnl::memory dst_mem;
dnnl::memory src_scales_mem;
dnnl::memory wei_scales_mem;
dnnl::memory dst_scales_mem;
};
std::tuple<std::vector<float>,
std::vector<float>,
std::vector<float>,
std::vector<float>>
GetDNNLScales(const ExecutionContext& ctx) {
auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_out = ctx.Attr<float>("Scale_out");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
auto scale_in_eltwise_data = ctx.HasAttr("Scale_in_eltwise")
? ctx.Attr<float>("Scale_in_eltwise")
: 1.0f;
std::vector<float> dnnl_src_scales = {1.f / scale_in_data};
size_t count = scale_weights_data.size();
std::vector<float> dnnl_wei_scales(count);
#pragma omp parallel for if (count > 50)
for (size_t i = 0; i < count; i++) {
dnnl_wei_scales[i] = 1.f / scale_weights_data[i];
}
std::vector<float> dnnl_psum_scales = {1.f / scale_in_eltwise_data};
std::vector<float> dnnl_dst_scales = {1.f / scale_out};
return std::make_tuple(
dnnl_src_scales, dnnl_wei_scales, dnnl_psum_scales, dnnl_dst_scales);
}
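// Note (added for clarity, not part of the patch): Paddle's Scale_* attributes
// store quantization factors (int8 value ~ fp32 value * scale), while oneDNN v3
// expects dequantization factors (fp32 value ~ scale * int8 value), hence the
// 1.f / scale conversions above.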
template <typename T_in, typename T_w, typename T_out>
class FCMKLDNNHandler
: public phi::funcs::OneDNNHandlerNoCachingT<T_in,
......@@ -100,11 +130,46 @@ class FCMKLDNNHandler
float sum_scale = 1.0f;
float activation_scale = 1.0f;
if (phi::funcs::is_int8<T_w>()) {
std::vector<float> output_shift_scale;
std::tie(output_shift_scale, sum_scale, activation_scale) =
GetOutputScales(ctx);
int mask = CreateMask(1, output_shift_scale.size() > 1);
attributes.set_output_scales(mask, output_shift_scale);
std::vector<float> src_scales, wei_scales, psum_scales, dst_scales;
std::tie(src_scales, wei_scales, psum_scales, dst_scales) =
GetDNNLScales(ctx);
bool force_fp32_output = ctx.HasAttr("force_fp32_output") &&
ctx.Attr<bool>("force_fp32_output");
attributes.set_scales_mask(DNNL_ARG_SRC, 0);
dnnl::memory::desc src_scales_md(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
src_scales_mem_ = dnnl::memory(src_scales_md, this->engine_);
memcpy(src_scales_mem_.get_data_handle(),
src_scales.data(),
src_scales.size() * sizeof(float));
int mask = wei_scales.size() > 1 ? 1 : 0;
attributes.set_scales_mask(DNNL_ARG_WEIGHTS, mask);
dnnl::memory::desc wei_scales_md(
{static_cast<int64_t>(wei_scales.size())},
dnnl::memory::data_type::f32,
dnnl::memory::format_tag::x);
wei_scales_mem_ = dnnl::memory(wei_scales_md, this->engine_);
memcpy(wei_scales_mem_.get_data_handle(),
wei_scales.data(),
wei_scales.size() * sizeof(float));
if (!force_fp32_output) {
attributes.set_scales_mask(DNNL_ARG_DST, 0);
dnnl::memory::desc dst_scales_md(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
dst_scales_mem_ = dnnl::memory(dst_scales_md, this->engine_);
memcpy(dst_scales_mem_.get_data_handle(),
dst_scales.data(),
dst_scales.size() * sizeof(float));
}
sum_scale = psum_scales[0];
}
if (ctx.HasAttr("fuse_residual_connection") &&
......@@ -114,41 +179,20 @@ class FCMKLDNNHandler
// ReLU from "fc_fuse_pass"
if (ctx.Attr<std::string>("activation_type") == "relu") {
post_operations.append_eltwise(
activation_scale, dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
post_operations.append_eltwise(dnnl::algorithm::eltwise_relu, 0.0f, 0.0f);
}
AppendActivation(ctx, post_operations, activation_scale);
if (ctx.HasAttr("fused_output_scale")) {
float scale_alpha = ctx.Attr<float>("fused_output_scale");
post_operations.append_eltwise(
1.0, dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f);
dnnl::algorithm::eltwise_linear, scale_alpha, 0.0f);
}
attributes.set_post_ops(post_operations);
return attributes;
}
// Compute the bias scales so that its values correspond to the
// scale of data being an output of weights and input multiplication
std::vector<float> GetBiasScales(const ExecutionContext& ctx) {
if (ctx.HasAttr("Bias_scales")) {
return ctx.Attr<std::vector<float>>("Bias_scales");
} else {
const float scale_in = ctx.Attr<float>("Scale_in");
const auto& scale_weights = ctx.Attr<std::vector<float>>("Scale_weights");
std::vector<float> bias_scales(scale_weights.size());
for (size_t i = 0; i < bias_scales.size(); ++i) {
if (scale_weights[i] == 0.0)
bias_scales[i] = 1.0f;
else
bias_scales[i] = scale_in * scale_weights[i];
}
return bias_scales;
}
}
void AppendActivation(const ExecutionContext& ctx,
dnnl::post_ops& post_ops, // NOLINT
float activation_scale = 1.0f) {
......@@ -174,55 +218,9 @@ class FCMKLDNNHandler
"Activation '%s' not found in oneDNN algorithms mapper",
fuse_activation));
post_ops.append_eltwise(activation_type->second, fuse_alpha, fuse_beta);
post_ops.append_eltwise(
activation_scale, activation_type->second, fuse_alpha, fuse_beta);
}
// Correct output scale, to take into account scaling of input and weights
// Since the data that comes out of input and weight multiplication is
// scaled with its own scales, this data needs to be divided by
// those scales to normalise them back to what their floating-point range
// was. Then we multiply them by desired output scale we want on the output.
std::tuple<std::vector<float>, float, float> GetOutputScales(
const ExecutionContext& ctx) {
if (ctx.HasAttr("Sum_scale")) {
return std::make_tuple(ctx.Attr<std::vector<float>>("Output_shift_scale"),
ctx.Attr<float>("Sum_scale"),
ctx.Attr<float>("Activation_scale"));
} else {
auto scale_in_data = ctx.Attr<float>("Scale_in");
auto scale_weights_data = ctx.Attr<std::vector<float>>("Scale_weights");
bool has_activation = !ctx.Attr<std::string>("activation_type").empty() ||
(ctx.HasAttr("fuse_activation") &&
!ctx.Attr<std::string>("fuse_activation").empty());
bool force_fp32_output = ctx.Attr<bool>("force_fp32_output");
bool fuse_residual_conn = ctx.HasAttr("fuse_residual_connection") &&
ctx.Attr<bool>("fuse_residual_connection");
auto scale_in_eltwise_data = ctx.HasAttr("Scale_in_eltwise")
? ctx.Attr<float>("Scale_in_eltwise")
: 1.0f;
// If the output will be in floats, we don't multiply by scale_out.
float activation_scale = (!force_fp32_output && has_activation)
? ctx.Attr<float>("Scale_out")
: 1.0f;
float scale_out_data = (force_fp32_output || has_activation)
? 1.0f
: ctx.Attr<float>("Scale_out");
float sum_scale =
fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;
const size_t weight_scales_num = scale_weights_data.size();
for (size_t i = 0; i < weight_scales_num; ++i) {
if (scale_weights_data[i] == 0.0)
scale_weights_data[i] = scale_out_data;
else
scale_weights_data[i] =
scale_out_data / (scale_in_data * scale_weights_data[i]);
}
return std::make_tuple(scale_weights_data, sum_scale, activation_scale);
}
dnnl::algorithm::eltwise_linear, activation_scale, 0.0f);
}
// Computing oneDNN's scaling mask which determines along which dimension
......@@ -235,7 +233,8 @@ class FCMKLDNNHandler
const dnnl::memory::desc& user_md,
const dnnl::memory::desc& target_md,
void* ptr,
const dnnl::primitive_attr& attrs) {
const dnnl::primitive_attr& attrs,
const std::vector<float>& scale_data) {
std::shared_ptr<dnnl::memory> target_memory_p;
auto user_memory_p =
......@@ -244,16 +243,21 @@ class FCMKLDNNHandler
auto reorder_p = std::make_shared<dnnl::reorder>(
*user_memory_p, *target_memory_p, attrs);
auto scales_md =
dnnl::memory::desc({static_cast<int64_t>(scale_data.size())},
dnnl::memory::data_type::f32,
dnnl::memory::format_tag::x);
auto scale_mem =
dnnl::memory(scales_md,
this->engine_,
phi::funcs::to_void_cast<float>(scale_data.data()));
auto& astream = OneDNNContext::tls().get_stream();
{
platform::RecordEvent record_reorder(
"int_reorder",
platform::TracerEventType::UserDefined,
1,
platform::EventRole::kUniqueOp);
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
reorder_p->execute(astream,
{{DNNL_ARG_FROM, *user_memory_p},
{DNNL_ARG_TO, *target_memory_p},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scale_mem}});
astream.wait();
}
......@@ -262,6 +266,9 @@ class FCMKLDNNHandler
std::string memory_key_;
const OneDNNContext& dev_ctx_;
dnnl::memory src_scales_mem_;
dnnl::memory wei_scales_mem_;
dnnl::memory dst_scales_mem_;
public:
std::shared_ptr<dnnl::memory> AcquireSrcMemoryWithReorder(
......@@ -272,7 +279,7 @@ class FCMKLDNNHandler
if (x->dims().size() != 2) {
// reshape restrictions are always satisfied because in case of 3 or 4 dim
// input, plain layout is enforced
user_md = user_md.reshape(this->fwd_pd_->src_desc().dims());
user_md = user_md.reshape(this->fwd_pd_->src_desc().get_dims());
}
return this->AcquireMemoryWithReorder(
......@@ -282,36 +289,8 @@ class FCMKLDNNHandler
std::shared_ptr<dnnl::memory> AcquireBiasMemoryWithReorder(
const ExecutionContext& ctx, const phi::DenseTensor* bias) {
const float* bias_data = bias->data<float>();
if (phi::funcs::is_int8<T_w>() == false) {
// for BF16/FP32 bias is 1D and has no scales, so reorder is not needed
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->bias_desc(),
to_void_cast<float>(bias_data));
} else {
const std::string bias_key = this->memory_key_ + "@bias";
auto memory_p = std::static_pointer_cast<dnnl::memory>(
this->dev_ctx_.GetBlob(bias_key));
if (!memory_p) {
const auto& scale_data = GetBiasScales(ctx);
dnnl::primitive_attr attrs;
int mask = CreateMask(0, scale_data.size() > 1);
attrs.set_output_scales(mask, scale_data);
auto user_md = dnnl::memory::desc({bias->dims()[0]},
OneDNNGetDataType<float>(),
dnnl::memory::format_tag::a);
memory_p = this->AcquireMemoryWithReorderAndAttrs(
user_md,
this->fwd_pd_->bias_desc(),
to_void_cast<float>(bias_data),
attrs);
this->dev_ctx_.SetBlob(bias_key, memory_p);
}
return memory_p;
}
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->bias_desc(),
to_void_cast<float>(bias_data));
}
std::shared_ptr<dnnl::memory> AcquireWeightsMemoryWithReorder(
......@@ -322,7 +301,7 @@ class FCMKLDNNHandler
if (!memory_p) {
const float* weights_data = weights->data<float>();
auto weights_dims = this->fwd_pd_->weights_desc().dims();
auto weights_dims = this->fwd_pd_->weights_desc().get_dims();
auto user_md = dnnl::memory::desc(weights_dims,
OneDNNGetDataType<float>(),
......@@ -331,13 +310,14 @@ class FCMKLDNNHandler
if (phi::funcs::is_int8<T_w>()) {
dnnl::primitive_attr attrs;
int mask = CreateMask(0, scale_data.size() > 1);
attrs.set_output_scales(mask, scale_data);
attrs.set_scales_mask(DNNL_ARG_SRC, mask);
memory_p = this->AcquireMemoryWithReorderAndAttrs(
user_md,
this->fwd_pd_->weights_desc(),
to_void_cast<float>(weights_data),
attrs);
attrs,
scale_data);
} else {
memory_p =
this->AcquireMemoryWithReorder(user_md,
......@@ -370,7 +350,18 @@ class FCMKLDNNHandler
}
return this->template AcquireDstMemory<T_out>(out);
} // namespace operators
}; // namespace paddle
void SetScalesIfNeeded(std::unordered_map<int, dnnl::memory>* args) {
if (src_scales_mem_.get_desc().is_zero() != true) {
args->insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, src_scales_mem_});
args->insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, wei_scales_mem_});
}
// dst scales may be empty when force fp32 output
if (dst_scales_mem_.get(true)) {
args->insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, dst_scales_mem_});
}
}
}; // namespace paddle
#define IF_CHANGE_FC_TW_TYPENAME(condition, ...) \
if (condition) { \
......@@ -408,7 +399,7 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
const std::shared_ptr<dnnl::memory>& src_mem,
const phi::DenseTensor* x,
const dnnl::engine& engine) const {
auto x_md = x->mem_desc().reshape(src_mem->get_desc().dims());
auto x_md = x->mem_desc().reshape(src_mem->get_desc().get_dims());
if (x_md != src_mem->get_desc()) {
dnnl::memory x_mem(x_md, engine, to_void_cast<T_in>(x->data<T_in>()));
auto reorder_p = dnnl::reorder(x_mem, *src_mem);
......@@ -453,6 +444,8 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
RecomputeOutputDims(ctx, x, weights, out);
std::unordered_map<int, dnnl::memory> fc_args;
if (inner_product_cache) {
fc_p = std::make_shared<dnnl::inner_product_forward>(
inner_product_cache->inner_product_p);
......@@ -474,9 +467,25 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
ctx.GetPlace(), dst_memory_p->get_desc().get_size());
dst_memory_p->set_data_handle(out_ptr);
fc_args.insert({DNNL_ARG_SRC, *src_memory_p});
fc_args.insert({DNNL_ARG_WEIGHTS, *weights_memory_p});
fc_args.insert({DNNL_ARG_DST, *dst_memory_p});
if (bias) {
bias_memory_p =
std::make_shared<dnnl::memory>(inner_product_cache->bias_mem);
fc_args.insert({DNNL_ARG_BIAS, *bias_memory_p});
}
if (inner_product_cache->src_scales_mem.get(true)) {
fc_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC,
inner_product_cache->src_scales_mem});
fc_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS,
inner_product_cache->wei_scales_mem});
}
if (inner_product_cache->dst_scales_mem.get(true)) {
fc_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST,
inner_product_cache->dst_scales_mem});
}
} else {
auto in_col_dims = ctx.Attr<int>("in_num_col_dims");
......@@ -495,25 +504,23 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
weights_memory_p =
handler.AcquireWeightsMemoryWithReorder(weights, scale_weights);
dst_memory_p = handler.AcquireCustomDstMemory(ctx, out);
fc_args.insert({DNNL_ARG_SRC, *src_memory_p});
fc_args.insert({DNNL_ARG_WEIGHTS, *weights_memory_p});
fc_args.insert({DNNL_ARG_DST, *dst_memory_p});
if (bias) {
bias_memory_p = handler.AcquireBiasMemoryWithReorder(ctx, bias);
fc_args.insert({DNNL_ARG_BIAS, *bias_memory_p});
}
if (phi::funcs::is_int8<T_in>()) {
handler.SetScalesIfNeeded(&fc_args);
}
fc_p = handler.AcquireForwardPrimitive();
}
auto& astream = OneDNNContext::tls().get_stream();
std::unordered_map<int, dnnl::memory> fc_args = {
{DNNL_ARG_SRC, *src_memory_p},
{DNNL_ARG_WEIGHTS, *weights_memory_p},
{DNNL_ARG_DST, *dst_memory_p}};
if (bias) {
fc_args.insert({DNNL_ARG_BIAS, *bias_memory_p});
}
fc_p->execute(astream, fc_args);
astream.wait();
......@@ -526,6 +533,18 @@ class FCMKLDNNKernel : public framework::OpKernel<T_in> {
if (bias) {
ip_cache->bias_mem = *bias_memory_p;
}
if (fc_args.count(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC)) {
ip_cache->src_scales_mem =
fc_args.at(DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC);
ip_cache->wei_scales_mem =
fc_args.at(DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS);
}
if (fc_args.count(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST)) {
ip_cache->dst_scales_mem =
fc_args.at(DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST);
}
dev_ctx.SetBlob(cache_key, ip_cache);
}
......
......@@ -37,49 +37,34 @@ class LayerNormOneDNNHandler
engine, cpu_place) {
const auto fwd_prop_kind = is_test ? dnnl::prop_kind::forward_inference
: dnnl::prop_kind::forward_training;
this->AcquireForwardPrimitiveDescriptor(
fwd_prop_kind, x->mem_desc(), epsilon, flags);
fwd_prop_kind, x->mem_desc(), x->mem_desc(), epsilon, flags);
}
std::shared_ptr<dnnl::memory> AcquireScaleShiftMemory(
const phi::DenseTensor* scale,
const phi::DenseTensor* shift,
const framework::ExecutionContext& ctx) {
// OneDNN requires a single piece of memory for scale and shift data. During
// inference both pieces of memory are merged inside
// layer_norm_onednn_optimization_pass, but during training we have to
// manually copy them into new memory buffer
auto* scaleshift = ctx.Input<phi::DenseTensor>("ScaleShift");
if (scaleshift) {
return this->AcquireMemoryFromPrimitive(
this->fwd_pd_->weights_desc(),
phi::funcs::to_void_cast(scaleshift->data<float>()));
} else {
const unsigned int C = phi::vectorize(scale->dims())[0];
auto scaleshift_memory =
this->AcquireMemoryFromPrimitive(this->fwd_pd_->weights_desc());
auto mem_data_handle =
reinterpret_cast<float*>(scaleshift_memory->get_data_handle());
std::copy(
scale->data<float>(), scale->data<float>() + C, mem_data_handle);
std::copy(
shift->data<float>(), shift->data<float>() + C, mem_data_handle + C);
return scaleshift_memory;
}
std::tuple<std::shared_ptr<dnnl::memory>, std::shared_ptr<dnnl::memory>>
AcquireScaleShiftMemory(const phi::DenseTensor* scale,
const phi::DenseTensor* shift) {
auto scale_memory = this->AcquireMemoryFromPrimitive(
this->fwd_pd_->weights_desc(),
phi::funcs::to_void_cast<float>(scale->data<float>()));
auto shift_memory = this->AcquireMemoryFromPrimitive(
this->fwd_pd_->weights_desc(),
phi::funcs::to_void_cast<float>(shift->data<float>()));
return std::make_tuple(scale_memory, shift_memory);
}
std::shared_ptr<dnnl::memory> AcquireMeanMemory(phi::DenseTensor* mean) {
T* mean_data = mean->mutable_data<T>(this->place_,
this->fwd_pd_->mean_desc().get_size());
float* mean_data = mean->mutable_data<float>(
this->place_, this->fwd_pd_->mean_desc().get_size());
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->mean_desc(),
mean_data);
}
std::shared_ptr<dnnl::memory> AcquireVarianceMemory(
phi::DenseTensor* variance) {
T* variance_data = variance->mutable_data<T>(
float* variance_data = variance->mutable_data<float>(
this->place_, this->fwd_pd_->variance_desc().get_size());
return this->AcquireMemoryFromPrimitive(this->fwd_pd_->variance_desc(),
variance_data);
......@@ -114,7 +99,8 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
dnnl::normalization_flags flags{};
if (with_scaleshift) {
flags |= dnnl::normalization_flags::use_scale_shift;
flags |= dnnl::normalization_flags::use_scale |
dnnl::normalization_flags::use_shift;
}
LayerNormOneDNNHandler<T> handler(
......@@ -141,9 +127,9 @@ class LayerNormMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
}
if (with_scaleshift) {
std::shared_ptr<dnnl::memory> scaleshift_memory =
handler.AcquireScaleShiftMemory(scale, bias, ctx);
args.insert({DNNL_ARG_SCALE_SHIFT, *scaleshift_memory});
auto scaleshift_mems = handler.AcquireScaleShiftMemory(scale, bias);
args.insert({DNNL_ARG_SCALE, *(std::get<0>(scaleshift_mems))});
args.insert({DNNL_ARG_SHIFT, *(std::get<1>(scaleshift_mems))});
}
layer_norm_p->execute(astream, args);
......
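For reference, the layer_norm change above replaces the v2 combined scale-shift buffer with two independent memories bound to separate arguments, alongside the switch from use_scale_shift to use_scale | use_shift. A minimal sketch of the new binding, assuming scale_mem and shift_mem already wrap the Scale and Bias tensors:

#include <unordered_map>
#include "dnnl.hpp"

// Illustrative only: v3 drops DNNL_ARG_SCALE_SHIFT in favour of two
// separately bound f32 memories.
void bind_layer_norm_scale_shift(std::unordered_map<int, dnnl::memory> *args,
                                 const dnnl::memory &scale_mem,
                                 const dnnl::memory &shift_mem) {
  args->insert({DNNL_ARG_SCALE, scale_mem});  // built from the Scale tensor
  args->insert({DNNL_ARG_SHIFT, shift_mem});  // built from the Bias tensor
}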
......@@ -50,6 +50,7 @@ class LRNOneDNNHandler
: dnnl::prop_kind::forward_training,
dnnl::algorithm::lrn_across_channels,
input->mem_desc(),
input->mem_desc(),
n,
alpha,
beta,
......@@ -80,6 +81,7 @@ class LRNOneDNNHandler
dnnl::prop_kind::forward_training,
dnnl::algorithm::lrn_across_channels,
in_x->mem_desc(),
in_x->mem_desc(),
n,
alpha,
beta,
......@@ -87,8 +89,9 @@ class LRNOneDNNHandler
this->AcquireBackwardPrimitiveDescriptor(
dnnl::algorithm::lrn_across_channels,
in_x->mem_desc(),
out_grad->mem_desc(),
out_grad->mem_desc(),
in_x->mem_desc(),
n,
alpha,
beta,
......
......@@ -160,7 +160,7 @@ class MatMulV1OneDNNHandler
dnnl::primitive_attr matmul_attrs;
float scale_out = ComputeOutputScale(ctx);
if (scale_out != 1.0f) {
matmul_attrs.set_output_scales(0, {scale_out});
matmul_attrs.set_scales_mask(DNNL_ARG_SRC, 0);
}
return matmul_attrs;
}
......@@ -226,7 +226,9 @@ class MatMulOneDNNHandler
auto out_md = memory::desc(out_dims, OneDNNGetDataType<OT>(), out_strides);
dnnl::primitive_attr attrs;
if (scale != 1.0f) attrs.set_output_scales(0, {scale});
if (scale != 1.0f) {
attrs.set_scales_mask(DNNL_ARG_SRC, 0);
}
this->AcquireForwardPrimitiveDescriptor(attrs, x_md, y_md, out_md);
}
......@@ -330,6 +332,15 @@ void ExecuteMatMulV1(const ExecutionContext &ctx,
{DNNL_ARG_WEIGHTS, *weights_memory_p},
{DNNL_ARG_DST, *dst_memory_p}};
float computed_scale_x = handler.ComputeOutputScale(ctx);
if (std::fabs(computed_scale_x - 1.f) > 1e-6f) {
auto scale_x_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto scale_x_mem =
dnnl::memory(scale_x_md, onednn_engine, &computed_scale_x);
matmul_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scale_x_mem});
}
auto &astream = OneDNNContext::tls().get_stream();
matmul_p->execute(astream, matmul_args);
astream.wait();
......@@ -602,6 +613,16 @@ class MatMulGradMKLDNNKernel : public paddle::framework::OpKernel<T> {
y_combined = *y;
}
float alpha = ctx.HasAttr("alpha") ? ctx.Attr<float>("alpha") : 1.0f;
auto alpha_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto scale_mem =
alpha != 1.0f
? dnnl::memory(
alpha_md, engine, phi::funcs::to_void_cast<float>(&alpha))
: dnnl::memory();
MatMulOneDNNHandler<T, T, T> handler(engine,
ctx.GetPlace(),
&x_combined,
......@@ -621,6 +642,9 @@ class MatMulGradMKLDNNKernel : public paddle::framework::OpKernel<T> {
{DNNL_ARG_SRC, *src_memory_p},
{DNNL_ARG_WEIGHTS, *weights_memory_p},
{DNNL_ARG_DST, *dst_memory_p}};
if (alpha != 1.0f) {
matmul_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scale_mem});
}
auto &astream = OneDNNContext::tls().get_stream();
matmul_p->execute(astream, matmul_args);
......
......@@ -36,7 +36,8 @@ class QuantOpKernel : public framework::OpKernel<T> {
auto* out = ctx.Output<phi::DenseTensor>("Output");
const auto quantization_scale = ctx.Attr<float>("Scale");
const auto quantization_shift = ctx.Attr<float>("Shift");
const auto quantization_shift =
static_cast<int32_t>(ctx.Attr<float>("Shift"));
const bool with_scale = quantization_scale != 1.0f;
const bool with_shift = quantization_shift != 0.0f;
......@@ -61,12 +62,11 @@ class QuantOpKernel : public framework::OpKernel<T> {
static constexpr int32_t mask = 0;
if (with_scale) {
attrs.set_output_scales(mask, {quantization_scale});
attrs.set_scales_mask(DNNL_ARG_SRC, mask);
}
if (with_shift) {
attrs.set_zero_points(
DNNL_ARG_DST, mask, {static_cast<int32_t>(quantization_shift)});
attrs.set_zero_points_mask(DNNL_ARG_DST, mask);
}
auto x_type = phi::funcs::ToOneDNNDataType(x->dtype());
......@@ -94,7 +94,32 @@ class QuantOpKernel : public framework::OpKernel<T> {
reorder_dst_memory_p, reorder_src_memory_p, attrs);
auto& astream = phi::OneDNNContext::tls().get_stream();
reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
auto scales_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto scales_mem =
dnnl::memory(scales_md,
dev_ctx.GetEngine(),
phi::funcs::to_void_cast<float>(&quantization_scale));
auto zero_points_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::s32, dnnl::memory::format_tag::x);
auto zero_points_mem =
dnnl::memory(zero_points_md,
dev_ctx.GetEngine(),
phi::funcs::to_void_cast<int32_t>(&quantization_shift));
std::unordered_map<int, dnnl::memory> reorder_args;
reorder_args.insert({DNNL_ARG_SRC, *reorder_src_memory_p});
reorder_args.insert({DNNL_ARG_DST, *reorder_dst_memory_p});
if (with_scale) {
reorder_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scales_mem});
}
if (with_shift) {
reorder_args.insert(
{DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, zero_points_mem});
}
reorder_p->execute(astream, reorder_args);
astream.wait();
out->set_mem_desc(reorder_dst_memory_p->get_desc());
......
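The quantize kernel above applies the same runtime-argument scheme to reorders. A hedged, self-contained sketch of a v3 quantizing reorder with a runtime source scale and destination zero point (all names and values are illustrative):

#include "dnnl.hpp"

// Illustrative only: quantization via reorder in oneDNN v3. Scales and zero
// points are declared by mask on the attribute and passed at execute() time.
void quantize_reorder(const dnnl::engine &engine, dnnl::stream &stream,
                      const dnnl::memory &src_f32, dnnl::memory &dst_s8,
                      float scale, int32_t shift) {
  dnnl::primitive_attr attrs;
  attrs.set_scales_mask(DNNL_ARG_SRC, 0);       // one common scale
  attrs.set_zero_points_mask(DNNL_ARG_DST, 0);  // one common zero point

  dnnl::memory::desc scale_md({1}, dnnl::memory::data_type::f32,
                              dnnl::memory::format_tag::x);
  dnnl::memory::desc zp_md({1}, dnnl::memory::data_type::s32,
                           dnnl::memory::format_tag::x);
  dnnl::memory scale_mem(scale_md, engine, &scale);
  dnnl::memory zp_mem(zp_md, engine, &shift);

  dnnl::reorder reorder(src_f32, dst_s8, attrs);
  reorder.execute(stream, {{DNNL_ARG_SRC, src_f32},
                           {DNNL_ARG_DST, dst_s8},
                           {DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scale_mem},
                           {DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, zp_mem}});
  stream.wait();
}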
......@@ -39,10 +39,10 @@ class ReQuantOpKernel : public framework::OpKernel<T> {
void Compute(const framework::ExecutionContext& ctx) const override {
auto* input = ctx.Input<phi::DenseTensor>("Input");
auto scale_in = ctx.Attr<float>("Scale_in");
auto shift_in = ctx.Attr<float>("Shift_in");
auto shift_in = static_cast<int32_t>(ctx.Attr<float>("Shift_in"));
auto scale_out = ctx.Attr<float>("Scale_out");
auto shift_out = ctx.Attr<float>("Shift_out");
bool with_shift = shift_in != 0.0f || shift_out != 0.0f;
auto shift_out = static_cast<int32_t>(ctx.Attr<float>("Shift_out"));
bool with_shift = shift_in != 0 || shift_out != 0;
auto* output = ctx.Output<phi::DenseTensor>("Output");
PADDLE_ENFORCE_NE(
......@@ -53,7 +53,7 @@ class ReQuantOpKernel : public framework::OpKernel<T> {
scale_out,
0.0f,
platform::errors::InvalidArgument("Scale of output cannot be 0.0"));
if (shift_in != 0.0f) {
if (shift_in != 0) {
PADDLE_ENFORCE_EQ(
input->dtype(),
DataType::UINT8,
......@@ -68,19 +68,26 @@ class ReQuantOpKernel : public framework::OpKernel<T> {
auto src_paddle_dt = input->dtype();
auto dst_paddle_dt = with_shift ? DataType::UINT8 : src_paddle_dt;
auto xstrides = input->mem_desc().data.format_desc.blocking.strides;
std::vector<dnnl_dim_t> vstrides(xstrides,
xstrides + input->mem_desc().data.ndims);
auto xstrides = input->mem_desc().get_strides();
dnnl::primitive_attr attrs;
int mask = 0;
float reorder_scale = scale_out / scale_in;
attrs.set_output_scales(mask, {reorder_scale});
float reorder_scale = scale_in / scale_out;
attrs.set_scales_mask(DNNL_ARG_DST, mask);
auto scales_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto scales_mem =
dnnl::memory(scales_md,
dev_ctx.GetEngine(),
phi::funcs::to_void_cast<float>(&reorder_scale));
uint32_t reorder_shift =
with_shift
? clip_to_uint8(shift_out - (1.0f / reorder_scale) * shift_in)
: 0;
if (with_shift) {
uint8_t reorder_shift =
clip_to_uint8(shift_out - reorder_scale * shift_in);
attrs.set_zero_points(
DNNL_ARG_DST, mask, {static_cast<int32_t>(reorder_shift)});
attrs.set_zero_points_mask(DNNL_ARG_DST, mask);
}
phi::funcs::ReorderOneDNNHandler reorder_handler(
......@@ -94,13 +101,29 @@ class ReQuantOpKernel : public framework::OpKernel<T> {
auto src_memory_p = reorder_handler.AcquireSrcMemory(
input->mem_desc(), phi::funcs::to_void_cast(input->data<T>()));
auto dst_memory_p = reorder_handler.AcquireDstMemory(
output, src_tz, vstrides, dev_ctx.GetPlace());
output, src_tz, xstrides, dev_ctx.GetPlace());
auto reorder_p =
reorder_handler.AcquireReorder(dst_memory_p, src_memory_p, attrs);
auto& astream = phi::OneDNNContext::tls().get_stream();
reorder_p->execute(astream, *src_memory_p, *dst_memory_p);
auto zero_points_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::s32, dnnl::memory::format_tag::x);
auto zero_points_out_mem =
dnnl::memory(zero_points_md, dev_ctx.GetEngine(), &reorder_shift);
std::unordered_map<int, dnnl::memory> reorder_args;
reorder_args.insert({DNNL_ARG_SRC, *src_memory_p});
reorder_args.insert({DNNL_ARG_DST, *dst_memory_p});
reorder_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, scales_mem});
// shift for DST
if (with_shift) {
reorder_args.insert(
{DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_DST, zero_points_out_mem});
}
reorder_p->execute(astream, reorder_args);
astream.wait();
output->set_mem_desc(dst_memory_p->get_desc());
......
......@@ -29,8 +29,11 @@ class ShuffleChannelMKLDNNHandler
: phi::funcs::OneDNNHandlerNoCachingT<T, dnnl::shuffle_forward>(
engine, cpu_place) {
static constexpr int channel_axis = 1;
this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_training, x->mem_desc(), channel_axis, group);
this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_training,
x->mem_desc(),
x->mem_desc(),
channel_axis,
group);
}
};
......
......@@ -55,13 +55,13 @@ class TransposeMKLDNNOpKernel : public paddle::framework::OpKernel<T> {
auto dst_md =
dnnl::memory::desc(x_vec_dims,
x->mem_desc().data_type(),
x->mem_desc().get_data_type(),
phi::funcs::GetPlainOneDNNFormat(x_vec_dims.size()));
auto dst_strides =
phi::funcs::FakeTransposeStrides(dst_md.dims(), transpose_axis);
phi::funcs::FakeTransposeStrides(dst_md.get_dims(), transpose_axis);
dst_md =
dnnl::memory::desc(x_vec_dims, x->mem_desc().data_type(), dst_strides);
dst_md = dnnl::memory::desc(
x_vec_dims, x->mem_desc().get_data_type(), dst_strides);
auto dst_data =
out->mutable_data(ctx.GetPlace(), x->type(), dst_md.get_size());
......
......@@ -703,9 +703,7 @@ class Pad2dOp : public framework::OperatorWithKernel {
// only constant mode and non-blocked layouts are supported for oneDNN
if (this->CanMKLDNNBeUsed(ctx, input_data_type) &&
ctx.Attr<std::string>("mode") == "constant" &&
ctx.Input<phi::DenseTensor>("X")
->mem_desc()
.data.format_desc.blocking.inner_nblks == 0) {
ctx.Input<phi::DenseTensor>("X")->mem_desc().get_inner_nblks() == 0) {
return phi::KernelKey(phi::Backend::ONEDNN,
phi::DataLayout::ONEDNN,
phi::TransToPhiDataType(input_data_type));
......
......@@ -164,7 +164,7 @@ class SliceOp : public framework::OperatorWithKernel {
// created, so in that scenario a fallback is needed
if (ctx.Input<phi::DenseTensor>("Input")
->mem_desc()
.data.format_desc.blocking.inner_nblks == 0) {
.get_inner_nblks() == 0) {
return phi::KernelKey(phi::Backend::ONEDNN,
phi::DataLayout::ONEDNN,
phi::TransToPhiDataType(input_data_type));
......@@ -341,7 +341,7 @@ class SliceOpGrad : public framework::OperatorWithKernel {
// created, so in that scenario a fallback is needed
if (ctx.Input<phi::DenseTensor>(framework::GradVarName("Out"))
->mem_desc()
.data.format_desc.blocking.inner_nblks == 0) {
.get_inner_nblks() == 0) {
return phi::KernelKey(phi::Backend::ONEDNN,
phi::DataLayout::ONEDNN,
phi::TransToPhiDataType(input_data_type));
......
......@@ -124,7 +124,7 @@ class SplitOp : public framework::OperatorWithKernel {
// 16(depending on which blocking format is used) submemory cannot be
// created, so in that scenario a fallback is needed
const auto x_md = ctx.Input<phi::DenseTensor>("X")->mem_desc();
if (x_md.data.format_desc.blocking.inner_nblks == 0) {
if (x_md.get_inner_nblks() == 0) {
return phi::KernelKey(phi::Backend::ONEDNN,
phi::DataLayout::ONEDNN,
phi::TransToPhiDataType(input_data_type));
......
......@@ -33,14 +33,19 @@ class AXPYHandler {
{n}, OneDNNGetDataType<T>(), dnnl::memory::format_tag::x);
src_mem_ = dnnl::memory(md, onednn_engine, DNNL_MEMORY_NONE);
dst_mem_ = dnnl::memory(md, onednn_engine, DNNL_MEMORY_NONE);
dnnl::primitive_attr reorder_attr;
dnnl::post_ops post_operations;
if (alpha != 1.f) {
std::vector<float> scales(1, alpha);
reorder_attr.set_output_scales(0, scales);
reorder_attr.set_scales_mask(DNNL_ARG_FROM, 0); // Ax + b
auto scales_md = dnnl::memory::desc(
{n}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
src_scales_mem_ = dnnl::memory(scales_md, onednn_engine);
*reinterpret_cast<float *>(src_scales_mem_.get_data_handle()) = alpha;
}
post_operations.append_sum(1.0f);
dnnl::post_ops post_operations;
post_operations.append_sum(1.0f);
reorder_attr.set_post_ops(post_operations);
reorder_p_ = dnnl::reorder(src_mem_, dst_mem_, reorder_attr);
}
......@@ -50,6 +55,8 @@ class AXPYHandler {
return src_mem_;
}
dnnl::memory &AcquireAlphaMemory() { return this->src_scales_mem_; }
dnnl::memory &AcquireDstMemory(T *y) {
dst_mem_.set_data_handle(y);
return dst_mem_;
......@@ -59,6 +66,7 @@ class AXPYHandler {
private:
dnnl::memory src_mem_;
dnnl::memory src_scales_mem_;
dnnl::memory dst_mem_;
dnnl::reorder reorder_p_;
};
......@@ -107,7 +115,16 @@ void OneDNNAXPYHandler<T>::Impl::operator()(const T *x, T *y) {
auto &reorder_dst_mem_p = handler_->AcquireDstMemory(y);
auto reorder_p = handler_->AcquireReorder();
auto &astream = OneDNNContext::tls().get_stream();
reorder_p.execute(astream, reorder_src_mem_p, reorder_dst_mem_p);
std::unordered_map<int, dnnl::memory> reorder_args;
reorder_args.insert({DNNL_ARG_SRC, reorder_src_mem_p});
reorder_args.insert({DNNL_ARG_DST, reorder_dst_mem_p});
if (static_cast<float>(this->alpha_) != 1.f) {
reorder_args.insert(
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, handler_->AcquireAlphaMemory()});
}
reorder_p.execute(astream, reorder_args);
astream.wait();
}
......
......@@ -36,10 +36,12 @@ void FusedElementwiseKernel(const OneDNNContext& dev_ctx,
dnnl::post_ops post_operations;
funcs::AppendActivation(
dev_ctx, post_operations, 1.0f, fuse_activation, fuse_alpha, fuse_beta);
dev_ctx, post_operations, fuse_activation, fuse_alpha, fuse_beta);
if (fused_output_scale != 1.0) {
// linear post op's formula is `alpha * dst + beta`. Here we only want to
// scale the output not shift it, so the beta is set to 0.0f.
post_operations.append_eltwise(
1.0, dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f);
dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f);
}
auto* non_const_x = &x;
......@@ -96,10 +98,19 @@ void FusedElementwiseKernel(const OneDNNContext& dev_ctx,
auto& astream = OneDNNContext::tls().get_stream();
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_x_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_memory}};
std::unordered_map<int, dnnl::memory> args = {{DNNL_ARG_SRC_0, *src_x_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_memory}};
if (handler.Has_SRC_0_Scale()) {
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0,
handler.Get_SRC_0_Scale_Memory()});
}
if (handler.Has_SRC_1_Scale()) {
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_1,
handler.Get_SRC_1_Scale_Memory()});
}
binary_prim->execute(astream, args);
astream.wait();
......@@ -107,7 +118,7 @@ void FusedElementwiseKernel(const OneDNNContext& dev_ctx,
auto out_md = dst_memory->get_desc();
if (handler.use_broadcasting_hack) {
auto dims = out_md.dims();
auto dims = out_md.get_dims();
dims.insert(dims.begin(), non_const_x->dims()[0]);
dims[1] /= dims[0];
out_md = out_md.reshape(dims);
......
......@@ -139,17 +139,6 @@ class FusedMatmulOneDNNHandler
this->AcquireForwardPrimitiveDescriptor(matmul_attrs, x_md, y_md, out_md);
}
float ComputeOutputScale(float matmul_alpha,
const float scale_x,
const float scale_y,
const float scale_in_eltwise UNUSED,
const float scale_out,
const bool force_fp32_output) {
float f_scale_out = force_fp32_output ? 1.0f : scale_out;
matmul_alpha *= f_scale_out / (scale_x * scale_y);
return matmul_alpha;
}
dnnl::primitive_attr CreateMatmulAttrs(const OneDNNContext &dev_ctx,
const DenseTensor *residual_data,
const float matmul_alpha,
......@@ -165,14 +154,17 @@ class FusedMatmulOneDNNHandler
dnnl::primitive_attr matmul_attrs;
dnnl::post_ops post_operations;
float computed_scale_out = ComputeOutputScale(matmul_alpha,
scale_x,
scale_y,
scale_in_eltwise,
scale_out,
force_fp32_output);
if (computed_scale_out != 1.0f) {
matmul_attrs.set_output_scales(0, {computed_scale_out});
if (scale_x != 1.0f) {
matmul_attrs.set_scales_mask(DNNL_ARG_SRC, 0);
}
// alpha can be folded to weight scale
if (scale_y != 1.0f || matmul_alpha != 1.0f) {
matmul_attrs.set_scales_mask(DNNL_ARG_WEIGHTS, 0);
}
if (!force_fp32_output && scale_out != 1.0f) {
matmul_attrs.set_scales_mask(DNNL_ARG_DST, 0);
}
if (residual_data) {
......@@ -183,17 +175,17 @@ class FusedMatmulOneDNNHandler
post_operations.append_binary(dnnl::algorithm::binary_add,
residual_data_md);
if (scale_in_eltwise != 0.0f) {
float sum_scale = scale_out / scale_in_eltwise;
float sum_scale = 1.f / scale_in_eltwise;
post_operations.append_sum(sum_scale);
}
}
funcs::AppendActivation(
dev_ctx, post_operations, 1.0f, fuse_activation, fuse_alpha, fuse_beta);
dev_ctx, post_operations, fuse_activation, fuse_alpha, fuse_beta);
if (fused_output_scale != 1.0f) {
post_operations.append_eltwise(
1.0, dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f);
dnnl::algorithm::eltwise_linear, fused_output_scale, 0.0f);
}
matmul_attrs.set_post_ops(post_operations);
......@@ -281,6 +273,37 @@ void ExecuteFusedMatmul(const OneDNNContext &dev_ctx,
*residual_data_memory_p});
}
if (scale_x != 1.0f) {
dnnl::memory::desc src_scales_md(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto src_scales_mem =
std::make_shared<dnnl::memory>(src_scales_md, dev_ctx.GetEngine());
*reinterpret_cast<float *>(src_scales_mem->get_data_handle()) =
1.f / scale_x;
matmul_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, *src_scales_mem});
}
if (scale_y != 1.0f || matmul_alpha != 1.0f) {
dnnl::memory::desc wei_scales_md(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto wei_scales_mem =
std::make_shared<dnnl::memory>(wei_scales_md, dev_ctx.GetEngine());
*reinterpret_cast<float *>(wei_scales_mem->get_data_handle()) =
matmul_alpha / scale_y;
matmul_args.insert(
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, *wei_scales_mem});
}
if (!force_fp32_output && scale_out != 1.0f) {
dnnl::memory::desc dst_scales_md(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto dst_scales_mem =
std::make_shared<dnnl::memory>(dst_scales_md, dev_ctx.GetEngine());
*reinterpret_cast<float *>(dst_scales_mem->get_data_handle()) =
1.f / scale_out;
matmul_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, *dst_scales_mem});
}
auto &astream = OneDNNContext::tls().get_stream();
matmul_p->execute(astream, matmul_args);
astream.wait();
......
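Note on the scale folding in CreateMatmulAttrs above (added for clarity, not part of the patch): oneDNN v3 multiplies each argument by its dequantization scale, so the effective product is (src_int * 1/scale_x) x (wei_int * matmul_alpha/scale_y) = matmul_alpha * src_fp32 * wei_fp32. The alpha factor is absorbed into the weight scale, which is why the removed ComputeOutputScale helper is no longer needed.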
......@@ -27,7 +27,7 @@ void SetInMemDescWithSqueeze2FuseSupport(
const dnnl::memory::desc& in_md) {
const std::set<int64_t> squeeze2_axes_set(fused_squeeze2_axes.begin(),
fused_squeeze2_axes.end());
const std::vector<int64_t>& x_vec_dims = in_md.dims();
const std::vector<int64_t>& x_vec_dims = in_md.get_dims();
std::vector<int64_t> squeezed_op_tz(
x_vec_dims.size() - fused_squeeze2_axes.size(), 0);
......@@ -113,12 +113,12 @@ void FusedTransposeKernel(const Context& dev_ctx,
const int32_t mask = 0;
if (scale != 1.0f) {
attrs.set_output_scales(mask, {scale});
attrs.set_scales_mask(DNNL_ARG_SRC, mask);
}
if (shift != 0.0f) {
auto dst = output_data_type == "fp32" ? DNNL_ARG_SRC : DNNL_ARG_DST;
attrs.set_zero_points(dst, mask, {static_cast<int32_t>(shift)});
auto arg = output_data_type == "fp32" ? DNNL_ARG_SRC : DNNL_ARG_DST;
attrs.set_zero_points_mask(arg, mask);
}
DataType out_dtype;
......@@ -149,8 +149,31 @@ void FusedTransposeKernel(const Context& dev_ctx,
auto reorder_p = reorder_handler.AcquireReorder(
reorder_dst_memory_p, reorder_src_memory_p, attrs);
std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC, *reorder_src_memory_p},
{DNNL_ARG_DST, *reorder_dst_memory_p},
};
if (scale != 1.0f) {
auto scales_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto scales = dnnl::memory(
scales_md, dev_ctx.GetEngine(), const_cast<float*>(&scale));
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scales});
}
if (shift != 0.0f) {
auto zps_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::s32, dnnl::memory::format_tag::x);
auto zps = dnnl::memory(zps_md, dev_ctx.GetEngine());
*reinterpret_cast<int32_t*>(zps.get_data_handle()) =
static_cast<int32_t>(shift);
auto arg = output_data_type == "fp32" ? DNNL_ARG_SRC : DNNL_ARG_DST;
args.insert({DNNL_ARG_ATTR_ZERO_POINTS | arg, zps});
}
auto& astream = OneDNNContext::tls().get_stream();
reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
reorder_p->execute(astream, args);
astream.wait();
auto out_md = reorder_dst_memory_p->get_desc().permute_axes(
......@@ -164,7 +187,7 @@ void FusedTransposeKernel(const Context& dev_ctx,
fused_reshape2_shape, out, out_md);
} else if (!fused_squeeze2_axes.empty()) {
out->set_mem_desc(out_md);
out->Resize(make_ddim(out_md.dims()));
out->Resize(make_ddim(out_md.get_dims()));
} else {
out->set_mem_desc(out_md);
}
......
......@@ -247,7 +247,11 @@ void HardSwishGradKernel(const Context& dev_ctx,
const DenseTensor& dout,
DenseTensor* dx) {
HardSwishOneDNNGradFunctor<T> functor;
functor(dev_ctx, x, dout, 0, 0, dx);
// the formula of oneDNN hardswish primitive is:
// d=s*max(0,min(1,alpha*s+beta)). here, we set alpha=1/6, beta=1/2, to make
// the formula equal to the hardswish definition in Paddle:
// https://www.paddlepaddle.org.cn/documentation/docs/en/api/paddle/nn/functional/hardswish_en.html
functor(dev_ctx, x, dout, 1.0 / 6.0, 1.0 / 2.0, dx);
}
template <typename T, typename Context>
......
......@@ -160,7 +160,7 @@ void HardSwishKernel(const Context& dev_ctx,
const DenseTensor& x,
DenseTensor* out) {
HardSwishOneDNNFunctor<T> functor;
functor(dev_ctx, x, 6, 0, out);
functor(dev_ctx, x, 1.0 / 6.0, 1.0 / 2.0, out);
}
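The alpha/beta change above can be sanity-checked without oneDNN: with alpha = 1/6 and beta = 1/2, the primitive's formula s*max(0, min(1, alpha*s + beta)) reduces to Paddle's hardswish x*min(max(x + 3, 0), 6)/6. A small self-contained check (illustrative only, not part of the commit):

#include <algorithm>
#include <cassert>
#include <cmath>

// Paddle definition: x * min(max(x + 3, 0), 6) / 6
static float hardswish_paddle(float x) {
  return x * std::min(std::max(x + 3.f, 0.f), 6.f) / 6.f;
}

// oneDNN hardswish with alpha = 1/6, beta = 1/2:
// d = s * max(0, min(1, alpha*s + beta))
static float hardswish_onednn(float s, float alpha = 1.f / 6.f,
                              float beta = 0.5f) {
  return s * std::max(0.f, std::min(1.f, alpha * s + beta));
}

int main() {
  for (float x = -6.f; x <= 6.f; x += 0.25f)
    assert(std::fabs(hardswish_paddle(x) - hardswish_onednn(x)) < 1e-6f);
  return 0;
}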
template <typename T, typename Context>
......
......@@ -49,16 +49,6 @@ class SumOneDNNHandler : public OneDNNHandlerNoCachingT<T, dnnl::sum> {
this->AcquireForwardPrimitiveDescriptor(dst_md, scales, srcs_md);
}
// (jczaja) sum oneDNN prim is not having .desc attribute so
// we cannot use base AcquireForwardPrimitiveDescriptor
void AcquireForwardPrimitiveDescriptor(
const dnnl::memory::desc& dst_md,
const std::vector<float>& scales,
const std::vector<dnnl::memory::desc>& srcs_md) {
this->fwd_pd_.reset(
new dnnl::sum::primitive_desc(dst_md, scales, srcs_md, this->engine_));
}
std::shared_ptr<dnnl::memory> AcquireSrcMemory(const DenseTensor* input,
int i) {
const T* input_data = input->data<T>();
......
......@@ -43,19 +43,17 @@ void BatchNormGradRawKernel(const Context& dev_ctx,
funcs::BatchNormOneDNNHandler<T> handler(
dev_ctx.GetEngine(), dev_ctx.GetPlace(), epsilon, &x, &scale, &y_grad);
const unsigned int C = vectorize(scale.dims())[0];
const size_t scaleshift_size = 2 * C;
std::vector<T> diff_scaleshift_data;
diff_scaleshift_data.reserve(scaleshift_size);
T* diff_scale_data = dev_ctx.template Alloc<T>(scale_grad);
T* diff_shift_data = dev_ctx.template Alloc<T>(bias_grad);
auto src_memory = handler.AcquireSrcMemory(&x);
auto mean_memory = handler.AcquireMeanMemory(&saved_mean);
auto variance_memory = handler.AcquireVarianceMemory(&saved_variance);
auto diff_dst_memory = handler.AcquireDiffDstMemory(&y_grad);
auto scaleshift_memory = handler.AcquireScaleShiftMemory(&scale, &bias);
auto scaleshift_mems = handler.AcquireScaleShiftMemory(&scale, &bias);
auto diff_src_memory = handler.AcquireDiffSrcMemory(x_grad);
auto diff_scaleshift_memory =
handler.AcquireDiffScaleShiftMemory(diff_scaleshift_data.data());
auto diff_scaleshift_mems =
handler.AcquireDiffScaleShiftMemory(diff_scale_data, diff_shift_data);
auto batch_norm_bwd_p = handler.AcquireBackwardPrimitive();
......@@ -66,20 +64,12 @@ void BatchNormGradRawKernel(const Context& dev_ctx,
{DNNL_ARG_MEAN, *mean_memory},
{DNNL_ARG_VARIANCE, *variance_memory},
{DNNL_ARG_DIFF_DST, *diff_dst_memory},
{DNNL_ARG_SCALE_SHIFT, *scaleshift_memory},
{DNNL_ARG_SCALE, *(std::get<0>(scaleshift_mems))},
{DNNL_ARG_DIFF_SRC, *diff_src_memory},
{DNNL_ARG_DIFF_SCALE_SHIFT, *diff_scaleshift_memory}});
{DNNL_ARG_DIFF_SCALE, *(std::get<0>(diff_scaleshift_mems))},
{DNNL_ARG_DIFF_SHIFT, *(std::get<1>(diff_scaleshift_mems))}});
astream.wait();
T* diff_scale_data = dev_ctx.template Alloc<T>(scale_grad);
T* diff_shift_data = dev_ctx.template Alloc<T>(bias_grad);
// copy back diff scale/shift to output tensors (diff scale/shift)
diff_scaleshift_data.resize(scaleshift_size);
auto it = std::begin(diff_scaleshift_data);
std::copy(it, std::next(it, C), diff_scale_data);
std::copy(std::next(it, C), std::end(diff_scaleshift_data), diff_shift_data);
// set memory descriptor of out tensor
x_grad->set_mem_desc(diff_src_memory->get_desc());
}
......
......@@ -58,7 +58,7 @@ void BatchNormKernel(const Context &dev_ctx,
test_mode);
auto src_memory = handler.AcquireSrcMemory(&x);
auto scaleshift_memory = handler.AcquireScaleShiftMemory(&scale, &bias);
auto scaleshift_mems = handler.AcquireScaleShiftMemory(&scale, &bias);
auto dst_memory = handler.AcquireDstMemory(y);
auto batch_norm_p = handler.AcquireForwardPrimitive();
......@@ -79,7 +79,8 @@ void BatchNormKernel(const Context &dev_ctx,
auto &astream = OneDNNContext::tls().get_stream();
batch_norm_p->execute(astream,
{{DNNL_ARG_SRC, *src_memory},
{DNNL_ARG_SCALE_SHIFT, *scaleshift_memory},
{DNNL_ARG_SCALE, *(std::get<0>(scaleshift_mems))},
{DNNL_ARG_SHIFT, *(std::get<1>(scaleshift_mems))},
{DNNL_ARG_MEAN, *mean_memory},
{DNNL_ARG_VARIANCE, *variance_memory},
{DNNL_ARG_DST, *dst_memory}});
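oneDNN v3 drops the packed DNNL_ARG_SCALE_SHIFT argument in favour of separate DNNL_ARG_SCALE / DNNL_ARG_SHIFT (and DNNL_ARG_DIFF_SCALE / DNNL_ARG_DIFF_SHIFT) memories, which is why the handler now returns a tuple of two memories. A minimal inference-mode sketch of the v3 argument layout (shapes, epsilon and values are illustrative, not the Paddle handler code):

#include <vector>
#include "oneapi/dnnl/dnnl.hpp"

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  const int64_t N = 1, C = 3, H = 2, W = 2;
  memory::desc data_md({N, C, H, W}, memory::data_type::f32,
                       memory::format_tag::nchw);
  memory::desc stat_md({C}, memory::data_type::f32, memory::format_tag::x);

  std::vector<float> x(N * C * H * W, 1.f), y(x.size(), 0.f);
  std::vector<float> scale(C, 1.f), shift(C, 0.f), mean(C, 0.f), var(C, 1.f);

  auto pd = batch_normalization_forward::primitive_desc(
      eng, prop_kind::forward_inference, data_md, data_md, /*epsilon=*/1e-5f,
      normalization_flags::use_scale | normalization_flags::use_shift |
          normalization_flags::use_global_stats);

  batch_normalization_forward(pd).execute(
      strm, {{DNNL_ARG_SRC, memory(data_md, eng, x.data())},
             {DNNL_ARG_DST, memory(data_md, eng, y.data())},
             // v2 packed the next two into one DNNL_ARG_SCALE_SHIFT memory.
             {DNNL_ARG_SCALE, memory(stat_md, eng, scale.data())},
             {DNNL_ARG_SHIFT, memory(stat_md, eng, shift.data())},
             {DNNL_ARG_MEAN, memory(stat_md, eng, mean.data())},
             {DNNL_ARG_VARIANCE, memory(stat_md, eng, var.data())}});
  strm.wait();
  return 0;
}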
......
......@@ -63,16 +63,6 @@ class ConcatOneDNNHandler : public OneDNNHandlerNoCachingT<T, dnnl::concat> {
this->AcquireForwardPrimitiveDescriptor(dst_md, concat_axis, srcs_md);
}
// (jczaja) concat oneDNN prim is not having .desc attribute so
// we cannot use base AcquireForwardPrimitiveDescriptor
void AcquireForwardPrimitiveDescriptor(
const memory::desc& dst_md,
const int concat_axis,
const std::vector<memory::desc>& srcs_md) {
this->fwd_pd_.reset(new dnnl::concat::primitive_desc(
dst_md, concat_axis, srcs_md, this->engine_));
}
std::shared_ptr<dnnl::memory> AcquireSrcMemory(const DenseTensor& input,
int i) {
const T* input_data = input.data<T>();
......
......@@ -241,30 +241,23 @@ void ComputeINT8(const OneDNNContext& dev_ctx,
{DNNL_ARG_DST, *dst_memory_p}};
if (bias) {
std::vector<float> bias_scales;
auto p_scales_tuple =
std::make_shared<std::tuple<float, std::vector<float>>>(
std::make_tuple(static_cast<float>(mask_reorder),
bias_scales));
if (dev_ctx.HasDnnAttr("Bias_scales")) {
bias_scales = PADDLE_GET_CONST(std::vector<float>,
dev_ctx.GetDnnAttr("Bias_scales"));
p_scales_tuple =
std::make_shared<std::tuple<float, std::vector<float>>>(
std::make_tuple(static_cast<float>(mask_reorder),
bias_scales));
} else {
p_scales_tuple = handler.get_int8_bias_scales(
filter, groups, scale_weights_data);
}
auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(
bias,
true,
std::get<1>(*p_scales_tuple),
std::get<0>(*p_scales_tuple));
auto bias_memory_p = handler.AcquireBiasMemoryWithReorder(bias, true);
args.insert({DNNL_ARG_BIAS, *bias_memory_p});
}
auto src_scales_memory = handler.AcquireScalesMemory(DNNL_ARG_SRC);
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, *src_scales_memory});
auto wei_scales_memory = handler.AcquireScalesMemory(DNNL_ARG_WEIGHTS);
args.insert(
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, *wei_scales_memory});
if (!force_fp32_output) {
auto dst_scales_memory = handler.AcquireScalesMemory(DNNL_ARG_DST);
args.insert(
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, *dst_scales_memory});
}
auto& astream = OneDNNContext::tls().get_stream();
conv_p->execute(astream, args);
astream.wait();
......
......@@ -127,7 +127,7 @@ void ConvGradKernel(const Context& dev_ctx,
// goidhw) for 2d conv with groups (five dimensional data reorder
// to goihw) auto weights_tz = phi::vectorize(filter->dims());
auto weights_tz = diff_weights_memory_p->get_desc().dims();
auto weights_tz = diff_weights_memory_p->get_desc().get_dims();
dnnl::memory::format_tag out_format =
weights_tz.size() == 6 ? dnnl::memory::format_tag::goidhw
: dnnl::memory::format_tag::goihw;
......
......@@ -183,8 +183,7 @@ class ConvOneDNNHandlerT
const auto dst_md = funcs::OneDNNMemDesc(
dst_tz, funcs::OneDNNGetDataType<T_out>(), chosen_memory_format);
const auto fwd_prop_kind = is_test ? dnnl::prop_kind::forward_inference
: dnnl::prop_kind::forward_training;
const auto fwd_prop_kind = dnnl::prop_kind::forward_inference;
const dnnl::primitive_attr conv_attr = CreateConvAttrs(filter,
groups,
force_fp32_output,
......@@ -193,15 +192,10 @@ class ConvOneDNNHandlerT
if (bias) {
auto bias_tz = phi::vectorize(bias->dims());
dnnl::memory::desc bias_md;
if (funcs::is_int8<T>()) {
bias_md = funcs::OneDNNMemDesc(bias_tz,
dnnl::memory::data_type::s32,
funcs::OneDNNMemoryFormat::x);
} else {
bias_md = funcs::OneDNNMemDesc(
bias_tz, data_type, funcs::OneDNNMemoryFormat::x);
}
dnnl::memory::desc bias_md =
funcs::OneDNNMemDesc(bias_tz,
dnnl::memory::data_type::f32,
funcs::OneDNNMemoryFormat::x);
this->AcquireForwardPrimitiveDescriptor(
conv_attr,
......@@ -340,20 +334,14 @@ class ConvOneDNNHandlerT
dnnl::primitive_attr conv_attr;
if (bias) {
auto bias_tz = phi::vectorize(bias->dims());
dnnl::memory::desc bias_md;
if (funcs::is_int8<T>()) {
bias_md = funcs::OneDNNMemDesc(bias_tz,
dnnl::memory::data_type::s32,
funcs::OneDNNMemoryFormat::x);
} else {
bias_md = funcs::OneDNNMemDesc(bias_tz,
dnnl::memory::data_type::f32,
funcs::OneDNNMemoryFormat::x);
}
dnnl::memory::desc bias_md =
funcs::OneDNNMemDesc(bias_tz,
dnnl::memory::data_type::f32,
funcs::OneDNNMemoryFormat::x);
this->AcquireForwardPrimitiveDescriptor(
conv_attr,
dnnl::prop_kind::forward_training,
dnnl::prop_kind::forward_inference,
dnnl::algorithm::convolution_direct,
src_md,
weights_md,
......@@ -366,7 +354,7 @@ class ConvOneDNNHandlerT
} else {
this->AcquireForwardPrimitiveDescriptor(
conv_attr,
dnnl::prop_kind::forward_training,
dnnl::prop_kind::forward_inference,
dnnl::algorithm::convolution_direct,
src_md,
weights_md,
......@@ -399,110 +387,6 @@ class ConvOneDNNHandlerT
}
}
std::shared_ptr<std::tuple<float, std::vector<float>>> get_int8_bias_scales(
const DenseTensor* filter,
int groups,
const std::vector<float>& scale_weights_data) {
// Get scales int8 bias key
const std::string key_bs = this->key_ + "@bs";
// Scales for int8 bias are to be cached to avoid
// computing them each iteration
groups = std::max(groups, 1);
auto bias_scale_tuple =
std::static_pointer_cast<std::tuple<float, std::vector<float>>>(
this->dev_ctx_.GetBlob(key_bs));
if (bias_scale_tuple) return bias_scale_tuple;
const auto& weights_tz = phi::vectorize(filter->dims());
const auto& scale_in_data =
this->dev_ctx_.HasDnnAttr("Scale_in")
? PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Scale_in"))
: 1.0f;
bool is_multi_channel = scale_weights_data.size() > 1;
int mask_reorder = is_multi_channel ? 1 << 0 : 1;
int count = 1;
if (is_multi_channel) {
count *= weights_tz[0];
if (groups > 1) {
count *= weights_tz[1];
}
}
bias_scale_tuple =
std::make_shared<std::tuple<float, std::vector<float>>>(std::make_tuple(
static_cast<float>(mask_reorder), std::vector<float>(count)));
for (int i = 0; i < count; i++) {
std::get<1>(*bias_scale_tuple)[i] = scale_in_data * scale_weights_data[i];
}
this->dev_ctx_.SetBlob(key_bs, bias_scale_tuple);
return bias_scale_tuple;
}
std::tuple<float, std::vector<float>, float> get_int8_scales(
const DenseTensor* filter,
int groups,
bool force_fp32_output,
bool fuse_residual_conn,
const std::string& fuse_activation) const {
const auto& weights_tz = phi::vectorize(filter->dims());
groups = std::max(groups, 1);
const auto& scale_weights_data =
this->dev_ctx_.HasDnnAttr("Scale_weights")
? PADDLE_GET_CONST(std::vector<float>,
this->dev_ctx_.GetDnnAttr("Scale_weights"))
: std::vector<float>{1.0f};
const auto& scale_in_data =
this->dev_ctx_.HasDnnAttr("Scale_in")
? PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Scale_in"))
: 1.0f;
const auto& scale_in_eltwise_data =
this->dev_ctx_.HasDnnAttr("Scale_in_eltwise")
? PADDLE_GET_CONST(float,
this->dev_ctx_.GetDnnAttr("Scale_in_eltwise"))
: 1.0f;
bool is_multi_channel = scale_weights_data.size() > 1;
bool has_activation = !fuse_activation.empty();
const auto& scale_out =
this->dev_ctx_.HasDnnAttr("Scale_out")
? PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Scale_out"))
: 1.0f;
float activation_scale =
(!force_fp32_output && has_activation) ? scale_out : 1.0f;
float scale_out_data =
(force_fp32_output || has_activation) ? 1.0f : scale_out;
float sum_scale =
fuse_residual_conn ? scale_out_data / scale_in_eltwise_data : 1.0f;
int count =
is_multi_channel
? (groups > 1 ? (weights_tz)[1] * (weights_tz)[0] : (weights_tz)[0])
: 1;
std::vector<float> output_shift_scale(count);
#pragma omp parallel for if (count > 50)
for (int i = 0; i < count; i++) {
if (scale_weights_data[i] == 0.0)
// weights data will contain 0 in some models, then weights
// scale couldn't be calculated
output_shift_scale[i] = scale_out_data;
else
output_shift_scale[i] =
static_cast<float>(static_cast<double>(scale_out_data) /
(static_cast<double>(scale_in_data) *
static_cast<double>(scale_weights_data[i])));
}
return std::make_tuple(sum_scale, output_shift_scale, activation_scale);
}
dnnl::primitive_attr CreateConvAttrs(const DenseTensor* filter,
int groups,
bool force_fp32_output,
......@@ -512,36 +396,30 @@ class ConvOneDNNHandlerT
dnnl::post_ops post_operations;
float sum_scale = 1.0f;
float activation_scale = 1.0f;
std::vector<float> output_shift_scale;
if (funcs::is_int8<T>()) {
if (this->dev_ctx_.HasDnnAttr("Sum_scale")) {
sum_scale =
PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr("Sum_scale"));
activation_scale =
this->dev_ctx_.HasDnnAttr("Activation_scale")
? PADDLE_GET_CONST(
float, this->dev_ctx_.GetDnnAttr("Activation_scale"))
: activation_scale;
output_shift_scale =
this->dev_ctx_.HasDnnAttr("Output_shift_scale")
? PADDLE_GET_CONST(
std::vector<float>,
this->dev_ctx_.GetDnnAttr("Output_shift_scale"))
: output_shift_scale;
} else {
std::tie(sum_scale, output_shift_scale, activation_scale) =
get_int8_scales(filter,
groups,
force_fp32_output,
fuse_residual_conn,
fuse_activation);
conv_attr.set_scales_mask(DNNL_ARG_SRC, 0);
auto wei_scales = ConvertToDNNLScales("Scale_weights");
// By oneDNN API definition:
// - For per-tensor quantization the mask should be 0.
// - For per-dimension quantization the mask should be 1 << dimension_index.
// Here, wei_scales.size() != 1 means per-channel quantization; the output
// channel index in oneDNN is always 0, so we use mask = 1 << 0. If the conv
// is grouped, the weights shape is [g, oc/g, ic, h, w] and we need to scale
// along both the group dim and the oc dim, so the mask is
// (1 << 0) + (1 << 1).
int mask = wei_scales.size() == 1
? 0
: (groups > 1 ? ((1 << 0) + (1 << 1)) : 1 << 0);
conv_attr.set_scales_mask(DNNL_ARG_WEIGHTS, mask);
if (!force_fp32_output) {
conv_attr.set_scales_mask(DNNL_ARG_DST, 0);
}
if (output_shift_scale.size() > 0) {
int mask = output_shift_scale.size() > 1 ? 1 << 1 : 0;
conv_attr.set_output_scales(mask, output_shift_scale);
}
auto psum_scales = ConvertToDNNLScales("Scale_in_eltwise");
sum_scale = psum_scales[0];
}
// Fusion with Elementwise layer relies on adding a sum post-operation with
......@@ -553,7 +431,7 @@ class ConvOneDNNHandlerT
post_operations.append_sum(sum_scale);
}
funcs::AppendActivation(this->dev_ctx_, post_operations, activation_scale);
funcs::AppendActivation(this->dev_ctx_, post_operations);
conv_attr.set_post_ops(post_operations);
return conv_attr;
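For reference, the weight-scale mask decision above can be read as a small helper: per-tensor quantization gets mask 0, per-output-channel quantization sets bit 0, and a grouped conv (weights laid out as [g, oc/g, ic, h, w]) additionally sets bit 1 for the group dimension. A hypothetical standalone version of that decision, not the Paddle code itself:

#include <cassert>
#include <cstddef>

// Returns the oneDNN scales mask for convolution weights.
// num_scales == 1 means per-tensor quantization; otherwise per-channel.
static int conv_weights_scales_mask(std::size_t num_scales, int groups) {
  if (num_scales == 1) return 0;           // per-tensor
  return groups > 1 ? (1 << 0) + (1 << 1)  // [g, oc/g, ...]: dims 0 and 1
                    : (1 << 0);            // [oc, ...]: dim 0 only
}

int main() {
  assert(conv_weights_scales_mask(1, 1) == 0);
  assert(conv_weights_scales_mask(64, 1) == 1);
  assert(conv_weights_scales_mask(64, 32) == 3);
  return 0;
}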
......@@ -750,6 +628,69 @@ class ConvOneDNNHandlerT
this->AcquireReorder(residual_memory_p, dst_memory_p);
return dst_memory_p;
}
// Currently, 4 kinds of oneDNN scales are supported: src scales, weight
// scales, post-sum scales and dst scales. This function converts Paddle
// scales to oneDNN scales.
std::vector<float> ConvertToDNNLScales(const std::string& attr_name) {
std::vector<float> paddle_scales;
// Weight scales are a vector; the other scales are scalars.
if (attr_name == "Scale_weights") {
paddle_scales =
this->dev_ctx_.HasDnnAttr(attr_name)
? PADDLE_GET_CONST(std::vector<float>,
this->dev_ctx_.GetDnnAttr(attr_name))
: std::vector<float>{1.0f};
} else {
float scale =
this->dev_ctx_.HasDnnAttr(attr_name)
? PADDLE_GET_CONST(float, this->dev_ctx_.GetDnnAttr(attr_name))
: 1.0f;
paddle_scales = std::vector<float>{scale};
}
size_t count = paddle_scales.size();
std::vector<float> dnnl_scales(count);
#pragma omp parallel for if (count > 50)
for (size_t i = 0; i < count; i++) {
dnnl_scales[i] = 1.f / paddle_scales[i];
}
return dnnl_scales;
}
std::shared_ptr<dnnl::memory> AcquireScalesMemory(int dnnl_arg) {
// <dnnl_arg, {cache_key_suffix, attr_name}>
std::unordered_map<int, std::pair<std::string, std::string>> map = {
{DNNL_ARG_SRC, {"@src_scales", "Scale_in"}},
{DNNL_ARG_WEIGHTS, {"@wei_scales", "Scale_weights"}},
{DNNL_ARG_DST, {"@dst_scales", "Scale_out"}},
};
std::string cache_key_suffix, attr_name;
std::tie(cache_key_suffix, attr_name) = map.at(dnnl_arg);
// first look up the cache
auto dnnl_scales_mem = this->AcquireMemory(cache_key_suffix);
if (!dnnl_scales_mem) {
// cache miss, so construct scales memory from the paddle scales
// attributes
auto dnnl_scales = ConvertToDNNLScales(attr_name);
dnnl::memory::desc dnnl_scales_md(
{static_cast<int64_t>(dnnl_scales.size())},
dnnl::memory::data_type::f32,
dnnl::memory::format_tag::x);
dnnl_scales_mem =
std::make_shared<dnnl::memory>(dnnl_scales_md, this->engine_);
memcpy(dnnl_scales_mem->get_data_handle(),
dnnl_scales.data(),
dnnl_scales.size() * sizeof(float));
// cache the constructed memory
this->CacheMemory(cache_key_suffix, dnnl_scales_mem);
}
return dnnl_scales_mem;
}
};
} // namespace onednn
......
......@@ -253,7 +253,7 @@ class ConvTransposeOneDNNHandlerT
dnnl::reorder::primitive_desc reorder_pdesc;
if (funcs::is_int8<T>()) {
dnnl::primitive_attr attr;
attr.set_output_scales(mask, scale_data);
attr.set_scales_mask(DNNL_ARG_DST, mask);
reorder_pdesc = dnnl::reorder::primitive_desc(
*user_memory_p, *target_memory_p, attr);
} else {
......@@ -264,9 +264,22 @@ class ConvTransposeOneDNNHandlerT
dev_ctx.SetBlob(key_reorder_p, reorder_p);
auto& astream = OneDNNContext::tls().get_stream();
reorder_p->execute(
astream,
{{DNNL_ARG_FROM, *user_memory_p}, {DNNL_ARG_TO, *target_memory_p}});
std::unordered_map<int, dnnl::memory> reorder_args;
reorder_args.insert({DNNL_ARG_SRC, *user_memory_p});
reorder_args.insert({DNNL_ARG_DST, *target_memory_p});
if (funcs::is_int8<T>()) {
auto scale_md =
dnnl::memory::desc({static_cast<int64_t>(scale_data.size())},
dnnl::memory::data_type::f32,
dnnl::memory::format_tag::x);
auto scale_data_mem = dnnl::memory(scale_md, this->engine_);
scale_data_mem.set_data_handle(
phi::funcs::to_void_cast(scale_data.data()));
reorder_args.insert(
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, scale_data_mem});
}
reorder_p->execute(astream, reorder_args);
astream.wait();
} else {
target_memory_p = user_memory_p;
......
......@@ -28,17 +28,21 @@ void DeQuantKernel(const Context& dev_ctx,
const float quantization_scale,
const float quantization_shift,
DenseTensor* out) {
const bool with_shift = quantization_shift != 0.0f;
PADDLE_ENFORCE(quantization_scale != 0.0f,
phi::errors::InvalidArgument(
"Dequantization scale must be different than 0.0f"));
PADDLE_ENFORCE(quantization_shift <= 255 && quantization_shift >= 0,
phi::errors::InvalidArgument(
"Dequantization shift must be lower or equal to ",
"255 and greater or equal to 0, but got %f",
quantization_shift));
const auto q_shift = static_cast<int32_t>(quantization_shift);
PADDLE_ENFORCE_GE(q_shift,
0,
phi::errors::InvalidArgument(
"Dequantization shift must be greater or equal to 0"));
PADDLE_ENFORCE_LE(q_shift,
255,
phi::errors::InvalidArgument(
"Dequantization shift must be lower or equal to 255"));
const bool with_shift = q_shift != 0;
auto x_tz = phi::vectorize<int64_t>(x.dims());
auto x_type = phi::funcs::ToOneDNNDataType(x.dtype());
......@@ -47,12 +51,10 @@ void DeQuantKernel(const Context& dev_ctx,
dnnl::primitive_attr attrs;
static constexpr int32_t mask = 0; // same shift and scale for whole tensor
const float reorder_scale = 1. / quantization_scale;
attrs.set_output_scales(mask, {reorder_scale});
attrs.set_scales_mask(DNNL_ARG_DST, mask);
if (with_shift) {
attrs.set_zero_points(
DNNL_ARG_SRC, mask, {static_cast<int32_t>(quantization_shift)});
attrs.set_zero_points_mask(DNNL_ARG_SRC, mask);
}
phi::funcs::ReorderOneDNNHandler reorder_handler(
......@@ -67,7 +69,29 @@ void DeQuantKernel(const Context& dev_ctx,
reorder_dst_memory_p, reorder_src_memory_p, attrs);
auto& astream = phi::OneDNNContext::tls().get_stream();
reorder_p->execute(astream, *reorder_src_memory_p, *reorder_dst_memory_p);
auto scales_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto scales_mem =
dnnl::memory(scales_md,
dev_ctx.GetEngine(),
phi::funcs::to_void_cast<float>(&quantization_scale));
auto zero_points_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::s32, dnnl::memory::format_tag::x);
auto zero_points_mem =
dnnl::memory(zero_points_md,
dev_ctx.GetEngine(),
phi::funcs::to_void_cast<int32_t>(&q_shift));
std::unordered_map<int, dnnl::memory> reorder_args;
reorder_args.insert({DNNL_ARG_SRC, *reorder_src_memory_p});
reorder_args.insert({DNNL_ARG_DST, *reorder_dst_memory_p});
reorder_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, scales_mem});
if (with_shift) {
reorder_args.insert(
{DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, zero_points_mem});
}
reorder_p->execute(astream, reorder_args);
astream.wait();
out->set_mem_desc(reorder_dst_memory_p->get_desc());
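The same execute-time pattern applies to zero points: v2's set_zero_points(arg, mask, {zp}) becomes set_zero_points_mask(arg, mask) on the attr plus an s32 memory passed under DNNL_ARG_ATTR_ZERO_POINTS | arg. A minimal u8-to-f32 dequantize-style sketch with only a source zero point (values illustrative, not the Paddle kernel):

#include <cstdint>
#include <vector>
#include "oneapi/dnnl/dnnl.hpp"

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);
  stream strm(eng);

  memory::desc src_md({4}, memory::data_type::u8, memory::format_tag::x);
  memory::desc dst_md({4}, memory::data_type::f32, memory::format_tag::x);
  std::vector<uint8_t> src_buf{128, 129, 130, 131};
  std::vector<float> dst_buf(4, 0.f);
  memory src(src_md, eng, src_buf.data());
  memory dst(dst_md, eng, dst_buf.data());

  primitive_attr attr;
  attr.set_zero_points_mask(DNNL_ARG_SRC, /*mask=*/0);  // one zp per tensor

  int32_t src_zp = 128;  // the "shift", illustrative value
  memory::desc zp_md({1}, memory::data_type::s32, memory::format_tag::x);
  memory zp_mem(zp_md, eng, &src_zp);

  auto pd = reorder::primitive_desc(src, dst, attr);
  reorder(pd).execute(strm,
                      {{DNNL_ARG_SRC, src},
                       {DNNL_ARG_DST, dst},
                       {DNNL_ARG_ATTR_ZERO_POINTS | DNNL_ARG_SRC, zp_mem}});
  strm.wait();  // dst_buf is now {0, 1, 2, 3}
  return 0;
}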
......
......@@ -49,14 +49,18 @@ inline void AddSubNonBroadcast(ReorderOneDNNHandler* reorder_handler,
phi::DenseTensor* grad_tensor,
const std::shared_ptr<dnnl::memory>& src_memory,
const std::shared_ptr<dnnl::memory>& dst_memory,
const std::vector<float>& scales) {
const dnnl::memory& scales_memory) {
dnnl::primitive_attr reorder_attr;
reorder_attr.set_output_scales(0, scales);
reorder_attr.set_scales_mask(DNNL_ARG_DST, 0);
auto reorder_p =
reorder_handler->AcquireReorder(dst_memory, src_memory, reorder_attr);
reorder_p->execute(
OneDNNContext::tls().get_stream(), *src_memory, *dst_memory);
std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC, *src_memory},
{DNNL_ARG_DST, *dst_memory},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_DST, scales_memory}};
auto& astream = OneDNNContext::tls().get_stream();
reorder_p->execute(astream, args);
}
template <typename T>
......@@ -73,7 +77,7 @@ inline void BroadcastReduction(const Place& place,
// Broadcasting
if (is_sub) {
dnnl::post_ops po;
po.append_eltwise(1.0f, dnnl::algorithm::eltwise_linear, scales[0], 0);
po.append_eltwise(dnnl::algorithm::eltwise_linear, scales[0], 0);
broadcast_reduction_attr.set_post_ops(po);
}
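append_eltwise lost its leading scale argument in oneDNN v3 (the old (scale, alg, alpha, beta) became (alg, alpha, beta)), which is all the one-line change above does. A minimal sketch of building such a post-op under v3:

#include "oneapi/dnnl/dnnl.hpp"

int main() {
  dnnl::post_ops po;
  // v2: po.append_eltwise(/*scale=*/1.0f, algorithm, alpha, beta);
  // v3: the scale parameter is gone; only algorithm, alpha and beta remain.
  po.append_eltwise(dnnl::algorithm::eltwise_linear, /*alpha=*/-1.0f,
                    /*beta=*/0.0f);  // y = -x, as used for subtraction grads
  dnnl::primitive_attr attr;
  attr.set_post_ops(po);
  return 0;
}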
......@@ -126,9 +130,9 @@ void ElementwiseGradKernel(const OneDNNContext& dev_ctx,
swap_x_y = true;
}
std::vector<float> scales{1.0};
float scale{1.0};
if (swap_x_y) {
scales[0] = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1;
scale = (BINARY_OP == dnnl::algorithm::binary_add) ? 1 : -1;
}
auto tz = phi::vectorize<int64_t>(dout.dims());
......@@ -143,6 +147,11 @@ void ElementwiseGradKernel(const OneDNNContext& dev_ctx,
std::shared_ptr<dnnl::memory> broadcast_src_memory = reorder_src_memory;
auto& astream = OneDNNContext::tls().get_stream();
auto scales_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto scales_mem = dnnl::memory(scales_md, onednn_engine);
auto scale_memory_buf = static_cast<float*>(scales_mem.get_data_handle());
*scale_memory_buf = scale;
if (dx) {
// elementwise_add & elementwise_sub
if (BINARY_OP == dnnl::algorithm::binary_add ||
......@@ -151,7 +160,7 @@ void ElementwiseGradKernel(const OneDNNContext& dev_ctx,
dst_memory = reorder_handler.AcquireDstMemory(
dx, dout.mem_desc(), dev_ctx.GetPlace());
AddSubNonBroadcast(
&reorder_handler, dx, reorder_src_memory, dst_memory, scales);
&reorder_handler, dx, reorder_src_memory, dst_memory, scales_mem);
}
} else { // elementwise_mul & elementwise_div
funcs::BinaryOneDNNHandler<T> binary_handler(BINARY_OP,
......@@ -176,7 +185,9 @@ void ElementwiseGradKernel(const OneDNNContext& dev_ctx,
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_dout_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_memory}};
{DNNL_ARG_DST, *dst_memory},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, scales_mem},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_1, scales_mem}};
binary_prim->execute(astream, args);
}
......@@ -189,7 +200,7 @@ void ElementwiseGradKernel(const OneDNNContext& dev_ctx,
&dout,
broadcast_src_memory,
dst_memory,
scales,
{scale},
BINARY_OP == dnnl::algorithm::binary_sub);
} else {
dx->set_mem_desc(dst_memory->get_desc());
......@@ -204,7 +215,7 @@ void ElementwiseGradKernel(const OneDNNContext& dev_ctx,
dst_memory = reorder_handler.AcquireDstMemory(
dy, dout.mem_desc(), dev_ctx.GetPlace());
AddSubNonBroadcast(
&reorder_handler, dy, reorder_src_memory, dst_memory, scales);
&reorder_handler, dy, reorder_src_memory, dst_memory, scales_mem);
}
} else { // elementwise_mul & elementwise_div
std::unordered_map<int, dnnl::memory> args;
......@@ -273,7 +284,9 @@ void ElementwiseGradKernel(const OneDNNContext& dev_ctx,
binary_prim = binary_handler.AcquireForwardPrimitive();
args = {{DNNL_ARG_SRC_0, *src_0_memory},
{DNNL_ARG_SRC_1, *src_1_memory},
{DNNL_ARG_DST, *dst_dy_memory}};
{DNNL_ARG_DST, *dst_dy_memory},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, scales_mem},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_1, scales_mem}};
if (BINARY_OP == dnnl::algorithm::binary_div)
args.insert({DNNL_ARG_ATTR_MULTIPLE_POST_OP(0) | DNNL_ARG_SRC_1,
......@@ -292,7 +305,7 @@ void ElementwiseGradKernel(const OneDNNContext& dev_ctx,
&dout,
broadcast_src_memory,
dst_memory,
scales,
{scale},
BINARY_OP == dnnl::algorithm::binary_sub);
} else {
dy->set_mem_desc(dst_memory->get_desc());
......
......@@ -110,10 +110,19 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx,
auto& astream = OneDNNContext::tls().get_stream();
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *src_x_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_memory}};
std::unordered_map<int, dnnl::memory> args = {{DNNL_ARG_SRC_0, *src_x_memory},
{DNNL_ARG_SRC_1, *src_y_memory},
{DNNL_ARG_DST, *dst_memory}};
if (handler.Has_SRC_0_Scale()) {
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0,
handler.Get_SRC_0_Scale_Memory()});
}
if (handler.Has_SRC_1_Scale()) {
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_1,
handler.Get_SRC_1_Scale_Memory()});
}
binary_prim->execute(astream, args);
astream.wait();
......@@ -121,7 +130,7 @@ void ElementwiseKernel(const OneDNNContext& dev_ctx,
auto out_md = dst_memory->get_desc();
if (handler.use_broadcasting_hack) {
auto dims = out_md.dims();
auto dims = out_md.get_dims();
dims.insert(dims.begin(), non_const_x->dims()[0]);
dims[1] /= dims[0];
out_md = out_md.reshape(dims);
......
......@@ -65,7 +65,9 @@ void ExpandKernel(const Context& dev_ctx,
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *dst_memory_p},
{DNNL_ARG_SRC_1, *src_memory_p},
{DNNL_ARG_DST, *dst_memory_p}};
{DNNL_ARG_DST, *dst_memory_p},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, handler.Get_Scale_Memory(0.0f)},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_1, handler.Get_Scale_Memory(1.0f)}};
auto& astream = OneDNNContext::tls().get_stream();
binary_p->execute(astream, args);
......
......@@ -34,18 +34,22 @@ class FillConstantOneDNNHandler
dnnl::memory::format_tag::ab);
dnnl::primitive_attr attrs;
attrs.set_scales(DNNL_ARG_SRC_0, /* mask = */ 0, {0.0f});
attrs.set_scales_mask(DNNL_ARG_SRC_0, /* mask = */ 0);
src1_md_ = dnnl::memory::desc({1, sizeof(T)},
OneDNNGetDataType<uint8_t>(),
dnnl::memory::format_tag::ab);
this->AcquireForwardPrimitiveDescriptor(
attrs, dnnl::algorithm::binary_add, src0_md, src1_md, src0_md);
dnnl::algorithm::binary_add, src0_md, src1_md_, src0_md, attrs);
}
static const dnnl::memory::desc src1_md;
const dnnl::memory::desc& get_src1_md() const { return src1_md_; }
private:
dnnl::memory::desc src1_md_;
};
template <typename T>
const dnnl::memory::desc FillConstantOneDNNHandler<T>::src1_md(
{1, sizeof(T)}, OneDNNGetDataType<uint8_t>(), dnnl::memory::format_tag::ab);
} // namespace funcs
template <typename T, typename Context>
......@@ -63,7 +67,7 @@ void FullKernel(const Context& dev_ctx,
out, onednn_engine, dev_ctx.GetPlace());
dnnl::memory constant_value_memory =
dnnl::memory(funcs::FillConstantOneDNNHandler<T>::src1_md,
dnnl::memory(handler.get_src1_md(),
onednn_engine,
reinterpret_cast<uint8_t*>(&fill_value));
......@@ -71,10 +75,19 @@ void FullKernel(const Context& dev_ctx,
auto fill_constant_p = handler.AcquireForwardPrimitive();
auto& astream = OneDNNContext::tls().get_stream();
fill_constant_p->execute(astream,
{{DNNL_ARG_SRC_0, *src0_memory_p},
{DNNL_ARG_SRC_1, constant_value_memory},
{DNNL_ARG_DST, *src0_memory_p}});
std::vector<float> zero(1, 0);
auto scales_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto scales = dnnl::memory(scales_md, onednn_engine, zero.data());
std::unordered_map<int, dnnl::memory> args;
args.insert({DNNL_ARG_SRC_0, *src0_memory_p});
args.insert({DNNL_ARG_SRC_1, constant_value_memory});
args.insert({DNNL_ARG_DST, *src0_memory_p});
args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0, scales});
fill_constant_p->execute(astream, args);
astream.wait();
// src0_memory_p's md was just to allow the usage of a binary
......
......@@ -24,18 +24,21 @@ namespace phi {
template <typename T>
class LogSoftmaxOneDNNHandler
: public funcs::OneDNNHandlerNoCachingT<T, dnnl::logsoftmax_forward> {
: public funcs::OneDNNHandlerNoCachingT<T, dnnl::softmax_forward> {
public:
LogSoftmaxOneDNNHandler(const dnnl::engine onednn_engine,
Place cpu_place,
const DenseTensor& x,
const int axis)
: funcs::OneDNNHandlerNoCachingT<T, dnnl::logsoftmax_forward>(
onednn_engine, cpu_place) {
: funcs::OneDNNHandlerNoCachingT<T, dnnl::softmax_forward>(onednn_engine,
cpu_place) {
const int rank = x.dims().size() != 0 ? x.dims().size() : 1;
const int canonical_axis = funcs::CanonicalAxis(axis, rank);
this->AcquireForwardPrimitiveDescriptor(
dnnl::prop_kind::forward_inference, x.mem_desc(), canonical_axis);
this->AcquireForwardPrimitiveDescriptor(dnnl::prop_kind::forward_inference,
dnnl::algorithm::softmax_log,
x.mem_desc(),
x.mem_desc(),
canonical_axis);
}
};
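In v3 the dedicated logsoftmax_forward primitive is folded into softmax_forward and selected via algorithm::softmax_log; the primitive descriptor is also created directly (engine first, no intermediate op desc). A minimal sketch of the v3 construction (shapes illustrative):

#include "oneapi/dnnl/dnnl.hpp"

int main() {
  using namespace dnnl;
  engine eng(engine::kind::cpu, 0);

  memory::desc md({2, 8}, memory::data_type::f32, memory::format_tag::ab);

  // v2: logsoftmax_forward::desc(...) + primitive_desc(desc, engine)
  // v3: one call, with the log variant picked by the algorithm kind.
  auto pd = softmax_forward::primitive_desc(eng,
                                            prop_kind::forward_inference,
                                            algorithm::softmax_log,
                                            /*src_desc=*/md,
                                            /*dst_desc=*/md,
                                            /*softmax_axis=*/1);
  (void)pd;
  return 0;
}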
......
......@@ -211,10 +211,14 @@ class MulPrimitiveFactory {
const std::vector<float> &scale) {
auto mask = scale.size() > 1 ? 1 : 0;
dnnl::primitive_attr attr;
attr.set_output_scales(mask, scale);
attr.set_scales_mask(DNNL_ARG_SRC, mask);
auto src_mem = memory(src_desc, engine_, src_data);
auto dst_mem = memory(dst_desc, engine_);
auto scales_md = dnnl::memory::desc(
{1}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
auto scales_mem = dnnl::memory(
scales_md, engine_, funcs::to_void_cast<float>(scale.data()));
auto reorder_pd = dnnl::reorder::primitive_desc(src_mem, dst_mem, attr);
......@@ -222,7 +226,11 @@ class MulPrimitiveFactory {
auto &astream = OneDNNContext::tls().get_stream();
{
reorder.execute(astream, src_mem, dst_mem);
std::unordered_map<int, dnnl::memory> reorder_args;
reorder_args.insert({DNNL_ARG_SRC, src_mem});
reorder_args.insert({DNNL_ARG_DST, dst_mem});
reorder_args.insert({DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC, scales_mem});
reorder.execute(astream, reorder_args);
astream.wait();
}
......@@ -230,9 +238,7 @@ class MulPrimitiveFactory {
}
memory QuantInputY(memory input_y, const std::vector<float> &scale_y) {
const auto &dims = input_y.get_desc().data.dims;
auto ndims = input_y.get_desc().data.ndims;
auto y_dims = std::vector<int64_t>(dims, dims + ndims);
auto y_dims = input_y.get_desc().get_dims();
auto user_y_desc =
CreateMemDescriptor<YT>(y_dims, funcs::OneDNNMemoryFormat::oi);
......@@ -272,7 +278,13 @@ class MulPrimitiveFactory {
scale_out_data / (scale_x_data * scale_y_data[i]);
}
int mul_mask = is_multi_channel ? 1 : 0;
mul_attr.set_output_scales(mul_mask, output_shift_scale);
mul_attr.set_scales_mask(DNNL_ARG_WEIGHTS, mul_mask);
auto scales_md = dnnl::memory::desc(
{count}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::x);
scales_mem_ = dnnl::memory(scales_md, engine_);
auto mem_buf = scales_mem_.get_data_handle();
memcpy(mem_buf, output_shift_scale.data(), count * sizeof(float));
return mul_attr;
}
......@@ -286,19 +298,17 @@ class MulPrimitiveFactory {
const auto y_desc = y_memory.get_desc();
inner_product_forward::primitive_desc mul_prim_desc;
const auto &mul_desc = inner_product_forward::desc(
prop_kind::forward, x_desc, y_desc, dst_desc);
if (is_int8_) {
bool force_fp32_output =
dev_ctx.HasDnnAttr("force_fp32_output")
? PADDLE_GET_CONST(bool, dev_ctx.GetDnnAttr("force_fp32_output"))
: false;
auto mul_attr = CreateMulAttr(dev_ctx, force_fp32_output);
mul_prim_desc =
inner_product_forward::primitive_desc(mul_desc, mul_attr, engine_);
mul_prim_desc = inner_product_forward::primitive_desc(
engine_, prop_kind::forward, x_desc, y_desc, dst_desc, mul_attr);
} else {
mul_prim_desc = inner_product_forward::primitive_desc(mul_desc, engine_);
mul_prim_desc = inner_product_forward::primitive_desc(
engine_, prop_kind::forward, x_desc, y_desc, dst_desc);
}
output_ = CreateDstMemory(mul_prim_desc, dev_ctx, output);
......@@ -308,10 +318,12 @@ class MulPrimitiveFactory {
void Execute() {
auto &astream = OneDNNContext::tls().get_stream();
(*mul_).execute(astream,
{{DNNL_ARG_SRC, *x_input_},
{DNNL_ARG_WEIGHTS, *y_input_},
{DNNL_ARG_DST, *output_}});
{DNNL_ARG_DST, *output_},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_WEIGHTS, scales_mem_}});
astream.wait();
}
......@@ -428,6 +440,7 @@ class MulPrimitiveFactory {
paddle::optional<memory> output_;
paddle::optional<inner_product_forward> mul_;
static constexpr bool is_int8_ = funcs::is_int8<XT>();
dnnl::memory scales_mem_;
};
/* OT: output data type */
......@@ -511,9 +524,12 @@ void MatmulWithFlattenKernelINT8(const Context &dev_ctx,
out->Resize(out_dims);
}
auto in_md = memory::desc(*dnnl_primitive_desc_query_md(
mul.get_primitive_desc(), dnnl_query_dst_md, 0));
out->set_mem_desc(in_md.reshape(vectorize<int64_t>(out->dims())));
auto in_md = dnnl_primitive_desc_query_md(
mul.get_primitive_desc(), dnnl_query_dst_md, 0);
dnnl_memory_desc_t cloned_in_md = nullptr;
dnnl_memory_desc_clone(&cloned_in_md, in_md);
out->set_mem_desc(
memory::desc(cloned_in_md).reshape(vectorize<int64_t>(out->dims())));
}
template <typename T, typename Context>
......
......@@ -143,7 +143,11 @@ void ReduceGradKernel(const Context& dev_ctx,
const std::unordered_map<int, dnnl::memory> args = {
{DNNL_ARG_SRC_0, *dst_memory_p},
{DNNL_ARG_SRC_1, *src_memory_p},
{DNNL_ARG_DST, *dst_memory_p}};
{DNNL_ARG_DST, *dst_memory_p},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_0,
handler.Get_Scale_Memory(scale_x)},
{DNNL_ARG_ATTR_SCALES | DNNL_ARG_SRC_1,
handler.Get_Scale_Memory(scale_y)}};
auto& astream = OneDNNContext::tls().get_stream();
binary_prim->execute(astream, args);
......
......@@ -120,7 +120,7 @@ void ExecuteReshape(const Context& dev_ctx,
const DDim& x_dims,
DenseTensor* out) {
auto out_dims = ValidateShape(shape.GetData(), x_dims);
auto x_vec_dims = x.mem_desc().dims();
auto x_vec_dims = x.mem_desc().get_dims();
funcs::ReorderOneDNNHandler reorder_handler(
x_vec_dims,
......
......@@ -23,12 +23,12 @@ const std::vector<int64_t> get_slice_strides(
const std::vector<int64_t>& out_vec_dims,
const dnnl::memory::desc& full_md,
int axis) {
auto strides = full_md.data.format_desc.blocking.strides;
auto ndims = full_md.data.ndims;
auto full_dims = full_md.data.dims;
auto strides = full_md.get_strides();
auto ndims = full_md.get_dims().size();
auto full_dims = full_md.get_dims();
auto splitted_stride = strides[axis];
std::vector<int64_t> slice_strides(ndims, splitted_stride);
for (int16_t i = 0; i < ndims; ++i) {
for (size_t i = 0; i < ndims; ++i) {
slice_strides[i] = strides[i] > splitted_stride
? (strides[i] / full_dims[axis]) * out_vec_dims[axis]
: strides[i];
......
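Many of the mechanical edits in this commit come from oneDNN v3 removing the public memory::desc::data C struct; dims, strides and data type are now reached through getters, as in the slice-strides hunk above. A short sketch of the renamed queries (shapes illustrative):

#include <iostream>
#include "oneapi/dnnl/dnnl.hpp"

int main() {
  using namespace dnnl;
  memory::desc md({2, 3, 4}, memory::data_type::f32, memory::format_tag::abc);

  auto dims    = md.get_dims();       // v2: md.dims() / md.data.dims
  auto ndims   = dims.size();         // v2: md.data.ndims
  auto dtype   = md.get_data_type();  // v2: md.data_type()
  auto strides = md.get_strides();    // v2: md.data.format_desc.blocking.strides

  std::cout << ndims << " dims, innermost stride " << strides.back()
            << ", f32? " << (dtype == memory::data_type::f32) << "\n";
  return 0;
}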
......@@ -73,16 +73,6 @@ class StackOneDNNHandler : public OneDNNHandlerNoCachingT<T, dnnl::concat> {
this->AcquireForwardPrimitiveDescriptor(dst_md, stack_axis, srcs_md);
}
// concat oneDNN prim is not having .desc attribute so we cannot use default
// AcquireForwardPrimitiveDescriptor
void AcquireForwardPrimitiveDescriptor(
const memory::desc& dst_md,
const int stack_axis,
const std::vector<memory::desc>& srcs_md) {
this->fwd_pd_.reset(new dnnl::concat::primitive_desc(
dst_md, stack_axis, srcs_md, this->engine_));
}
std::shared_ptr<dnnl::memory> AcquireSrcMemory(const DenseTensor& input,
int i) {
const T* input_data = input.data<T>();
......
......@@ -73,8 +73,8 @@ void TransposeKernel(const Context& dev_ctx,
x.mem_desc(), funcs::to_void_cast(x.data<T>()));
auto fake_strides = funcs::FakeTransposeStrides(x_vec_dims, axis);
auto dst_md =
dnnl::memory::desc(x_vec_dims, x.mem_desc().data_type(), fake_strides);
auto dst_md = dnnl::memory::desc(
x_vec_dims, x.mem_desc().get_data_type(), fake_strides);
auto reorder_dst_memory_p =
reorder_handler.AcquireDstMemory(out, dst_md, dev_ctx.GetPlace());
auto reorder_p = reorder_handler.AcquireReorder(reorder_dst_memory_p,
......
......@@ -47,8 +47,6 @@ env_dict={
'PSLIB_VERSION_PY':'@PSLIB_VERSION_PY@',
'WITH_MKLDNN':'@WITH_MKLDNN@',
'MKLDNN_SHARED_LIB':'@MKLDNN_SHARED_LIB@',
'MKLDNN_SHARED_LIB_1':'@MKLDNN_SHARED_LIB_1@',
'MKLDNN_SHARED_LIB_2':'@MKLDNN_SHARED_LIB_2@',
'MKLDNN_INSTALL_DIR':'@MKLDNN_INSTALL_DIR@',
'WITH_ONNXRUNTIME':'@WITH_ONNXRUNTIME@',
'ONNXRUNTIME_SHARED_LIB':'@ONNXRUNTIME_SHARED_LIB@',
......
......@@ -654,9 +654,7 @@ if '${WITH_MKLDNN}' == 'ON':
raise Exception("patch libdnnl.so failed, command: %s" % command)
shutil.copy('${MKLDNN_SHARED_LIB}', libs_path)
if os.name != 'nt':
shutil.copy('${MKLDNN_SHARED_LIB_1}', libs_path)
shutil.copy('${MKLDNN_SHARED_LIB_2}', libs_path)
package_data['paddle.libs']+=['libmkldnn.so.0', 'libdnnl.so.1', 'libdnnl.so.2']
package_data['paddle.libs']+=['libdnnl.so.3']
else:
package_data['paddle.libs']+=['mkldnn.dll']
......
......@@ -137,7 +137,7 @@ if '${WITH_MKL}' == 'ON':
cinnlibs.append('${MKLML_IOMP_LIB}')
if '${WITH_MKLDNN}' == 'ON':
cinnlibs.append('${MKLDNN_SHARED_LIB_2}')
cinnlibs.append('${MKLDNN_SHARED_LIB}')
if '${WITH_GPU}' == 'ON':
cinnlibs.append('${CMAKE_BINARY_DIR}/dist/cinn/include/paddle/cinn/runtime/cuda/cinn_cuda_runtime_source.cuh')
......
......@@ -1088,13 +1088,7 @@ def get_package_data_and_package_dir():
)
shutil.copy(env_dict.get("MKLDNN_SHARED_LIB"), libs_path)
if os.name != 'nt':
shutil.copy(env_dict.get("MKLDNN_SHARED_LIB_1"), libs_path)
shutil.copy(env_dict.get("MKLDNN_SHARED_LIB_2"), libs_path)
package_data['paddle.libs'] += [
'libmkldnn.so.0',
'libdnnl.so.1',
'libdnnl.so.2',
]
package_data['paddle.libs'] += ['libdnnl.so.3']
else:
package_data['paddle.libs'] += ['mkldnn.dll']
......
......@@ -183,7 +183,7 @@ if(WITH_MKL)
if(WIN32)
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/mkldnn.lib)
else()
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libmkldnn.so.0)
set(MKLDNN_LIB ${MKLDNN_PATH}/lib/libdnnl.so.3)
endif()
endif()
else()
......
......@@ -41,6 +41,7 @@ class TestConv2DInt8Op(TestConv2DOp):
self.mkldnn_data_type = "int8"
self.weighttype = np.float32
self.use_mkldnn = True
self.init_weight_quantization_type()
self.init_group()
self.init_dilation()
self.init_test_case()
......@@ -181,8 +182,9 @@ class TestConv2DInt8Op(TestConv2DOp):
def test_check_output(self):
# TODO(wangzhongpu): support mkldnn op in dygraph mode
# the atol for integer tests should be 1
self.check_output_with_place(
core.CPUPlace(), atol=0, check_dygraph=False
core.CPUPlace(), atol=1, check_dygraph=False
)
def test_check_grad(self):
......@@ -202,9 +204,16 @@ class TestConv2DInt8Op(TestConv2DOp):
self.filter_size = [2, f_c, 3, 3]
self.scale_in = 0.95
self.scale_out = 0.5
self.scale_weights = [10.0]
self.scale_weights = (
[10.0] * self.filter_size[0]
if self.per_channel_quantize_weight
else [10.0]
)
self.scale_in_eltwise = 0.6
def init_weight_quantization_type(self):
self.per_channel_quantize_weight = False
def init_data_type(self):
self.srctype = np.uint8
self.dsttype = np.int8
......@@ -239,15 +248,15 @@ class TestConv2D(TestConv2DInt8Op):
class TestWithHardSwish(TestConv2D):
def init_fuse_activation(self):
self.fuse_activation = "hard_swish"
self.fuse_alpha = 0
self.fuse_beta = 0
self.fuse_alpha = 1.0 / 6.0
self.fuse_beta = 1.0 / 2.0
class TestWithRelu6(TestConv2D):
def init_fuse_activation(self):
self.fuse_activation = "relu6"
self.fuse_alpha = 6
self.fuse_beta = 0
self.fuse_alpha = 0
self.fuse_beta = 6
class TestWithSwish(TestConv2D):
......@@ -350,6 +359,34 @@ def init_data_type_with_fusion(self, input_dt, fuse_activation, fuse_residual):
self.fuse_residual = fuse_residual
class TestDepthwiseConv2d(TestConv2D):
def init_test_case(self):
self.pad = [1, 1]
self.stride = [1, 1]
self.input_size = [1, 32, 112, 112]
self.input_residual_size = [1, 32, 112, 112]
assert np.mod(self.input_size[1], self.groups) == 0
f_c = self.input_size[1] // self.groups
self.filter_size = [32, f_c, 3, 3]
self.scale_in = 0.95
self.scale_out = 0.5
self.scale_weights = (
[10.0] * self.filter_size[0]
if self.per_channel_quantize_weight
else [10.0]
)
self.scale_in_eltwise = 0.8
def init_group(self):
self.groups = 32
def init_weight_quantization_type(self):
self.per_channel_quantize_weight = True
def init_fuse_residual(self):
self.fuse_residual = False
def create_test_int8_class(parent):
# --------------------test conv2d s8 in and u8 out--------------------
class TestS8U8Case(parent):
......
......@@ -92,7 +92,7 @@ class TestConv2DMKLDNNOp(TestConv2DOp):
output = np.maximum(output, 0).astype(self.dsttype)
if self.fuse_activation == "relu6":
output = np.minimum(np.maximum(output, 0), self.fuse_alpha).astype(
output = np.minimum(np.maximum(output, 0), self.fuse_beta).astype(
self.dsttype
)
if (
......@@ -120,7 +120,7 @@ class TestWithbreluFusion(TestConv2DMKLDNNOp):
def init_test_case(self):
TestConv2DMKLDNNOp.init_test_case(self)
self.fuse_activation = "relu6"
self.fuse_alpha = 6.0
self.fuse_beta = 6.0
self.dsttype = np.float32
......
......@@ -24,15 +24,21 @@ class TestFCINT8OneDNNOp(OpTest):
self.op_type = "fc"
self._cpu_only = True
self.configure()
self.set_shape()
self.generate_data()
self.set_inputs()
y_scales_size = (
self.bias_shape if self.per_channel_quantize_weight else 1
)
self.attrs = {
'use_mkldnn': True,
'Scale_in': self.x_scale,
'Scale_weights': [self.y_scale],
'Scale_weights': [self.y_scale] * y_scales_size,
'Scale_out': self.out_scale,
'force_fp32_output': self.force_fp32_output,
'in_num_col_dims': self.in_num_col_dims,
}
if self.force_fp32_output:
......@@ -45,6 +51,13 @@ class TestFCINT8OneDNNOp(OpTest):
def configure(self):
self.use_bias = True
self.force_fp32_output = False
self.in_num_col_dims = 1
self.per_channel_quantize_weight = False
def set_shape(self):
self.input_shape = (10, 5)
self.weight_shape = (5, 10)
self.bias_shape = 10
def set_inputs(self):
self.inputs = {'Input': self.x, 'W': self.y_float, 'Bias': self.bias}
......@@ -55,15 +68,26 @@ class TestFCINT8OneDNNOp(OpTest):
return scale, quantized
def generate_data(self):
self.x_float = np.random.random((10, 5)).astype("float32") * 10
self.x_float = np.random.random(self.input_shape).astype("float32") * 10
self.x_scale, self.x = self.quantize(self.x_float)
self.y_float = np.random.random((5, 10)).astype("float32") * 10
self.y_float = (
np.random.random(self.weight_shape).astype("float32") * 10
)
self.y_scale, self.y = self.quantize(self.y_float)
self.out_float = np.dot(self.x_float, self.y_float)
flatten_shape = [1, 1]
for i in range(len(self.input_shape)):
if i < self.in_num_col_dims:
flatten_shape[0] *= self.input_shape[i]
else:
flatten_shape[1] *= self.input_shape[i]
self.out_float = np.dot(
self.x_float.reshape(flatten_shape), self.y_float
)
if self.use_bias:
self.bias = np.random.random(10).astype("float32") * 10
self.bias = np.random.random(self.bias_shape).astype("float32") * 10
self.out_float += self.bias
self.out_scale, self.out = self.quantize(self.out_float)
......@@ -77,6 +101,8 @@ class TestFCINT8NoBiasOneDNNOp(TestFCINT8OneDNNOp):
def configure(self):
self.use_bias = False
self.force_fp32_output = False
self.in_num_col_dims = 1
self.per_channel_quantize_weight = False
def set_inputs(self):
self.inputs = {
......@@ -89,6 +115,21 @@ class TestFCINT8ForceFP32OutputOneDNNOp(TestFCINT8NoBiasOneDNNOp):
def configure(self):
self.use_bias = False
self.force_fp32_output = True
self.in_num_col_dims = 1
self.per_channel_quantize_weight = False
class TestFCINT8ForceFP32OutputPerChannelWeightOneDNNOp(TestFCINT8OneDNNOp):
def configure(self):
self.use_bias = True
self.force_fp32_output = True
self.in_num_col_dims = 1
self.per_channel_quantize_weight = True
def set_shape(self):
self.input_shape = (1, 8, 1, 1)
self.weight_shape = (8, 10)
self.bias_shape = 10
if __name__ == "__main__":
......
Subproject commit 2089770c4818be8933c5e9d1dd3cbaeba1457667
Subproject commit 64f6bcbcbab628e96f33a62c3e975f8535a7bde4