Commit d419b859 authored by joanna.wozna.intel, committed by Tao Luo

Add reshape int8 mkldnn op (#21428)

* Add reshape int8 op

test=develop

* Change test to CPUPlace

test=develop

* Correct tests

test=develop
Parent c047e713
@@ -1168,6 +1168,27 @@ PDNode *patterns::Transpose::operator()() {
  return transpose_out;
}

PDNode *patterns::Reshape::operator()() {
  auto prev_op = pattern->NewNode(prev_op_repr())->assert_is_op();

  auto reshape_op =
      pattern->NewNode(reshape_op_repr())->assert_is_op("reshape2");

  auto reshape_in = pattern->NewNode(reshape_in_repr())
                        ->AsInput()
                        ->assert_is_op_input("reshape2", "X");
  auto reshape_out = pattern->NewNode(reshape_out_repr())
                         ->AsOutput()
                         ->assert_is_op_output("reshape2", "Out");

  auto next_op = pattern->NewNode(next_op_repr())->assert_is_op();

  prev_op->LinksTo({reshape_in});
  reshape_op->LinksFrom({reshape_in}).LinksTo({reshape_out});
  next_op->LinksFrom({reshape_out});
  return reshape_out;
}

PDNode *patterns::ConvResidual::operator()(bool with_residual_data) {
  auto conv_op = pattern->NewNode(conv_op_repr())->assert_is_op("conv2d");
......
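For orientation: the new pattern matches the five-node chain prev_op -> reshape_in -> reshape2 -> reshape_out -> next_op. Below is a minimal, hypothetical sketch of how a pass consumes such a pattern, mirroring the QuantizeReshape handler added further down (the name-scope string is illustrative):

  GraphPatternDetector gpd;
  patterns::Reshape reshape_pattern{gpd.mutable_pattern(), "my_scope"};
  reshape_pattern();  // wires prev_op -> reshape_in -> reshape2 -> reshape_out -> next_op
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph, Graph* g) {
    // Bind matched IR nodes by the names declared with PATTERN_DECL_NODE.
    GET_IR_NODE_FROM_SUBGRAPH(reshape_op, reshape_op, reshape_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(reshape_in, reshape_in, reshape_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(reshape_out, reshape_out, reshape_pattern);
    // ... inspect or rewrite the matched subgraph here ...
  };
  gpd(graph, handler);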
@@ -749,6 +749,21 @@ struct Transpose : public PatternBase {
  PATTERN_DECL_NODE(next_op);
};

// Reshape op
// Forward pass for reshape.
// reshape_out is a result of the operator.
struct Reshape : public PatternBase {
  Reshape(PDPattern* pattern, const std::string& name_scope)
      : PatternBase(pattern, name_scope, "reshape2") {}

  PDNode* operator()();

  PATTERN_DECL_NODE(prev_op);
  PATTERN_DECL_NODE(reshape_in);
  PATTERN_DECL_NODE(reshape_op);
  PATTERN_DECL_NODE(reshape_out);
  PATTERN_DECL_NODE(next_op);
};

// Concat op
// Forward pass for concat.
// concat_out is a result of the operator.
......
@@ -181,9 +181,7 @@ void CPUQuantizePass::QuantizeConv(Graph* graph,
    auto* conv_op_desc = conv_op->Op();
    // skip if should not be quantized
-   if (!conv_op_desc->HasAttr("use_quantizer") ||
-       !boost::get<bool>(conv_op_desc->GetAttr("use_quantizer")))
-     return;
+   if (!conv_op_desc->GetAttrIfExists<bool>("use_quantizer")) return;
    GET_IR_NODE_FROM_SUBGRAPH(conv_filter, conv_filter, conv_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(conv_input, conv_input, conv_pattern);
@@ -317,9 +315,7 @@ void CPUQuantizePass::QuantizePool(Graph* graph) const {
    auto* pool_op_desc = pool_op->Op();
    // skip if should not be quantized
-   if (!pool_op_desc->HasAttr("use_quantizer") ||
-       !boost::get<bool>(pool_op_desc->GetAttr("use_quantizer")))
-     return;
+   if (!pool_op_desc->GetAttrIfExists<bool>("use_quantizer")) return;
    GET_IR_NODE_FROM_SUBGRAPH(pool_input, pool_input, pool_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(pool_output, pool_output, pool_pattern);
@@ -359,9 +355,7 @@ void CPUQuantizePass::QuantizeConcat(Graph* graph) const {
    auto* concat_op_desc = concat_op->Op();
    // skip if should not be quantized
-   if (!concat_op_desc->HasAttr("use_quantizer") ||
-       !boost::get<bool>(concat_op_desc->GetAttr("use_quantizer")))
-     return;
+   if (!concat_op_desc->GetAttrIfExists<bool>("use_quantizer")) return;
    GET_IR_NODE_FROM_SUBGRAPH(concat_out, concat_out, concat_pattern);
@@ -401,9 +395,7 @@ void CPUQuantizePass::QuantizePriorBox(Graph* graph) const {
    auto* prior_box_op_desc = prior_box_op->Op();
    // skip if should not be quantized
-   if (!prior_box_op_desc->HasAttr("use_quantizer") ||
-       !boost::get<bool>(prior_box_op_desc->GetAttr("use_quantizer")))
-     return;
+   if (!prior_box_op_desc->GetAttrIfExists<bool>("use_quantizer")) return;
    GET_IR_NODE_FROM_SUBGRAPH(prior_box_input, prior_box_input,
                              prior_box_pattern);
@@ -446,20 +438,18 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
    GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, transpose_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, transpose_pattern);
-   // skip if prev op is not quantized
-   // in future we should check if next_op is quantized
-   // transpose INT8 should be used only between INT8 operators
+   // skip if neither prev op nor next op is quantized
    if (!(prev_op->Op()->Type() == "dequantize" ||
-         (prev_op->Op()->GetAttrIfExists<bool>("use_quantizer")))) {
+         (prev_op->Op()->GetAttrIfExists<bool>("use_quantizer"))) &&
+       !(next_op->Op()->Type() == "quantize" ||
+         (next_op->Op()->GetAttrIfExists<bool>("use_quantizer")))) {
      return;
    }
    GET_IR_NODE_FROM_SUBGRAPH(transpose_in, transpose_in, transpose_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(transpose_out, transpose_out, transpose_pattern);
    // get scales calculated after warmup; they scale variables to MAX=1.0
    auto scales = Get<VarQuantScale>("quant_var_scales");
    auto input_scale = scales[transpose_in->Name()].second.data<double>()[0];
    bool is_input_unsigned = scales[transpose_in->Name()].first;
    QuantizeInput(g, transpose_op, transpose_in, "X", input_scale,
@@ -480,6 +470,58 @@ void CPUQuantizePass::QuantizeTranspose(Graph* graph) const {
                  quantize_transpose_count);
  }
void CPUQuantizePass::QuantizeReshape(Graph* graph) const {
  GraphPatternDetector gpd;
  auto pattern = gpd.mutable_pattern();
  patterns::Reshape reshape_pattern{pattern, name_scope_};
  reshape_pattern();

  int quantize_reshape_count = 0;
  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
                     Graph* g) {
    VLOG(4) << "Quantize reshape op";
    GET_IR_NODE_FROM_SUBGRAPH(reshape_op, reshape_op, reshape_pattern);
    auto* reshape_op_desc = reshape_op->Op();

    // skip if should not be quantized
    if (!reshape_op_desc->GetAttrIfExists<bool>("use_quantizer")) {
      return;
    }
    GET_IR_NODE_FROM_SUBGRAPH(prev_op, prev_op, reshape_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(next_op, next_op, reshape_pattern);

    // skip if neither prev op nor next op is quantized
    if (!(prev_op->Op()->Type() == "dequantize" ||
          (prev_op->Op()->GetAttrIfExists<bool>("use_quantizer"))) &&
        !(next_op->Op()->Type() == "quantize" ||
          (next_op->Op()->GetAttrIfExists<bool>("use_quantizer")))) {
      return;
    }

    GET_IR_NODE_FROM_SUBGRAPH(reshape_in, reshape_in, reshape_pattern);
    GET_IR_NODE_FROM_SUBGRAPH(reshape_out, reshape_out, reshape_pattern);

    // get scales calculated after warmup; they scale variables to MAX=1.0
    auto scales = Get<VarQuantScale>("quant_var_scales");

    auto input_scale = scales[reshape_in->Name()].second.data<double>()[0];
    bool is_input_unsigned = scales[reshape_in->Name()].first;
    QuantizeInput(g, reshape_op, reshape_in, "X", input_scale,
                  is_input_unsigned);

    auto output_scale = scales[reshape_out->Name()].second.data<double>()[0];
    bool is_output_unsigned = scales[reshape_out->Name()].first;
    DequantizeOutput(g, reshape_op, reshape_out, "Out", output_scale,
                     is_output_unsigned);

    ++quantize_reshape_count;
  };
  gpd(graph, handler);
  AddStatis(quantize_reshape_count);

  PrettyLogDetail("--- quantized %d reshape ops", quantize_reshape_count);
}
void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
  VLOG(3) << "Quantizing the graph.";
  PADDLE_ENFORCE(graph);
@@ -494,6 +536,7 @@ void CPUQuantizePass::ApplyImpl(ir::Graph* graph) const {
  QuantizePriorBox(graph);
  QuantizeTranspose(graph);
  QuantizeFc(graph);
  QuantizeReshape(graph);
}
} // namespace ir
......
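The net effect of the new QuantizeReshape pass on a matched subgraph, sketched informally (four nodes are added per quantized reshape — a quantize op, its output variable, a dequantize op, and its input variable — matching the "1 Quant + 1 IN + 1 DeQuant + 1 OUT" accounting in the tests below):

  before: prev_op -> reshape_in -> reshape2 -> reshape_out -> next_op
  after:  prev_op -> reshape_in -> quantize -> (new INT8 var) -> reshape2
                  -> (new INT8 var) -> dequantize -> reshape_out -> next_op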
@@ -56,6 +56,8 @@ class CPUQuantizePass : public FusePassBase {
  void QuantizeTranspose(Graph* graph) const;

  void QuantizeReshape(Graph* graph) const;

  void QuantizeInput(Graph* g, Node* op, Node* input, std::string input_name,
                     double scale_to_one, bool is_unsigned,
                     std::string scale_attr_name = "") const;
......
@@ -50,7 +50,7 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
    op->SetAttr("Scale_in", 1.0f);
    op->SetAttr("Scale_out", 1.0f);
    op->SetAttr("Scale_weights", std::vector<float>{1.0f});
- } else if (type == "pool2d" || type == "transpose2") {
+ } else if (type == "pool2d" || type == "transpose2" || type == "reshape2") {
    op->SetInput("X", {inputs[0]});
    op->SetOutput("Out", {outputs[0]});
    op->SetAttr("use_quantizer", use_quantizer);
@@ -70,9 +70,48 @@ void SetOp(ProgramDesc* prog, const std::string& type, const std::string& name,
    op->SetInput("X", inputs);
    op->SetOutput("Out", outputs);
    op->SetAttr("use_quantizer", use_quantizer);
  } else if (type == "dequantize") {
    op->SetInput("Input", {inputs[0]});
    op->SetOutput("Output", {outputs[0]});
    op->SetAttr("Scale", 1.0f);
  }
}
void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
                      const char* var_name) {
  auto x = scope->Var(var_name);
  auto tensor = x->GetMutable<LoDTensor>();
  tensor->mutable_data(place, proto::VarType::FP32, 1);
}

void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
                 const std::initializer_list<std::string> variable_names,
                 int* original_nodes_num, int* current_nodes_num) {
  auto place = paddle::platform::CPUPlace();
  NaiveExecutor exe{place};
  Scope scope;
  exe.CreateVariables(prog, 0, true, &scope);
  auto* scales = new VarQuantScale();
  for (auto& v : variable_names) {
    InitTensorHolder(&scope, place, v.c_str());
    LoDTensor tensor;
    tensor.Resize({1});
    auto* ptr = tensor.mutable_data<double>(place);
    ptr[0] = 2.0;
    (*scales)[v] = std::make_pair(false, std::move(tensor));
  }
  (*graph)->SetNotOwned(kParamScopeAttr, &scope);
  std::unique_ptr<Pass> pass =
      PassRegistry::Instance().Get("cpu_quantize_pass");
  pass->Set("quant_var_scales", scales);

  *original_nodes_num = (*graph)->Nodes().size();
  (*graph).reset(pass->Apply((*graph).release()));
  *current_nodes_num = (*graph)->Nodes().size();
}
namespace {
static const std::initializer_list<std::string> variable_names{
    "a", "w1", "c", "d", "w2", "e", "f", "g", "h",
@@ -113,41 +152,6 @@ ProgramDesc BuildProgramDesc(bool use_mkldnn, bool use_quantizer) {
  return prog;
}
void InitTensorHolder(Scope* scope, const paddle::platform::Place& place,
                      const char* var_name) {
  auto x = scope->Var(var_name);
  auto tensor = x->GetMutable<LoDTensor>();
  tensor->mutable_data(place, proto::VarType::FP32, 1);
}

void PreparePass(std::unique_ptr<ir::Graph>* graph, const ProgramDesc& prog,
                 const std::initializer_list<std::string> variable_names,
                 int* original_nodes_num, int* current_nodes_num) {
  auto place = paddle::platform::CPUPlace();
  NaiveExecutor exe{place};
  Scope scope;
  exe.CreateVariables(prog, 0, true, &scope);
  auto* scales = new VarQuantScale();
  for (auto& v : variable_names) {
    InitTensorHolder(&scope, place, v.c_str());
    LoDTensor tensor;
    tensor.Resize({1});
    auto* ptr = tensor.mutable_data<double>(place);
    ptr[0] = 2.0;
    (*scales)[v] = std::make_pair(false, std::move(tensor));
  }
  (*graph)->SetNotOwned(kParamScopeAttr, &scope);
  std::unique_ptr<Pass> pass =
      PassRegistry::Instance().Get("cpu_quantize_pass");
  pass->Set("quant_var_scales", scales);

  *original_nodes_num = (*graph)->Nodes().size();
  (*graph).reset(pass->Apply((*graph).release()));
  *current_nodes_num = (*graph)->Nodes().size();
}
void MainTest(const ProgramDesc& prog, int conv_count, int pool_count,
              int quant_count, int dequant_count, int added_nodes_count,
              float scale) {
@@ -217,9 +221,6 @@ TEST(CpuQuantizePass, do_not_quantize) {
           1.0f);
}
} // namespace
namespace {
static const std::initializer_list<std::string> variable_names_concat = {
    "a1", "b1", "a2", "b2", "c", "d"};
@@ -283,9 +284,7 @@ TEST(CpuQuantizePass, concat) {
  MainTestConcat(BuildProgramDescConcat(), pool_count, concat_count,
                 quant_count, dequant_count, added_nodes_count);
}
} // namespace
namespace {
static const std::initializer_list<std::string> variable_names_transpose = {
    "a", "w1", "b", "c", "w2", "d", "e", "f"};
@@ -365,11 +364,119 @@ TEST(CpuQuantizePass, transpose) {
  int quant_count = 4;
  int dequant_count = 4;
  // 4 Quant + 4 IN + 4 DeQuant + 4 OUT
- int added_nodes_count = 16;
+ int added_nodes_count = 4 + 4 + 4 + 4;
  MainTestTranspose(BuildProgramDescTranspose(), conv_count, transpose_count,
                    quant_count, dequant_count, added_nodes_count, 2.0f * 127);
}
static const std::initializer_list<std::string> variable_names_reshape = {
    "a", "w1", "b", "c", "d", "e", "f"};

// a->Dequantize->b
// b->Reshape->c
// c->Dropout->d
ProgramDesc BuildProgramDescReshape() {
  ProgramDesc prog;
  for (auto& v : variable_names_reshape) {
    prog.MutableBlock(0)->Var(v);
  }
  SetOp(&prog, "dequantize", "Dequantize1", {"a"}, {"b"}, true);
  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, true);
  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, false);

  return prog;
}

// a->Transpose->b
// b->Reshape->c
// c->Dropout->d
ProgramDesc BuildProgramDescReshapeBetweenNonQuantizedOp() {
  ProgramDesc prog;
  for (auto& v : variable_names_reshape) {
    prog.MutableBlock(0)->Var(v);
  }

  SetOp(&prog, "transpose2", "Transpose2", {"a"}, {"b"}, true, false);
  SetOp(&prog, "reshape2", "Reshape2", {"b"}, {"c"}, true, true);
  SetOp(&prog, "dropout", "Dropout", {"c"}, {"d"}, true, false);

  return prog;
}
void MainTestReshape(const ProgramDesc& prog, int transpose_count,
                     int reshape_count, int quant_count, int dequant_count,
                     int added_nodes_count, float scale) {
  std::unique_ptr<ir::Graph> graph(new ir::Graph(prog));
  int original_nodes_num, current_nodes_num;
  PreparePass(&graph, prog, variable_names_reshape, &original_nodes_num,
              &current_nodes_num);

  float quant_scale = 1.0f;
  float dequant_scale = 1.0f;
  int quantize_nodes_count = 0;
  int dequantize_nodes_count = 0;
  int transpose_nodes_count = 0;
  int reshape_nodes_count = 0;
  for (auto* node : graph->Nodes()) {
    if (node->IsOp()) {
      auto* op = node->Op();
      if (op->Type() == "transpose2") {
        transpose_nodes_count++;
      } else if (op->Type() == "reshape2") {
        reshape_nodes_count++;
      } else if (op->Type() == "quantize") {
        quantize_nodes_count++;
        quant_scale = boost::get<float>(op->GetAttr("Scale"));
        EXPECT_EQ(quant_scale, scale) << "Scale for node '" + op->Type() + "'.";
      } else if (op->Type() == "dequantize") {
        dequantize_nodes_count++;
        auto op_name = op->GetAttrIfExists<std::string>("name");
        if (op_name != "Dequantize1") {
          dequant_scale = boost::get<float>(op->GetAttr("Scale"));
          EXPECT_EQ(dequant_scale, scale)
              << "Scale for node '" + op->Type() + "'.";
        }
      }
    }
  }
  EXPECT_EQ(transpose_nodes_count, transpose_count);
  EXPECT_EQ(reshape_nodes_count, reshape_count);
  EXPECT_EQ(quantize_nodes_count, quant_count);
  EXPECT_EQ(dequantize_nodes_count, dequant_count);
  EXPECT_EQ(original_nodes_num + added_nodes_count, current_nodes_num);
}
TEST(CpuQuantizePass, reshape) {
  // a->Dequantize->b
  // b->Quant->b'->Reshape2->c'->Dequant->c
  // c->Dropout->d
  int reshape_count = 1;
  int transpose_count = 0;
  int quant_count = 1;
  int dequant_count = 2;
  // 1 Quant + 1 IN + 1 DeQuant + 1 OUT
  int added_nodes_count = 4;
  MainTestReshape(BuildProgramDescReshape(), transpose_count, reshape_count,
                  quant_count, dequant_count, added_nodes_count, 2.0f * 127);
}

TEST(CpuQuantizePass, reshapeBetweenNonQuantizedOp) {
  // a->Transpose2->b
  // b->Reshape2->c
  // c->Dropout->d
  int reshape_count = 1;
  int transpose_count = 1;
  int quant_count = 0;
  int dequant_count = 0;
  // 0 Quant + 0 IN + 0 DeQuant + 0 OUT
  int added_nodes_count = 0;
  MainTestReshape(BuildProgramDescReshapeBetweenNonQuantizedOp(),
                  transpose_count, reshape_count, quant_count, dequant_count,
                  added_nodes_count, 2.0f * 127);
}
} // namespace
} // namespace ir
} // namespace framework
} // namespace paddle
......
@@ -35,6 +35,8 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
  rules_["prior_box"]["Boxes"] = ScaleAlgo::NONE;
  rules_["prior_box"]["Variances"] = ScaleAlgo::NONE;

  // Transpose2 does not perform calculation on the data. Scale is calculated
  // on the input data and assigned to the Quantize and Dequantize scales.
  rules_["transpose2"]["X"] = ScaleAlgo::KL;
  rules_["transpose2"]["Out"] = ScaleAlgo::NONE;
@@ -42,6 +44,15 @@ MkldnnQuantizerConfig::MkldnnQuantizerConfig() {
  rules_["fc"]["W"] = ScaleAlgo::MAX_CH_T;
  rules_["fc"]["Bias"] = ScaleAlgo::NONE;
  rules_["fc"]["Out"] = ScaleAlgo::KL;

  // Reshape2 does not perform calculation on the data, so the values are not
  // changed. Scale is calculated on the input data and assigned to the
  // Quantize and Dequantize scales.
  rules_["reshape2"]["X"] = ScaleAlgo::KL;
  rules_["reshape2"]["Shape"] = ScaleAlgo::NONE;
  rules_["reshape2"]["ShapeTensor"] = ScaleAlgo::NONE;
  rules_["reshape2"]["XShape"] = ScaleAlgo::NONE;
  rules_["reshape2"]["Out"] = ScaleAlgo::NONE;
}

ScaleAlgo MkldnnQuantizerConfig::scale_algo(
......
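A hedged usage sketch of the new rules through the scale_algo accessor declared above (it assumes an AnalysisConfig q_cfg with EnableMkldnnQuantizer() already called, as in the analyzer test below):

  auto cfg = q_cfg.mkldnn_quantizer_config();
  ScaleAlgo x_algo = cfg->scale_algo("reshape2", "X");      // ScaleAlgo::KL
  ScaleAlgo out_algo = cfg->scale_algo("reshape2", "Out");  // ScaleAlgo::NONE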
@@ -269,7 +269,7 @@ TEST(Analyzer_int8_mobilenet_ssd, quantization) {
  q_cfg.EnableMkldnnQuantizer();
  q_cfg.mkldnn_quantizer_config();
  std::unordered_set<std::string> quantize_operators(
-     {"conv2d", "depthwise_conv2d", "prior_box", "transpose2"});
+     {"conv2d", "depthwise_conv2d", "prior_box", "transpose2", "reshape2"});
  q_cfg.mkldnn_quantizer_config()->SetEnabledOpTypes(quantize_operators);
  q_cfg.mkldnn_quantizer_config()->SetWarmupData(warmup_data);
  q_cfg.mkldnn_quantizer_config()->SetWarmupBatchSize(FLAGS_warmup_batch_size);
......
@@ -419,6 +419,13 @@ class Reshape2OpMaker : public ReshapeOpMaker {
        "XShape is just used to store the shape and lod of X, which will "
        "be used in FlattenGradOp.")
        .AsIntermediate();
    /* int8 parameters */
    AddAttr<bool>("use_quantizer",
                  "(bool, default false) "
                  "Set to true for operators that should be quantized and use "
                  "int8 kernel. "
                  "Used only on CPU.")
        .SetDefault(false);
  }
};
@@ -572,8 +579,9 @@ REGISTER_OPERATOR(reshape2_grad_grad, ops::Reshape2DoubleGradOp,
                  ops::ReshapeDoubleGradInplaceInToOut);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2, float, ops::ReshapeKernel, double,
-                              ops::ReshapeKernel, int, ops::ReshapeKernel,
-                              int64_t, ops::ReshapeKernel);
+                              ops::ReshapeKernel, int8_t, ops::ReshapeKernel,
+                              uint8_t, ops::ReshapeKernel, int,
+                              ops::ReshapeKernel, int64_t, ops::ReshapeKernel);
REGISTER_OP_CPU_KERNEL_FUNCTOR(reshape2_grad, float, ops::ReshapeGradKernel,
                               double, ops::ReshapeGradKernel, int,
                               ops::ReshapeGradKernel, int64_t,
......
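The registration change works because ops::ReshapeKernel does no arithmetic on the elements; reshape only rewrites shape metadata, so the same kernel can be instantiated for int8_t and uint8_t. A toy, self-contained illustration of that property (not Paddle code):

  #include <cstdint>
  #include <utility>
  #include <vector>

  // Toy tensor: reshape replaces only the shape vector; the int8 buffer
  // is carried over untouched, which is why no new int8 math is needed.
  struct ToyTensor {
    std::vector<int64_t> shape;
    std::vector<int8_t> data;
  };

  ToyTensor Reshape(ToyTensor x, std::vector<int64_t> new_shape) {
    x.shape = std::move(new_shape);  // data bytes are untouched
    return x;
  }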
@@ -40,7 +40,6 @@ class TestReshapeOp(OpTest):
        self.infered_shape = (12, 10)

    def test_check_output(self):
        self.check_output(no_check_set=['XShape'])

    def test_check_grad(self):
@@ -185,6 +184,47 @@ class TestReshapeOpDimInfer2_attr_OnlyShape(TestReshapeOp_attr_OnlyShape):
        self.shape = (2, 0, 3, -1)
# test int8 data type on CPU
class TestReshapeInt8Op(OpTest):
    def setUp(self):
        self.init_dtype()
        self.init_data()
        self.use_mkldnn = True
        self._cpu_only = True
        self.op_type = "reshape2"
        input = np.random.randint(0, 127, self.ori_shape).astype(self.dtype)
        self.inputs = {'X': OpTest.np_dtype_to_fluid_dtype(input)}
        self.attrs = {
            'shape': self.new_shape,
            'use_mkldnn': self.use_mkldnn,
        }
        self.outputs = {
            "Out": self.inputs["X"].reshape(self.infered_shape),
            'XShape': np.random.random(self.ori_shape).astype(np.float32)
        }

    def init_dtype(self):
        self.dtype = np.int8

    def init_data(self):
        self.ori_shape = (2, 2, 6)
        self.new_shape = (2, 0, 3, -1)
        self.infered_shape = (2, 2, 3, -1)

    def test_check_output(self):
        self.check_output_with_place(
            fluid.core.CPUPlace(), atol=1e-5, no_check_set=['XShape'])

    def test_check_grad(self):
        pass


# test uint8 data type on CPU
class TestReshapeUint8Op(TestReshapeInt8Op):
    def init_dtype(self):
        self.dtype = np.uint8
# Test python API
class TestReshapeAPI(unittest.TestCase):
    # situation 1: have shape (list, no tensor), no actual shape (Tensor)
......