cherry-pick from feature/anakin-engine: Anakin support facebox #16111

a1d200a5 · nhzlx · a32d4200 · a1d200a5 · a1d200a5 · a1d200a5
25 changed files
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -71,6 +71,7 @@ pass_library(transpose_flatten_concat_fuse_pass inference)
 pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
 pass_library(runtime_context_cache_pass base)
+pass_library(simplify_anakin_detection_pattern_pass inference)

 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
@@ -81,6 +82,10 @@ foreach (index RANGE 3 6)
   file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
 endforeach()

+foreach (index RANGE 3 6)  # RANGE is inclusive: appends USE_PASS lines for the ...pass3 through ...pass6 variants
+   file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
+endforeach()
+
 if(WITH_MKLDNN)
    pass_library(mkldnn_placement_pass base mkldnn)
    pass_library(depthwise_conv_mkldnn_pass base mkldnn)

--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1454,6 +1454,136 @@ PDNode *patterns::TransposeFlattenConcat::operator()(
  return concat_out;
 }

+// Builds the pattern of the detection tail that
+// simplify_anakin_detection_pattern_pass rewrites:
+//
+//   conv_in[i] -> density_prior_box -+-> Boxes     -> reshape2 -> concat1
+//                                    +-> Variances -> reshape2 -> concat2
+//   {concat1_out, concat2_out, conv_in[times]}  -> box_coder
+//   {box_coder_out, conv_in[times + 1]}         -> multiclass_nms -> out
+//
+// `times` is the number of density_prior_box branches. `conv_in` must hold at
+// least `times + 2` nodes: the first `times` feed the density_prior_box ops,
+// conv_in[times] is the third box_coder input and conv_in[times + 1] is the
+// second multiclass_nms input. Returns the PDNode of the multiclass_nms
+// output.
+PDNode *patterns::AnakinDetectionPattern::operator()(
+    std::vector<PDNode *> conv_in, int times) {
+  // Each branch contributes kNumFields consecutive entries to `nodes`:
+  // {prior_box, box_out, reshape1, reshape1_out, box_var_out, reshape2,
+  //  reshape2_out}. The offsets below index into that per-branch group.
+  const int kNumFields = 7;
+  const int kPriorBoxLocOffset = 1;
+  const int kReshape1Offset = 2;
+  const int kReshape1OutOffset = 3;
+  const int kPriorBoxVarOffset = 4;
+  const int kReshape2Offset = 5;
+  const int kReshape2OutOffset = 6;
+
+  const int kBoxCoderThirdInputOffset = times;
+  const int kMultiClassSecondInputNmsOffset = times + 1;
+
+  // conv_in is indexed up to times + 1 below; reject short vectors up front
+  // instead of reading past the end.
+  PADDLE_ENFORCE(conv_in.size() >= static_cast<size_t>(times + 2),
+                 "AnakinDetectionPattern expects %d input nodes, but got %d.",
+                 times + 2, static_cast<int>(conv_in.size()));
+
+  std::vector<PDNode *> nodes;
+
+  for (int i = 0; i < times; i++) {
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("prior_box" + std::to_string(i)))
+            ->assert_is_op("density_prior_box"));
+    nodes.push_back(pattern->NewNode(GetNodeName("box_out" + std::to_string(i)))
+                        ->assert_is_op_output("density_prior_box", "Boxes")
+                        ->assert_is_op_input("reshape2", "X")
+                        ->AsIntermediate());
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape1" + std::to_string(i)))
+            ->assert_is_op("reshape2"));
+
+    // The i-th branch must be the i-th "X" input of its concat op.
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape1_out" + std::to_string(i)))
+            ->assert_is_op_output("reshape2")
+            ->assert_is_op_nth_input("concat", "X", i)
+            ->AsIntermediate());
+
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("box_var_out" + std::to_string(i)))
+            ->assert_is_op_output("density_prior_box", "Variances")
+            ->assert_is_op_input("reshape2", "X")
+            ->AsIntermediate());
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape2" + std::to_string(i)))
+            ->assert_is_op("reshape2"));
+
+    nodes.push_back(
+        pattern->NewNode(GetNodeName("reshape2_out" + std::to_string(i)))
+            ->assert_is_op_output("reshape2")
+            ->assert_is_op_nth_input("concat", "X", i)
+            ->AsIntermediate());
+  }
+
+  // concat1 gathers the reshaped Boxes, concat2 the reshaped Variances.
+  auto concat_op1 = pattern->NewNode(GetNodeName("concat1"))
+                        ->assert_is_op("concat")
+                        ->assert_op_has_n_inputs("concat", times);
+  auto concat_out1 = pattern->NewNode(GetNodeName("concat1_out"))
+                         ->assert_is_op_output("concat")
+                         ->AsIntermediate();
+
+  auto concat_op2 = pattern->NewNode(GetNodeName("concat2"))
+                        ->assert_is_op("concat")
+                        ->assert_op_has_n_inputs("concat", times);
+  auto concat_out2 = pattern->NewNode(GetNodeName("concat2_out"))
+                         ->assert_is_op_output("concat")
+                         ->AsIntermediate();
+
+  auto box_coder_op = pattern->NewNode(GetNodeName("box_coder"))
+                          ->assert_is_op("box_coder")
+                          ->assert_op_has_n_inputs("box_coder", 3);
+
+  auto box_coder_out = pattern->NewNode(GetNodeName("box_coder_out"))
+                           ->assert_is_op_output("box_coder")
+                           ->AsIntermediate();
+
+  auto multiclass_nms_op = pattern->NewNode(GetNodeName("multiclass_nms"))
+                               ->assert_is_op("multiclass_nms")
+                               ->assert_op_has_n_inputs("multiclass_nms", 2);
+
+  auto multiclass_nms_out = pattern->NewNode(GetNodeName("multiclass_nms_out"))
+                                ->assert_is_op_output("multiclass_nms")
+                                ->AsOutput();
+
+  std::vector<PDNode *> reshape1_outs;
+  std::vector<PDNode *> reshape2_outs;
+
+  // Wire up every branch: input -> prior_box -> {Boxes, Variances} -> reshape2.
+  for (int i = 0; i < times; i++) {
+    conv_in[i]->AsInput();
+    // prior_box
+    nodes[i * kNumFields]->LinksFrom({conv_in[i]});
+    // prior_box Boxes output
+    nodes[i * kNumFields + kPriorBoxLocOffset]->LinksFrom(
+        {nodes[i * kNumFields]});
+    // reshape2 applied to Boxes
+    nodes[i * kNumFields + kReshape1Offset]->LinksFrom(
+        {nodes[i * kNumFields + kPriorBoxLocOffset]});
+    // output of that reshape2
+    nodes[i * kNumFields + kReshape1OutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kReshape1Offset]});
+
+    // prior_box Variances output
+    nodes[i * kNumFields + kPriorBoxVarOffset]->LinksFrom(
+        {nodes[i * kNumFields]});
+    // reshape2 applied to Variances
+    nodes[i * kNumFields + kReshape2Offset]->LinksFrom(
+        {nodes[i * kNumFields + kPriorBoxVarOffset]});
+    // output of that reshape2
+    nodes[i * kNumFields + kReshape2OutOffset]->LinksFrom(
+        {nodes[i * kNumFields + kReshape2Offset]});
+
+    reshape1_outs.push_back(nodes[i * kNumFields + kReshape1OutOffset]);
+    reshape2_outs.push_back(nodes[i * kNumFields + kReshape2OutOffset]);
+  }
+
+  concat_op1->LinksFrom(reshape1_outs);
+  concat_op2->LinksFrom(reshape2_outs);
+  concat_out1->LinksFrom({concat_op1});
+  concat_out2->LinksFrom({concat_op2});
+
+  conv_in[kBoxCoderThirdInputOffset]->AsInput();
+  conv_in[kMultiClassSecondInputNmsOffset]->AsInput();
+
+  box_coder_op->LinksFrom(
+      {concat_out1, concat_out2, conv_in[kBoxCoderThirdInputOffset]});
+  box_coder_out->LinksFrom({box_coder_op});
+
+  multiclass_nms_op
+      ->LinksFrom({box_coder_out, conv_in[kMultiClassSecondInputNmsOffset]})
+      .LinksTo({multiclass_nms_out});
+
+  return multiclass_nms_out;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -841,6 +841,21 @@ struct TransposeFlattenConcat : public PatternBase {
  }
 };

+// Pattern for the detection tail made of `times` density_prior_box branches
+// followed by concat, box_coder and multiclass_nms. Pattern nodes are created
+// under names produced by GetNodeName() and can be fetched back with
+// GetPDNode().
+struct AnakinDetectionPattern : public PatternBase {
+  AnakinDetectionPattern(PDPattern* pattern, const std::string& name_scope)
+      : PatternBase(pattern, name_scope, "anakin_detect_pattern") {}
+
+  // Builds the pattern. `conv_inputs` supplies the external input nodes: one
+  // per density_prior_box branch, then the extra box_coder input, then the
+  // extra multiclass_nms input. Returns the multiclass_nms output node.
+  PDNode* operator()(std::vector<PDNode*> conv_inputs, int times);
+
+  // Fully qualified name of the pattern node identified by `op_type`.
+  std::string GetNodeName(const std::string& op_type) {
+    std::string qualified_name =
+        PDNodeName(name_scope_, repr_, id_, op_type);
+    return qualified_name;
+  }
+
+  // Retrieves the pattern node previously created under `op_type`.
+  PDNode* GetPDNode(const std::string& op_type) {
+    const std::string qualified_name = GetNodeName(op_type);
+    return pattern->RetrieveNode(qualified_name);
+  }
+};
+
 }  // namespace patterns

 // Link two ir::Nodes from each other.

--- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <string>
+#include <vector>
+
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+#include "paddle/fluid/framework/ir/node.h"
+#include "paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Rewrites every match of patterns::AnakinDetectionPattern in `graph`.
+//
+// Before:
+//   x_i -> density_prior_box -+-> Boxes     -> reshape2 -> concat1
+//                             +-> Variances -> reshape2 -> concat2
+//   {concat1_out, concat2_out, x_times}   -> box_coder
+//   {box_coder_out, x_(times+1)}          -> multiclass_nms -> out
+// After:
+//   x_i -> density_prior_box -> Boxes -> concat -> concat1_out
+//   {concat1_out, x_times, x_(times+1)}   -> detection_out -> out
+//
+// The reshape2 ops and their outputs, the Variances variables, both original
+// concat ops, concat2's output, box_coder (with its output) and
+// multiclass_nms are removed. Returns the modified graph.
+template <int times>
+std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name =
+      "simplify_anakin_detection_pattern_pass" + std::to_string(times);
+  FusePassBase::Init(pattern_name, graph.get());
+
+  GraphPatternDetector gpd;
+  // External inputs of the pattern: one per density_prior_box branch, then
+  // box_coder's TargetBox, then multiclass_nms' Scores.
+  std::vector<PDNode *> input_nodes;
+  for (int i = 0; i < times; i++) {
+    input_nodes.push_back(gpd.mutable_pattern()
+                              ->NewNode("x" + std::to_string(i))
+                              ->assert_is_op_input("density_prior_box", "Input")
+                              ->AsInput());
+  }
+  input_nodes.push_back(gpd.mutable_pattern()
+                            ->NewNode("x" + std::to_string(times))
+                            ->assert_is_op_input("box_coder", "TargetBox")
+                            ->AsInput());
+
+  input_nodes.push_back(gpd.mutable_pattern()
+                            ->NewNode("x" + std::to_string(times + 1))
+                            ->assert_is_op_input("multiclass_nms", "Scores")
+                            ->AsInput());
+
+  patterns::AnakinDetectionPattern pattern(gpd.mutable_pattern(), pattern_name);
+  pattern(input_nodes, times);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t &subgraph,
+                     Graph *g) {
+    // `nodes` stores kNumFields entries per branch in this order:
+    // {prior_box, box_out, reshape1, reshape1_out, box_var_out, reshape2,
+    //  reshape2_out}; the offsets index into one such group.
+    const int kNumFields = 7;
+    const int kPriorBoxLocOffset = 1;
+    const int kReshape1Offset = 2;
+    const int kReshape1OutOffset = 3;
+    const int kPriorBoxVarOffset = 4;
+    const int kReshape2Offset = 5;
+    const int kReshape2OutOffset = 6;
+
+    // Returns the graph node matched by the named pattern node, enforcing
+    // that it is not null.
+    auto matched = [&](const std::string &name) -> Node * {
+      Node *node = subgraph.at(pattern.GetPDNode(name));
+      PADDLE_ENFORCE(node);
+      return node;
+    };
+
+    std::vector<Node *> nodes;
+    nodes.reserve(times * kNumFields);
+    for (int i = 0; i < times; i++) {
+      const std::string idx = std::to_string(i);
+      nodes.push_back(matched("prior_box" + idx));
+      nodes.push_back(matched("box_out" + idx));
+      nodes.push_back(matched("reshape1" + idx));
+      nodes.push_back(matched("reshape1_out" + idx));
+      nodes.push_back(matched("box_var_out" + idx));
+      nodes.push_back(matched("reshape2" + idx));
+      nodes.push_back(matched("reshape2_out" + idx));
+    }
+
+    Node *concat_op1 = matched("concat1");
+    Node *concat_out1 = matched("concat1_out");
+
+    Node *concat_op2 = matched("concat2");
+    Node *concat_out2 = matched("concat2_out");
+
+    Node *box_coder_third_input = subgraph.at(input_nodes[times]);
+    Node *box_coder_op = matched("box_coder");
+    Node *box_coder_out = matched("box_coder_out");
+
+    Node *multiclass_nms_second_input = subgraph.at(input_nodes[times + 1]);
+    Node *multiclass_nms = matched("multiclass_nms");
+    Node *multiclass_nms_out = matched("multiclass_nms_out");
+
+    // Attributes carried over to the fused detection_out op.
+    // NOTE: box_coder's "variance" attribute is not forwarded.
+    std::string code_type =
+        boost::get<std::string>(box_coder_op->Op()->GetAttr("code_type"));
+    bool box_normalized =
+        boost::get<bool>(box_coder_op->Op()->GetAttr("box_normalized"));
+    int background_label =
+        boost::get<int>(multiclass_nms->Op()->GetAttr("background_label"));
+    float score_threshold =
+        boost::get<float>(multiclass_nms->Op()->GetAttr("score_threshold"));
+    int nms_top_k = boost::get<int>(multiclass_nms->Op()->GetAttr("nms_top_k"));
+    float nms_threshold =
+        boost::get<float>(multiclass_nms->Op()->GetAttr("nms_threshold"));
+    float nms_eta = boost::get<float>(multiclass_nms->Op()->GetAttr("nms_eta"));
+    int keep_top_k =
+        boost::get<int>(multiclass_nms->Op()->GetAttr("keep_top_k"));
+
+    // The new concat consumes the density_prior_box Boxes outputs directly
+    // (skipping the reshape2 ops) and writes to the original concat1 output.
+    std::vector<std::string> concat1_input_names;
+    for (int i = 0; i < times; i++) {
+      concat1_input_names.push_back(
+          nodes[i * kNumFields + kPriorBoxLocOffset]->Name());
+    }
+
+    int axis = boost::get<int>(concat_op1->Op()->GetAttr("axis"));
+    framework::OpDesc concat1_desc;
+    concat1_desc.SetType("concat");
+    concat1_desc.SetInput("X", concat1_input_names);
+    concat1_desc.SetAttr("axis", axis);
+    concat1_desc.SetOutput("Out", {concat_out1->Name()});
+
+    // `g` is the graph the detector is running on.
+    auto *new_add_concat_op = g->CreateOpNode(&concat1_desc);
+
+    for (int i = 0; i < times; i++) {
+      nodes[i * kNumFields + kPriorBoxLocOffset]->outputs.push_back(
+          new_add_concat_op);
+      new_add_concat_op->inputs.push_back(
+          nodes[i * kNumFields + kPriorBoxLocOffset]);
+    }
+
+    framework::OpDesc new_op_desc;
+    new_op_desc.SetType("detection_out");
+    new_op_desc.SetInput("PriorBox", {concat_out1->Name()});
+    new_op_desc.SetInput("TargetBox", {box_coder_third_input->Name()});
+    new_op_desc.SetInput("Scores", {multiclass_nms_second_input->Name()});
+    new_op_desc.SetAttr("code_type", code_type);
+    new_op_desc.SetAttr("box_normalized", box_normalized);
+    new_op_desc.SetAttr("background_label", background_label);
+    new_op_desc.SetAttr("score_threshold", score_threshold);
+    new_op_desc.SetAttr("nms_top_k", nms_top_k);
+    new_op_desc.SetAttr("nms_threshold", nms_threshold);
+    new_op_desc.SetAttr("nms_eta", nms_eta);
+    new_op_desc.SetAttr("keep_top_k", keep_top_k);
+    new_op_desc.SetOutput("Out", {multiclass_nms_out->Name()});
+    new_op_desc.Flush();
+
+    // Create a new node for the fused op.
+    auto *detection_out_op = g->CreateOpNode(&new_op_desc);
+
+    // Everything that the new concat + detection_out pair makes redundant.
+    // No edge to concat_op1 is added here: that op is deleted below, so such
+    // an edge would only be erased again by GraphSafeRemoveNodes.
+    std::unordered_set<const Node *> delete_nodes;
+
+    for (int i = 0; i < times; i++) {
+      delete_nodes.insert(nodes[i * kNumFields + kReshape1Offset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape1OutOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kPriorBoxVarOffset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape2Offset]);
+      delete_nodes.insert(nodes[i * kNumFields + kReshape2OutOffset]);
+    }
+
+    delete_nodes.insert(concat_op1);
+    delete_nodes.insert(concat_op2);
+    delete_nodes.insert(concat_out2);
+    delete_nodes.insert(box_coder_op);
+    delete_nodes.insert(box_coder_out);
+    delete_nodes.insert(multiclass_nms);
+
+    // new concat -> concat1_out
+    new_add_concat_op->outputs.push_back(concat_out1);
+    concat_out1->inputs.push_back(new_add_concat_op);
+
+    // {concat1_out, TargetBox, Scores} -> detection_out -> original nms output
+    detection_out_op->inputs.push_back(concat_out1);
+    detection_out_op->inputs.push_back(box_coder_third_input);
+    detection_out_op->inputs.push_back(multiclass_nms_second_input);
+    detection_out_op->outputs.push_back(multiclass_nms_out);
+
+    concat_out1->outputs.push_back(detection_out_op);
+    box_coder_third_input->outputs.push_back(detection_out_op);
+    multiclass_nms_second_input->outputs.push_back(detection_out_op);
+    multiclass_nms_out->inputs.push_back(detection_out_op);
+
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(g, delete_nodes);
+  };
+
+  gpd(graph.get(), handler);
+  return graph;
+}
+
+template class SimplifyAnakinDetectionPatternPass<1>;  // Explicit instantiations: ApplyImpl is defined in this .cc file,
+template class SimplifyAnakinDetectionPatternPass<3>;  // so each `times` value that gets registered must be instantiated here.
+template class SimplifyAnakinDetectionPatternPass<4>;
+template class SimplifyAnakinDetectionPatternPass<5>;
+template class SimplifyAnakinDetectionPatternPass<6>;
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);  // times == 1 uses the unsuffixed pass name
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);  // for the rest, the name suffix equals `times`
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass4,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<4>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass5,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<5>);
+
+REGISTER_PASS(simplify_anakin_detection_pattern_pass6,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<6>);
--- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include <unordered_set>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+// Graph pass that simplifies the detection tail matched by
+// patterns::AnakinDetectionPattern (density_prior_box + reshape2 branches,
+// concat, box_coder, multiclass_nms) into a concat + detection_out pair. The
+// `times` template argument is the number of density_prior_box branches.
+template <int times>
+class SimplifyAnakinDetectionPatternPass : public FusePassBase {
+ public:
+  virtual ~SimplifyAnakinDetectionPatternPass() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
 cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
-elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc DEPS anakin_engine framework_proto scope op_registry)
+elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc DEPS anakin_engine framework_proto scope op_registry)
 cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
 cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv)
 cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter)
 cc_test(test_anakin_pool2d SRCS test_pool2d_op.cc DEPS anakin_op_converter pool_op pooling)
 cc_test(test_anakin_concat SRCS test_concat_op.cc DEPS anakin_op_converter concat_op concat_and_split)
 cc_test(test_anakin_split SRCS test_split_op.cc DEPS anakin_op_converter split_op concat_and_split)
-cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS
-anakin_op_converter elementwise_add_op)
+cc_test(test_anakin_elementwise SRCS test_elementwise_op.cc DEPS anakin_op_converter elementwise_add_op)
 cc_test(test_anakin_relu SRCS test_relu_op.cc DEPS activation_op anakin_op_converter SERIAL)
 cc_test(test_anakin_softmax SRCS test_softmax_op.cc DEPS anakin_op_converter softmax_op softmax)
 cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter reshape_op)

--- a/paddle/fluid/inference/anakin/convert/batch_norm.cc
+++ b/paddle/fluid/inference/anakin/convert/batch_norm.cc
@@ -14,6 +14,7 @@

 #include "paddle/fluid/inference/anakin/convert/batch_norm.h"
 #include <math.h>
+#include <algorithm>
 #include <map>
 #include <string>
 #include <vector>
@@ -41,7 +42,6 @@ void BatchNormOpConverter::operator()(const framework::proto::OpDesc &op,

  auto output = op_desc.Output("Y").front();
  auto op_name = op_desc.Type() + ":" + op_desc.Output("Y").front();
-  bool is_test = boost::get<bool>(op_desc.GetAttr("is_test"));
  auto epsilon = boost::get<float>(op_desc.GetAttr("epsilon"));

  auto bn_op_name = op_name + ":bn";

--- a/paddle/fluid/inference/anakin/convert/concat.cc
+++ b/paddle/fluid/inference/anakin/convert/concat.cc
@@ -34,8 +34,8 @@ void ConcatOpConverter::operator()(const framework::proto::OpDesc &op,
  framework::OpDesc op_desc(op, nullptr);
  int axis = boost::get<int>(op_desc.GetAttr("axis"));
  auto input_names = op_desc.Input("X");
-  PADDLE_ENFORCE(axis > 0,
-                 "The axis attr of Concat op should be large than 0 for trt");
+  // PADDLE_ENFORCE(axis > 0,
+  //               "The axis attr of Concat op should be large than 0 for trt");

  auto y_name = op_desc.Output("Out").front();
  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();

--- a/paddle/fluid/inference/anakin/convert/density_prior_box.cc
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/density_prior_box.h"
+#include <algorithm>
+#include <map>
+#include <vector>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+using anakin::PTuple;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+// Translates a fluid density_prior_box op into an Anakin "PriorBox" op.
+//
+// The op's "Input" and "Image" inputs become the Anakin op inputs and its
+// "Boxes" output becomes the Anakin op output. fixed_sizes, fixed_ratios,
+// densities, clip, variances, step_h, step_w and offset are read from the
+// fluid op; min_size, max_size and aspect_ratio are passed as empty lists,
+// is_flip as false and img_h / img_w as 0. `scope` and `test_mode` are not
+// used by this converter.
+void DensityPriorBoxOpConverter::operator()(const framework::proto::OpDesc &op,
+                                            const framework::Scope &scope,
+                                            bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+
+  // Variable names wired into the Anakin op.
+  auto feature_name = op_desc.Input("Input").front();
+  auto img_name = op_desc.Input("Image").front();
+  auto boxes_name = op_desc.Output("Boxes").front();
+
+  // The Anakin op is named "<op type>:<Boxes output name>".
+  auto anakin_op_name = op_desc.Type() + ":" + boxes_name;
+
+  // Attributes taken from the fluid op.
+  auto sizes = boost::get<std::vector<float>>(op_desc.GetAttr("fixed_sizes"));
+  auto ratios =
+      boost::get<std::vector<float>>(op_desc.GetAttr("fixed_ratios"));
+  auto density = boost::get<std::vector<int>>(op_desc.GetAttr("densities"));
+
+  // The fluid op provides no flip attribute; is_flip is fixed to false below.
+  auto do_clip = boost::get<bool>(op_desc.GetAttr("clip"));
+  auto box_variances =
+      boost::get<std::vector<float>>(op_desc.GetAttr("variances"));
+
+  // The fluid op provides no img_h / img_w either; both are set to 0 below.
+  auto stride_h = boost::get<float>(op_desc.GetAttr("step_h"));
+  auto stride_w = boost::get<float>(op_desc.GetAttr("step_w"));
+  auto center_offset = boost::get<float>(op_desc.GetAttr("offset"));
+
+  std::vector<std::string> box_order = {"MIN", "COM", "MAX"};
+  std::vector<float> no_values = {};
+
+  engine_->AddOp(anakin_op_name, "PriorBox", {feature_name, img_name},
+                 {boxes_name});
+  engine_->AddOpAttr<PTuple<float>>(anakin_op_name, "min_size", no_values);
+  engine_->AddOpAttr<PTuple<float>>(anakin_op_name, "max_size", no_values);
+  engine_->AddOpAttr<PTuple<float>>(anakin_op_name, "aspect_ratio", no_values);
+  engine_->AddOpAttr<PTuple<float>>(anakin_op_name, "fixed_sizes", sizes);
+  engine_->AddOpAttr<PTuple<float>>(anakin_op_name, "fixed_ratios", ratios);
+  engine_->AddOpAttr<PTuple<int>>(anakin_op_name, "density", density);
+  engine_->AddOpAttr(anakin_op_name, "is_flip", false);
+  engine_->AddOpAttr(anakin_op_name, "is_clip", do_clip);
+  engine_->AddOpAttr<PTuple<float>>(anakin_op_name, "variance", box_variances);
+  engine_->AddOpAttr(anakin_op_name, "img_h", static_cast<int>(0));
+  engine_->AddOpAttr(anakin_op_name, "img_w", static_cast<int>(0));
+  engine_->AddOpAttr(anakin_op_name, "step_h", stride_h);
+  engine_->AddOpAttr(anakin_op_name, "step_w", stride_w);
+  engine_->AddOpAttr(anakin_op_name, "offset", center_offset);
+  engine_->AddOpAttr<PTuple<std::string>>(anakin_op_name, "order", box_order);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(density_prior_box, DensityPriorBoxOpConverter);  // registers the converter under the name density_prior_box (presumably the fluid op type used for lookup)
--- a/paddle/fluid/inference/anakin/convert/density_prior_box.h
+++ b/paddle/fluid/inference/anakin/convert/density_prior_box.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+// Anakin op converter for the fluid density_prior_box op. The conversion
+// itself lives in the call operator, implemented in density_prior_box.cc.
+class DensityPriorBoxOpConverter : public AnakinOpConverter {
+ public:
+  DensityPriorBoxOpConverter() = default;
+  virtual ~DensityPriorBoxOpConverter() {}
+
+  // Converts the fluid op described by `op` and adds it to the Anakin engine.
+  void operator()(const framework::proto::OpDesc &op,
+                  const framework::Scope &scope, bool test_mode) override;
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/detection_out.cc
+++ b/paddle/fluid/inference/anakin/convert/detection_out.cc
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/detection_out.h"
+#include <algorithm>
+#include <map>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+// Translates a detection_out op (produced by the simplify-anakin-detection
+// pass) into an Anakin "DetectionOutput" op.
+//
+// The Anakin op takes {TargetBox, Scores, PriorBox} as inputs, in that order,
+// and writes to the op's "Out" variable. Only code_type
+// "decode_center_size" is supported (mapped to "CENTER_SIZE"); any other value
+// raises an error. share_location is fixed to true,
+// variance_encode_in_target to false and class_num to 0. `scope` and
+// `test_mode` are not used by this converter.
+void DetectionOutOpConverter::operator()(const framework::proto::OpDesc &op,
+                                         const framework::Scope &scope,
+                                         bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  auto target_name = op_desc.Input("TargetBox").front();
+  auto prior_box_name = op_desc.Input("PriorBox").front();
+  auto scores_name = op_desc.Input("Scores").front();
+  auto output_name = op_desc.Output("Out").front();
+
+  // The Anakin op is named "<op type>:<Out variable name>".
+  auto op_name = op_desc.Type() + ":" + output_name;
+
+  auto code_type = boost::get<std::string>(op_desc.GetAttr("code_type"));
+  auto background_label = boost::get<int>(op_desc.GetAttr("background_label"));
+  auto score_threshold = boost::get<float>(op_desc.GetAttr("score_threshold"));
+  auto nms_top_k = boost::get<int>(op_desc.GetAttr("nms_top_k"));
+  auto nms_threshold = boost::get<float>(op_desc.GetAttr("nms_threshold"));
+  auto nms_eta = boost::get<float>(op_desc.GetAttr("nms_eta"));
+  auto keep_top_k = boost::get<int>(op_desc.GetAttr("keep_top_k"));
+
+  // Map the fluid code_type onto Anakin's naming. Unknown values are rejected
+  // so that an empty code_type is never handed to the engine.
+  std::string anakin_code_type;
+  if (code_type == "decode_center_size") {
+    anakin_code_type = "CENTER_SIZE";
+  } else if (code_type == "encode_center_size") {
+    PADDLE_THROW(
+        "Not support encode_center_size code_type in DetectionOut of anakin");
+  } else {
+    PADDLE_THROW("Unsupported code_type %s in DetectionOut of anakin",
+                 code_type);
+  }
+
+  engine_->AddOp(op_name, "DetectionOutput",
+                 {target_name, scores_name, prior_box_name}, {output_name});
+  engine_->AddOpAttr(op_name, "share_location", true);
+  engine_->AddOpAttr(op_name, "variance_encode_in_target", false);
+  engine_->AddOpAttr(op_name, "class_num", static_cast<int>(0));
+  engine_->AddOpAttr(op_name, "background_id", background_label);
+  engine_->AddOpAttr(op_name, "keep_top_k", keep_top_k);
+  engine_->AddOpAttr(op_name, "code_type", anakin_code_type);
+  engine_->AddOpAttr(op_name, "conf_thresh", score_threshold);
+  engine_->AddOpAttr(op_name, "nms_top_k", nms_top_k);
+  engine_->AddOpAttr(op_name, "nms_thresh", nms_threshold);
+  engine_->AddOpAttr(op_name, "nms_eta", nms_eta);
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(detection_out, DetectionOutOpConverter);  // registers the converter under the name detection_out (presumably the op type used for lookup)
--- a/paddle/fluid/inference/anakin/convert/detection_out.h
+++ b/paddle/fluid/inference/anakin/convert/detection_out.h
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+// Anakin op converter for the detection_out op. The conversion itself lives
+// in the call operator, implemented in detection_out.cc.
+class DetectionOutOpConverter : public AnakinOpConverter {
+ public:
+  DetectionOutOpConverter() = default;
+  virtual ~DetectionOutOpConverter() {}
+
+  // Converts the op described by `op` and adds it to the Anakin engine.
+  void operator()(const framework::proto::OpDesc &op,
+                  const framework::Scope &scope, bool test_mode) override;
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
--- a/paddle/fluid/inference/anakin/convert/flatten.cc
+++ b/paddle/fluid/inference/anakin/convert/flatten.cc
@@ -34,20 +34,11 @@ void FlattenOpConverter::operator()(const framework::proto::OpDesc &op,

  auto input = op_desc.Input("X").front();
  auto output = op_desc.Output("Out").front();
-  auto in_dims = scope.FindVar(input)->Get<framework::LoDTensor>().dims();
  int axis = boost::get<int>(op_desc.GetAttr("axis"));
+  PADDLE_ENFORCE(axis == 1,
+                 "the anakin flatten op converter only supports axis == 1.");

-  int inner = 1;
-  int outer = 1;
-  for (int i = 0; i < in_dims.size(); i++) {
-    if (i < axis) {
-      outer *= in_dims[i];
-    } else {
-      inner *= in_dims[i];
-    }
-  }
-
-  std::vector<int> out_dims = {1, outer, inner, 1};
+  std::vector<int> out_dims = {0, -1, 1, 1};
  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
  engine_->AddOp(op_name, "Reshape", {input}, {output});
  engine_->AddOpAttr<PTuple<int>>(op_name, "dims", out_dims);

--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -47,6 +47,10 @@ class AnakinOpConverter {
    std::string op_type = op_desc.Type();
    AnakinOpConverter *it = nullptr;

+    if (op_type == "reshape2") op_type = "reshape";
+    if (op_type == "transpose2") op_type = "transpose";
+    if (op_type == "flatten2") op_type = "flatten";
+
    if (!it) {
      it = Registry<AnakinOpConverter>::Global().Lookup(op_type);
    }

--- a/paddle/fluid/inference/anakin/convert/test_concat_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_concat_op.cc
@@ -44,6 +44,29 @@ TEST(concat_op, test) {
  validator.Execute(1);
 }

+TEST(concat_op, test2) {
+  std::unordered_set<std::string> parameters({""});
+  framework::Scope scope;
+  AnakinConvertValidation validator(parameters, scope);
+  validator.DeclInputVar("concat_x1", {1, 4});
+  validator.DeclInputVar("concat_x2", {3, 4});
+  validator.DeclInputVar("concat_x3", {2, 4});
+  validator.DeclOutputVar("concat_out", {6, 4});  // 1 + 3 + 2 rows
+
+  // Describe a concat op that joins the three 2-D inputs into concat_out.
+  framework::OpDesc desc;
+  desc.SetType("concat");
+  desc.SetInput("X", {"concat_x1", "concat_x2", "concat_x3"});
+  desc.SetOutput("Out", {"concat_out"});
+
+  int axis = 0;  // concatenate along the first dimension
+  desc.SetAttr("axis", axis);
+
+  validator.SetOp(*desc.Proto());
+
+  validator.Execute(1);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

--- a/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_flatten_op.cc
@@ -27,13 +27,13 @@ TEST(flatten_op, test) {
  std::unordered_set<std::string> parameters;
  framework::Scope scope;
  AnakinConvertValidation validator(parameters, scope);
-  validator.DeclInputVar("flatten-X", {3, 100, 100, 4});
-  validator.DeclOutputVar("flatten-Out", {1, 300, 400, 1});
+  validator.DeclInputVar("flatten-X", {3, 10, 10, 4});
+  validator.DeclOutputVar("flatten-Out", {3, 400, 1, 1});
  framework::OpDesc desc;
  desc.SetType("flatten");
  desc.SetInput("X", {"flatten-X"});
  desc.SetOutput("Out", {"flatten-Out"});
-  desc.SetAttr("axis", 2);
+  desc.SetAttr("axis", 1);

  LOG(INFO) << "set OP";
  validator.SetOp(*desc.Proto());

--- a/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_reshape_op.cc
@@ -45,6 +45,27 @@ TEST(reshape, test) {
  validator.Execute(1);
 }

+TEST(reshape, test2) {
+  framework::Scope scope;
+  std::unordered_set<std::string> parameters;
+  AnakinConvertValidation validator(parameters, scope);
+
+  validator.DeclInputVar("reshape-X", {1, 2, 4});
+  validator.DeclOutputVar("reshape-Out", {1, 4, 2});
+
+  framework::OpDesc desc;
+  desc.SetType("reshape");
+  desc.SetInput("X", {"reshape-X"});
+  desc.SetOutput("Out", {"reshape-Out"});
+  // Presumably 0 keeps the input dim and -1 is inferred -> {1, 4, 2}; confirm.
+  desc.SetAttr("shape", std::vector<int>({0, -1, 2}));
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+  validator.Execute(1);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

--- a/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_softmax_op.cc
@@ -27,9 +27,8 @@ TEST(softmax, test) {
  std::unordered_set<std::string> parameters;
  AnakinConvertValidation validator(parameters, scope);

-  std::vector<int> tensor_shape{8, 10};
-  validator.DeclInputVar("softmax-X", {1, 10, 1, 1});
-  validator.DeclOutputVar("softmax-Out", {1, 10, 1, 1});
+  validator.DeclInputVar("softmax-X", {1, 10});
+  validator.DeclOutputVar("softmax-Out", {1, 10});

  framework::OpDesc desc;
  desc.SetType("softmax");

--- a/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
+++ b/paddle/fluid/inference/anakin/convert/test_transpose_op.cc
@@ -43,6 +43,28 @@ TEST(transpose_op, test) {
  validator.Execute(3);
 }

+// Covers an input with fewer than 4 dims: a 3-D tensor whose last two axes swap.
+TEST(transpose_op, test2) {
+  std::unordered_set<std::string> parameters;
+  framework::Scope scope;
+  AnakinConvertValidation validator(parameters, scope);
+  validator.DeclInputVar("transpose-X", {3, 4, 5});
+  validator.DeclOutputVar("transpose-Out", {3, 5, 4});
+
+  // Describe a transpose op reading transpose-X and writing transpose-Out.
+  framework::OpDesc desc;
+  desc.SetType("transpose");
+  desc.SetInput("X", {"transpose-X"});
+  desc.SetOutput("Out", {"transpose-Out"});
+  desc.SetAttr("axis", std::vector<int>({0, 2, 1}));  // only 3 entries
+
+  LOG(INFO) << "set OP";
+  validator.SetOp(*desc.Proto());
+  LOG(INFO) << "execute";
+
+  validator.Execute(1);
+}
+
 }  // namespace anakin
 }  // namespace inference
 }  // namespace paddle

--- a/paddle/fluid/inference/anakin/convert/transpose.cc
+++ b/paddle/fluid/inference/anakin/convert/transpose.cc
@@ -40,6 +40,11 @@ void TransposeOpConverter::operator()(const framework::proto::OpDesc &op,
  engine_->AddOp(op_name, "Permute", {input}, {output});

  auto axis = boost::get<std::vector<int>>(op_desc.GetAttr("axis"));
+  // Pad a permutation of fewer than 4 axes with identity entries so that
+  // "dims" always carries 4 elements, e.g. {0, 2, 1} -> {0, 2, 1, 3}.
+  for (int dim = static_cast<int>(axis.size()); dim < 4; ++dim) {
+    axis.push_back(dim);
+  }
  engine_->AddOpAttr<PTuple<int>>(op_name, "dims", axis);
 }


--- a/paddle/fluid/inference/anakin/convert/ut_helper.h
+++ b/paddle/fluid/inference/anakin/convert/ut_helper.h
@@ -127,6 +127,9 @@ class AnakinConvertValidation {
      auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(scope_,
                                                                        input);
      auto t_shape = framework::vectorize2int(t.dims());
+      // Shapes with fewer than 4 dims are extended with trailing 1s before
+      // being handed to the engine.
+      if (t_shape.size() < 4) t_shape.resize(4, 1);
      engine_->SetInputShape(input, t_shape);
    }
    engine_->Optimize();

--- a/paddle/fluid/inference/anakin/op_teller.cc
+++ b/paddle/fluid/inference/anakin/op_teller.cc
@@ -21,7 +21,7 @@ namespace anakin {
 // Just tell by the op_types.
 struct SimpleOpTypeSetTeller : public Teller {
  SimpleOpTypeSetTeller() {
-    // teller_set.insert("mul");
+    teller_set.insert("mul");
    teller_set.insert("fc");
    teller_set.insert("conv2d_fusion");
    teller_set.insert("split");
@@ -30,7 +30,14 @@ struct SimpleOpTypeSetTeller : public Teller {
    teller_set.insert("elementwise_add");
    teller_set.insert("concat");
    teller_set.insert("tanh");
-    // teller_set.insert("conv2d");
+    teller_set.insert("conv2d");
+    teller_set.insert("batch_norm");
+    teller_set.insert("softmax");
+    teller_set.insert("flatten2");
+    teller_set.insert("reshape2");
+    teller_set.insert("transpose2");
+    teller_set.insert("density_prior_box");
+    teller_set.insert("detection_out");
  }

  bool operator()(const std::string& op_type,

--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
@@ -45,7 +45,7 @@ std::unique_ptr<framework::ir::Graph> analysis::AnakinSubgraphPass::ApplyImpl(
    return anakin::OpTeller::Global().Tell(node->Op()->Type(), *node->Op());
  };

-  SubGraphFuser fuser(graph.get(), teller, 0);
+  SubGraphFuser fuser(graph.get(), teller, 3 /* min_subgraph_size */);
  fuser();

  for (auto *node : graph->Nodes()) {

--- a/paddle/fluid/inference/api/CMakeLists.txt
+++ b/paddle/fluid/inference/api/CMakeLists.txt
@@ -64,3 +64,8 @@ if (WITH_ANAKIN AND WITH_MKL) # only needed in CI
    anakin_target(inference_anakin_api)
    anakin_target(inference_anakin_api_shared)
 endif()
+if (WITH_ANAKIN_SUBGRAPH)
+    inference_analysis_test(test_anakin_model SRCS mobilenet_test.cc EXTRA_DEPS paddle_fluid)
+    inference_analysis_test(anakin_conv_model SRCS conv_anakin_test.cc EXTRA_DEPS paddle_fluid)
+    inference_analysis_test(life_feature_test SRCS life_feature_test.cc EXTRA_DEPS paddle_fluid)
+endif()
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -808,13 +808,22 @@ USE_TRT_CONVERTER(conv2d_transpose);
 USE_TRT_CONVERTER(leaky_relu);
 #endif

+USE_ANAKIN_CONVERTER(mul);
 USE_ANAKIN_CONVERTER(fc);
 USE_ANAKIN_CONVERTER(conv2d);
+USE_ANAKIN_CONVERTER(conv2d_fusion);
 USE_ANAKIN_CONVERTER(concat);
 USE_ANAKIN_CONVERTER(split);
 USE_ANAKIN_CONVERTER(relu);
 USE_ANAKIN_CONVERTER(sigmoid);
 USE_ANAKIN_CONVERTER(tanh);
 USE_ANAKIN_CONVERTER(pool2d);
-USE_ANAKIN_CONVERTER(conv2d_fusion);
 USE_ANAKIN_CONVERTER(elementwise_add);
+USE_ANAKIN_CONVERTER(batch_norm);
+USE_ANAKIN_CONVERTER(flatten);
+USE_ANAKIN_CONVERTER(reshape);
+USE_ANAKIN_CONVERTER(transpose);
+USE_ANAKIN_CONVERTER(softmax);
+
+USE_ANAKIN_CONVERTER(detection_out);
+USE_ANAKIN_CONVERTER(density_prior_box);