From a25331bc264b935b064655720ab08378bdf8458c Mon Sep 17 00:00:00 2001
From: nhzlx
Date: Wed, 20 Mar 2019 13:02:31 +0000
Subject: [PATCH] cherry-pick from feature/anakin-engine: handle changing
 input shapes when using Anakin #16189

---
 paddle/fluid/framework/ir/CMakeLists.txt | 3 +-
 ...anakin_fillconstant_elementwisemul_fuse.cc | 85 +++++++++++++++++++
 .../anakin_fillconstant_elementwisemul_fuse.h | 35 ++++++++
 .../framework/ir/graph_pattern_detector.cc | 23 +++++
 .../framework/ir/graph_pattern_detector.h | 15 ++++
 .../simplify_anakin_detection_pattern_pass.cc | 4 +
 .../inference/anakin/convert/CMakeLists.txt | 6 +-
 .../inference/anakin/convert/op_converter.h | 64 +++++++++++---
 .../fluid/inference/anakin/convert/scale.cc | 56 ++++++++++++
 paddle/fluid/inference/anakin/convert/scale.h | 37 ++++++++
 .../inference/anakin/convert/ut_helper.h | 4 +
 paddle/fluid/inference/anakin/engine.cc | 24 ++++--
 paddle/fluid/inference/anakin/engine.h | 26 ++++--
 paddle/fluid/inference/anakin/op_teller.cc | 1 +
 paddle/fluid/inference/analysis/argument.h | 3 +
 .../inference/analysis/ir_pass_manager.cc | 2 +
 .../ir_passes/anakin_subgraph_pass.cc | 10 ++-
 .../ir_passes/tensorrt_subgraph_pass.cc | 9 +-
 .../ir_params_sync_among_devices_pass.cc | 1 -
 paddle/fluid/inference/api/analysis_config.cc | 6 +-
 .../fluid/inference/api/analysis_predictor.cc | 2 +
 .../inference/api/paddle_analysis_config.h | 5 +-
 .../inference/api/paddle_pass_builder.cc | 4 +
 .../fluid/operators/anakin/anakin_engine_op.h | 7 +-
 24 files changed, 391 insertions(+), 41 deletions(-)
 create mode 100644 paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc
 create mode 100644 paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h
 create mode 100644 paddle/fluid/inference/anakin/convert/scale.cc
 create mode 100644 paddle/fluid/inference/anakin/convert/scale.h

diff --git a/paddle/fluid/framework/ir/CMakeLists.txt b/paddle/fluid/framework/ir/CMakeLists.txt
index 36024d4a7d5..49fa323fc66 100644
--- a/paddle/fluid/framework/ir/CMakeLists.txt
+++ b/paddle/fluid/framework/ir/CMakeLists.txt
@@ -72,6 +72,7 @@ pass_library(identity_scale_op_clean_pass base)
 pass_library(sync_batch_norm_pass base)
 pass_library(runtime_context_cache_pass base)
 pass_library(simplify_anakin_detection_pattern_pass inference)
+pass_library(anakin_fillconstant_elementwisemul_fuse inference)
 
 # There may be many transpose-flatten structures in a model, and the output of
 # these structures will be used as inputs to the concat Op. This pattern will
@@ -82,7 +83,7 @@ foreach (index RANGE 3 6)
   file(APPEND ${pass_file} "USE_PASS(transpose_flatten${index}_concat_fuse_pass);\n")
 endforeach()
 
-foreach (index RANGE 3 6)
+foreach (index RANGE 2 6)
   file(APPEND ${pass_file} "USE_PASS(simplify_anakin_detection_pattern_pass${index});\n")
 endforeach()
 
diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc
new file mode 100644
index 00000000000..83b0da0c011
--- /dev/null
+++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.cc
@@ -0,0 +1,85 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include <memory>
+#include <string>
+
+#include "paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h"
+#include "paddle/fluid/framework/ir/graph_viz_pass.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+#define GET_IR_NODE(node__) GET_IR_NODE_FROM_SUBGRAPH(node__, node__, pattern);
+#define GET_NODES                  \
+  GET_IR_NODE(fill_constant);      \
+  GET_IR_NODE(fill_constant_out);  \
+  GET_IR_NODE(elementwise_mul);    \
+  GET_IR_NODE(elementwise_mul_out);
+
+std::unique_ptr<ir::Graph> AnakinFillconstantElementwisemulFuse::ApplyImpl(
+    std::unique_ptr<ir::Graph> graph) const {
+  const std::string pattern_name = "anakin_fillconstant_elementwisemul_fuse";
+  FusePassBase::Init(pattern_name, graph.get());
+
+  GraphPatternDetector gpd;
+  auto* x = gpd.mutable_pattern()
+                ->NewNode("x")
+                ->assert_is_op_input("elementwise_mul", "X")
+                ->AsInput();
+
+  patterns::AnakinFillConstantElementWiseMulFuse pattern(gpd.mutable_pattern(),
+                                                         pattern_name);
+  pattern(x);
+
+  auto handler = [&](const GraphPatternDetector::subgraph_t& subgraph,
+                     Graph* g) {
+    GET_NODES;
+
+    PADDLE_ENFORCE(subgraph.count(x));
+    auto* elementwise_in = subgraph.at(x);
+    float constant_value =
+        boost::get<float>(fill_constant->Op()->GetAttr("value"));
+
+    framework::OpDesc new_op_desc;
+    new_op_desc.SetType("scale");
+    new_op_desc.SetInput("X", {elementwise_in->Name()});
+    new_op_desc.SetAttr("scale", constant_value);
+    new_op_desc.SetAttr("bias", static_cast<float>(0.0));
+    new_op_desc.SetAttr("bias_after_scale", true);
+    new_op_desc.SetOutput("Out", {elementwise_mul_out->Name()});
+    new_op_desc.Flush();
+
+    // Create a new node for the fused op.
+    auto* scale_op = graph->CreateOpNode(&new_op_desc);
+
+    IR_NODE_LINK_TO(elementwise_in, scale_op);       // Input
+    IR_NODE_LINK_TO(scale_op, elementwise_mul_out);  // Output
+
+    // Delete the unneeded nodes.
+    GraphSafeRemoveNodes(graph.get(),
+                         {fill_constant, fill_constant_out, elementwise_mul});
+  };
+
+  gpd(graph.get(), handler);
+  return graph;
+}
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
+
+REGISTER_PASS(anakin_fillconstant_elementwisemul_fuse,
+              paddle::framework::ir::AnakinFillconstantElementwisemulFuse);
diff --git a/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h
new file mode 100644
index 00000000000..fa95143d3ad
--- /dev/null
+++ b/paddle/fluid/framework/ir/anakin_fillconstant_elementwisemul_fuse.h
@@ -0,0 +1,35 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+#include <memory>
+#include "paddle/fluid/framework/ir/fuse_pass_base.h"
+#include "paddle/fluid/framework/ir/graph_pattern_detector.h"
+
+namespace paddle {
+namespace framework {
+namespace ir {
+
+class AnakinFillconstantElementwisemulFuse : public FusePassBase {
+ public:
+  virtual ~AnakinFillconstantElementwisemulFuse() {}
+
+ protected:
+  std::unique_ptr<ir::Graph> ApplyImpl(
+      std::unique_ptr<ir::Graph> graph) const override;
+};
+
+}  // namespace ir
+}  // namespace framework
+}  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.cc b/paddle/fluid/framework/ir/graph_pattern_detector.cc
index 77c9a94df2f..31e259c51d1 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.cc
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.cc
@@ -1596,6 +1596,29 @@ PDNode *patterns::AnakinDetectionPattern::operator()(
   return multiclass_nms_out;
 }
 
+PDNode *patterns::AnakinFillConstantElementWiseMulFuse::operator()(
+    PDNode *elementwise_op_input) {
+  auto fill_constant =
+      pattern->NewNode(fill_constant_repr())->assert_is_op("fill_constant");
+
+  auto fill_constant_out = pattern->NewNode(fill_constant_out_repr())
+                               ->assert_is_op_output("fill_constant")
+                               ->assert_is_op_input("elementwise_mul", "Y")
+                               ->AsIntermediate();
+
+  auto elementwise_mul_op =
+      pattern->NewNode(elementwise_mul_repr())->assert_is_op("elementwise_mul");
+
+  auto elementwise_mul_out = pattern->NewNode(elementwise_mul_out_repr())
+                                 ->assert_is_op_output("elementwise_mul")
+                                 ->AsOutput();
+
+  fill_constant_out->LinksFrom({fill_constant});
+  elementwise_mul_op->LinksFrom({elementwise_op_input, fill_constant_out});
+  elementwise_mul_out->LinksFrom({elementwise_mul_op});
+  return elementwise_mul_out;
+}
+
 }  // namespace ir
 }  // namespace framework
 }  // namespace paddle
diff --git a/paddle/fluid/framework/ir/graph_pattern_detector.h b/paddle/fluid/framework/ir/graph_pattern_detector.h
index 080b2f96444..16cb6fb7aee 100644
--- a/paddle/fluid/framework/ir/graph_pattern_detector.h
+++ b/paddle/fluid/framework/ir/graph_pattern_detector.h
@@ -856,6 +856,21 @@ struct AnakinDetectionPattern : public PatternBase {
   }
 };
 
+struct AnakinFillConstantElementWiseMulFuse : public PatternBase {
+  AnakinFillConstantElementWiseMulFuse(PDPattern* pattern,
+                                       const std::string& name_scope)
+      : PatternBase(pattern, name_scope,
+                    "anakin_fillconstant_elementwisemul_fuse") {}
+
+  PDNode* operator()(PDNode* elementwise_op_input);
+
+  // declare operator node's name
+  PATTERN_DECL_NODE(fill_constant);
+  PATTERN_DECL_NODE(fill_constant_out);
+  PATTERN_DECL_NODE(elementwise_mul);
+  PATTERN_DECL_NODE(elementwise_mul_out);
+};
+
 }  // namespace patterns
 
 // Link two ir::Nodes from each other.
diff --git a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
index 5ab10ba39fa..84fb8063e6f 100644
--- a/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
+++ b/paddle/fluid/framework/ir/simplify_anakin_detection_pattern_pass.cc
@@ -215,6 +215,7 @@ std::unique_ptr<ir::Graph> SimplifyAnakinDetectionPatternPass<times>::ApplyImpl(
 }
 
 template class SimplifyAnakinDetectionPatternPass<1>;
+template class SimplifyAnakinDetectionPatternPass<2>;
 template class SimplifyAnakinDetectionPatternPass<3>;
 template class SimplifyAnakinDetectionPatternPass<4>;
 template class SimplifyAnakinDetectionPatternPass<5>;
@@ -227,6 +228,9 @@ template class SimplifyAnakinDetectionPatternPass<6>;
 REGISTER_PASS(simplify_anakin_detection_pattern_pass,
               paddle::framework::ir::SimplifyAnakinDetectionPatternPass<1>);
 
+REGISTER_PASS(simplify_anakin_detection_pattern_pass2,
+              paddle::framework::ir::SimplifyAnakinDetectionPatternPass<2>);
+
 REGISTER_PASS(simplify_anakin_detection_pattern_pass3,
               paddle::framework::ir::SimplifyAnakinDetectionPatternPass<3>);
 
diff --git a/paddle/fluid/inference/anakin/convert/CMakeLists.txt b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
index 7b08375a7a3..da9ffa5bbf6 100644
--- a/paddle/fluid/inference/anakin/convert/CMakeLists.txt
+++ b/paddle/fluid/inference/anakin/convert/CMakeLists.txt
@@ -1,5 +1,8 @@
 cc_library(anakin_op_converter SRCS fc.cc conv2d.cc conv2d_fusion.cc
-elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc detection_out.cc DEPS anakin_engine framework_proto scope op_registry)
+  elementwise.cc activation.cc pool2d.cc concat.cc split.cc relu.cc softmax.cc
+batch_norm.cc reshape.cc flatten.cc transpose.cc density_prior_box.cc
+detection_out.cc scale.cc DEPS anakin_engine framework_proto scope op_registry)
+
 cc_test(test_anakin_fc SRCS test_fc_op.cc DEPS anakin_op_converter mul_op)
 cc_test(test_anakin_conv2d SRCS test_conv2d_op.cc DEPS anakin_op_converter conv_op im2col vol2col depthwise_conv)
 cc_test(test_anakin_activation SRCS test_activation_op.cc DEPS activation_op anakin_op_converter)
@@ -13,3 +16,4 @@ cc_test(test_anakin_reshape SRCS test_reshape_op.cc DEPS anakin_op_converter res
 cc_test(test_anakin_flatten SRCS test_flatten_op.cc DEPS anakin_op_converter flatten_op reshape_op)
 cc_test(test_anakin_transpose SRCS test_transpose_op.cc DEPS anakin_op_converter transpose_op)
 cc_test(test_anakin_batch_norm SRCS test_batch_norm_op.cc DEPS anakin_op_converter batch_norm_op)
+cc_test(test_anakin_scale SRCS test_scale_op.cc DEPS anakin_op_converter scale_op math_function)
diff --git a/paddle/fluid/inference/anakin/convert/op_converter.h b/paddle/fluid/inference/anakin/convert/op_converter.h
index 6ce37c39e6c..2eb7f24ce54 100644
--- a/paddle/fluid/inference/anakin/convert/op_converter.h
+++ b/paddle/fluid/inference/anakin/convert/op_converter.h
@@ -14,6 +14,7 @@
 
 #pragma once
 
+#include <map>
 #include <memory>
 #include <string>
 #include <unordered_map>
@@ -72,32 +73,70 @@ class AnakinOpConverter {
   // The scope here should be inited with the parameter vars.
  void ConvertBlockToAnakinEngine(
-      framework::BlockDesc *block_desc, const framework::Scope &scope,
+      framework::BlockDesc *block_desc, framework::Scope *scope,
       const std::vector<std::string> &inputs,
       const std::unordered_set<std::string> &parameters,
       const std::vector<std::string> &outputs, AnakinNvEngine *engine) {
     framework::proto::BlockDesc *block_proto = block_desc->Proto();
-    ConvertBlock(*block_proto, parameters, scope, engine);
+    ConvertBlock(*block_proto, parameters, *scope, engine);
+
+    engine->Freeze();
+    // Check the max_batch_size set via config->EnableAnakinEngine.
+    int max_batch_size = engine->GetMaxBatchSize();
+    PADDLE_ENFORCE(max_batch_size > 0,
+                   "the max_batch_size set from config->EnableAnakinEngine "
+                   "must be larger than 0");
+    // If the user does not specify this variable, we use the input shape from
+    // the block_desc.
+    auto max_input_shape = engine->GetMaxInputShape();
+    std::map<std::string, std::vector<int>> temp_max_input_shape;
+
     for (auto &input : inputs) {
       if (parameters.count(input)) continue;
-      auto *var = block_desc->FindVar(input);
-      PADDLE_ENFORCE(var, "no variable called %s", input);
-
-      auto var_shape = var->GetShape();
-      PADDLE_ENFORCE(var_shape.size() == 4);
       std::vector<int> input_shape;
-      for (int i = 0; i < var_shape.size(); i++) {
-        input_shape.push_back(var_shape[i]);
+      input_shape.resize(4);
+      input_shape[0] = max_batch_size;
+      if (max_input_shape.count(input)) {
+        PADDLE_ENFORCE(max_input_shape[input].size() == 4,
+                       "the dimensions of max_input_shape set from "
+                       "config->EnableAnakinEngine must be 4");
+        for (int i = 1; i < 4; i++) {
+          input_shape[i] = max_input_shape[input][i];
+        }
+      } else {
+        auto *var = block_desc->FindVar(input);
+        PADDLE_ENFORCE(var, "no variable called %s", input);
+
+        auto var_shape = var->GetShape();
+        PADDLE_ENFORCE(var_shape.size() == 4);
+
+        for (size_t i = 1; i < var_shape.size(); i++) {
+          input_shape[i] = var_shape[i];
+        }
       }
-      input_shape[0] = engine->GetMaxBatch();
-
+      temp_max_input_shape[input] = input_shape;
       engine->SetInputShape(input, input_shape);
+      // engine->Graph()->RegistVar(input); // For share from data.
     }
+    engine->SetMaxInputShape(temp_max_input_shape);
 
-    // engine->Graph()->RegistAllOut();
     engine->Optimize();
     engine->InitGraph();
+    /*
+    for(auto& input : inputs) {
+      platform::CUDAPlace gpu_place(engine->GetDevice());
+      auto input_var = scope->Var();
+      auto input_tensor = input_var->GetMutable<framework::LoDTensor>();
+      auto input_max_shape = temp_max_input_shape[input];
+      input_tensor->Resize(framework::make_ddim(input_max_shape));
+      auto input_data = input_tensor->mutable_data<float>(gpu_place);
+      auto* anakin_input = engine->Net()->get_in(input);
+
+      ::anakin::saber::Tensor<::anakin::saber::NV> tmp_anakin_tensor(input_data,
+      ::anakin::saber::NV(), 0, input_max_shape);
+      anakin_input->share_from(tmp_anakin_tensor);
+    }
+    */
   }
 
   void SetEngine(AnakinNvEngine *engine) { engine_ = engine; }
diff --git a/paddle/fluid/inference/anakin/convert/scale.cc b/paddle/fluid/inference/anakin/convert/scale.cc
new file mode 100644
index 00000000000..6f3aa8c5d11
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/scale.cc
@@ -0,0 +1,56 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#include "paddle/fluid/inference/anakin/convert/scale.h"
+#include <algorithm>
+#include <map>
+
+using anakin::graph::GraphGlobalMem;
+using anakin::AK_FLOAT;
+using anakin::saber::NV;
+using anakin::saber::Shape;
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+void ScaleOpConverter::operator()(const framework::proto::OpDesc &op,
+                                  const framework::Scope &scope,
+                                  bool test_mode) {
+  framework::OpDesc op_desc(op, nullptr);
+  PADDLE_ENFORCE_EQ(op_desc.Input("X").size(), 1);
+  PADDLE_ENFORCE_EQ(op_desc.Output("Out").size(), 1);
+
+  auto op_name = op_desc.Type() + ":" + op_desc.Output("Out").front();
+
+  auto input_name = op_desc.Input("X").front();
+  auto output_name = op_desc.Output("Out").front();
+  float scale = boost::get<float>(op_desc.GetAttr("scale"));
+  float bias = boost::get<float>(op_desc.GetAttr("bias"));
+  bool bias_after_scale =
+      boost::get<bool>(op_desc.GetAttr("bias_after_scale"));
+  PADDLE_ENFORCE(bias_after_scale,
+                 "The Anakin scale layer only supports bias after scale now.");
+
+  engine_->AddOp(op_name, "Power", {input_name}, {output_name});
+  engine_->AddOpAttr(op_name, "shift", bias);
+  engine_->AddOpAttr(op_name, "scale", scale);
+  engine_->AddOpAttr(op_name, "power", static_cast<float>(1.0));
+}
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
+
+REGISTER_ANAKIN_OP_CONVERTER(scale, ScaleOpConverter);
diff --git a/paddle/fluid/inference/anakin/convert/scale.h b/paddle/fluid/inference/anakin/convert/scale.h
new file mode 100644
index 00000000000..b858e3c5124
--- /dev/null
+++ b/paddle/fluid/inference/anakin/convert/scale.h
@@ -0,0 +1,37 @@
+// Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+// http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+
+#pragma once
+
+#include <map>
+#include <string>
+#include "paddle/fluid/inference/anakin/convert/op_converter.h"
+
+namespace paddle {
+namespace inference {
+namespace anakin {
+
+class ScaleOpConverter : public AnakinOpConverter {
+ public:
+  ScaleOpConverter() = default;
+
+  virtual void operator()(const framework::proto::OpDesc &op,
+                          const framework::Scope &scope,
+                          bool test_mode) override;
+  virtual ~ScaleOpConverter() {}
+};
+
+}  // namespace anakin
+}  // namespace inference
+}  // namespace paddle
diff --git a/paddle/fluid/inference/anakin/convert/ut_helper.h b/paddle/fluid/inference/anakin/convert/ut_helper.h
index 1b0ef8c7dbe..d62d11d25bb 100644
--- a/paddle/fluid/inference/anakin/convert/ut_helper.h
+++ b/paddle/fluid/inference/anakin/convert/ut_helper.h
@@ -122,6 +122,8 @@ class AnakinConvertValidation {
     Singleton<AnakinOpConverter>::Global().ConvertOp(
         desc, parameters_, scope_, engine_.get(), true /*test_mode*/);
     engine_->Freeze();
+
+    std::map<std::string, std::vector<int>> temp_max_input_shape;
     for (const auto& input : op_desc_->InputArgumentNames()) {
       if (parameters_.count(input)) continue;
       auto& t = inference::analysis::GetFromScope<framework::LoDTensor>(scope_,
@@ -131,7 +133,9 @@ class AnakinConvertValidation {
         t_shape.push_back(1);
       }
       engine_->SetInputShape(input, t_shape);
+      temp_max_input_shape[input] = t_shape;
     }
+    engine_->SetMaxInputShape(temp_max_input_shape);
     engine_->Optimize();
     engine_->InitGraph();
   }
diff --git a/paddle/fluid/inference/anakin/engine.cc b/paddle/fluid/inference/anakin/engine.cc
index b8b0d06d210..176bc1254b5 100644
--- a/paddle/fluid/inference/anakin/engine.cc
+++ b/paddle/fluid/inference/anakin/engine.cc
@@ -33,13 +33,14 @@ namespace inference {
 namespace anakin {
 
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
-AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(bool need_summary,
-                                                            int device,
-                                                            int max_batch_size)
+AnakinEngine<TargetT, PrecisionType, RunType>::AnakinEngine(
+    bool need_summary, int device, int max_batch_size,
+    std::map<std::string, std::vector<int>> max_input_shape)
     : graph_(new AnakinGraphT<TargetT, PrecisionType>()),
       net_(new AnakinNetT<TargetT, PrecisionType, RunType>(need_summary)) {
   device_ = device;
   max_batch_size_ = max_batch_size;
+  max_input_shape_ = max_input_shape;
 }
 
 template <typename TargetT, Precision PrecisionType, OpRunType RunType>
@@ -75,20 +76,31 @@ void AnakinEngine<TargetT, PrecisionType, RunType>::Execute(
     auto *data = tensor->data<float>();
 
     auto fluid_input_shape = framework::vectorize2int(tensor->dims());
+    while (fluid_input_shape.size() < 4) {
+      fluid_input_shape.push_back(1);
+    }
     auto *anakin_input = net_->get_in(input.first);
-    auto net_shape = anakin_input->shape();
+    std::vector<int> max_input_shape = max_input_shape_[input.first];
+    int max_shape_sum =
+        std::accumulate(max_input_shape.begin(), max_input_shape.end(), 1,
+                        std::multiplies<int>());
+
+    PADDLE_ENFORCE(max_shape_sum >= tensor->numel(),
+                   "The Anakin input max shape should be greater than"
+                   " or equal to the real input shape. Please set the max "
+                   "input shape using EnableAnakinEngine.");
+    /*
     if (tensor->numel() > net_shape.count()) {
       graph_->Reshape(input.first, fluid_input_shape);
       net_.reset(new AnakinNetT<TargetT, PrecisionType, RunType>(true));
       net_->init(*graph_);
       anakin_input = net_->get_in(input.first);
     }
+    */
 
     anakin_input->reshape(fluid_input_shape);
-    net_shape = anakin_input->shape();
     ::anakin::saber::Tensor<TargetT> tmp_anakin_tensor(data, TargetT(), 0,
-                                                       // net_shape);
                                                        fluid_input_shape);
     anakin_input->copy_from(tmp_anakin_tensor);
   }
diff --git a/paddle/fluid/inference/anakin/engine.h b/paddle/fluid/inference/anakin/engine.h
index 101ca491678..3835ead1946 100644
--- a/paddle/fluid/inference/anakin/engine.h
+++ b/paddle/fluid/inference/anakin/engine.h
@@ -15,6 +15,7 @@
 #pragma once
 
 #include <algorithm>
+#include <map>
 #include <memory>
 #include <string>
 #include <vector>
@@ -55,8 +56,9 @@ class AnakinEngine {
   using GraphT = ::anakin::graph::Graph<TargetT, PrecisionType>;
 
  public:
-  explicit AnakinEngine(bool need_summary = false, int device = 0,
-                        int max_batch_size = 1);
+  explicit AnakinEngine(
+      bool need_summary = false, int device = 0, int max_batch_size = 1,
+      std::map<std::string, std::vector<int>> max_input_shape = {});
   ~AnakinEngine();
   void InitGraph();
   void SetInputShape(const std::string &name, std::vector<int> shape);
@@ -73,10 +75,17 @@ class AnakinEngine {
   NetT *Net() { return net_.get(); }
   GraphT *Graph() { return graph_.get(); }
   std::unique_ptr<AnakinEngine> Clone();
+  const std::map<std::string, std::vector<int>> &GetMaxInputShape() {
+    return max_input_shape_;
+  }
+  void SetMaxInputShape(std::map<std::string, std::vector<int>> shape) {
+    max_input_shape_ = shape;
+  }
+  int GetMaxBatchSize() { return max_batch_size_; }
   void Freeze();
   void Optimize();
   void Save(std::string path) { graph_->save(path); }
-  int GetMaxBatch() { return max_batch_size_; }
+  int GetDevice() { return device_; }
   // void SaveSerializedData(std::string& data) { graph_->save_to_string(data);
   // }
   // void LoadSerializedData(const std::string& data) {
@@ -87,6 +96,7 @@ class AnakinEngine {
 
  private:
   int max_batch_size_;
+  std::map<std::string, std::vector<int>> max_input_shape_;
   int device_;
   std::unique_ptr<GraphT> graph_;
   std::unique_ptr<NetT> net_;
@@ -104,11 +114,13 @@ class AnakinEngineManager {
     return engines_.at(name).get();
   }
 
-  AnakinNvEngineT *Create(bool need_summary, int device, int max_batch_size,
-                          std::string engine_name) {
+  AnakinNvEngineT *Create(
+      bool need_summary, int device, int max_batch_size,
+      std::map<std::string, std::vector<int>> max_input_shape,
+      std::string engine_name) {
     std::unique_lock<std::mutex> lk(mut_);
-    auto *p = new AnakinEngine<NV, Precision::FP32>(need_summary, device,
-                                                    max_batch_size);
+    auto *p = new AnakinEngine<NV, Precision::FP32>(
+        need_summary, device, max_batch_size, max_input_shape);
     engines_[engine_name].reset(p);
     return p;
   }
diff --git a/paddle/fluid/inference/anakin/op_teller.cc b/paddle/fluid/inference/anakin/op_teller.cc
index 3166f68b67a..3270f5b57a1 100644
--- a/paddle/fluid/inference/anakin/op_teller.cc
+++ b/paddle/fluid/inference/anakin/op_teller.cc
@@ -38,6 +38,7 @@ struct SimpleOpTypeSetTeller : public Teller {
     teller_set.insert("transpose2");
     teller_set.insert("density_prior_box");
     teller_set.insert("detection_out");
+    teller_set.insert("scale");
   }
 
   bool operator()(const std::string& op_type,
diff --git a/paddle/fluid/inference/analysis/argument.h b/paddle/fluid/inference/analysis/argument.h
index 87aceba4793..992c779711a 100644
--- a/paddle/fluid/inference/analysis/argument.h
+++ b/paddle/fluid/inference/analysis/argument.h
@@ -57,6 +57,7 @@ struct Argument {
   using unique_ptr_t = std::unique_ptr<void, std::function<void(void *)>>;
   using fusion_statis_t = std::unordered_map<std::string, int>;
   using engine_opt_info_t = std::map<std::string, std::string>;
+  using anakin_max_shape_t = std::map<std::string, std::vector<int>>;
 
   bool Has(const std::string& key) const { return valid_fields_.count(key); }
 
@@ -150,6 +151,8 @@ struct Argument {
   DECL_ARGUMENT_FIELD(tensorrt_use_static_engine, TensorRtUseStaticEngine,
                       bool);
 
+  DECL_ARGUMENT_FIELD(anakin_max_input_shape, AnakinMaxInputShape,
+                      anakin_max_shape_t);
   DECL_ARGUMENT_FIELD(anakin_max_batch_size, AnakinMaxBatchSize, int);
   DECL_ARGUMENT_FIELD(use_anakin, UseAnakin, bool);
 
diff --git a/paddle/fluid/inference/analysis/ir_pass_manager.cc b/paddle/fluid/inference/analysis/ir_pass_manager.cc
index 3dc9c347b5f..b0e07fdf132 100644
--- a/paddle/fluid/inference/analysis/ir_pass_manager.cc
+++ b/paddle/fluid/inference/analysis/ir_pass_manager.cc
@@ -77,6 +77,8 @@ void IRPassManager::CreatePasses(Argument *argument,
       pass->Set("engine_opt_info", new std::map<std::string, std::string>(
                                        argument->engine_opt_info()));
       pass->Set("predictor_id", new int(argument->predictor_id()));
+      pass->Set("max_input_shape", new std::map<std::string, std::vector<int>>(
+                                       argument->anakin_max_input_shape()));
       pass->Set("max_batch_size", new int(argument->anakin_max_batch_size()));
     }
 
diff --git a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
index b2bd1ec0ea1..de41e05f1a6 100644
--- a/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/anakin_subgraph_pass.cc
@@ -13,6 +13,7 @@
 // limitations under the License.
 
 #include <algorithm>
+#include <map>
 #include <memory>
 #include <set>
 #include <string>
@@ -256,11 +257,14 @@ void AnakinSubgraphPass::CreateAnakinOp(
                       input_names_with_id, output_names_with_id,
                       std::to_string(predictor_id));
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
-  int max_batch_size = Get<int>("max_batch_size");
+  auto max_input_shape =
+      Get<std::map<std::string, std::vector<int>>>("max_input_shape");
+  auto max_batch_size = Get<int>("max_batch_size");
 
   auto *anakin_engine =
       inference::Singleton<anakin::AnakinEngineManager>::Global().Create(
-          true, Get<int>("gpu_device_id"), max_batch_size, engine_key);
+          true, Get<int>("gpu_device_id"), max_batch_size, max_input_shape,
+          engine_key);
 
   auto *scope = param_scope();
   std::unordered_set<std::string> param_set(params.begin(), params.end());
@@ -268,7 +272,7 @@ void AnakinSubgraphPass::CreateAnakinOp(
 
   inference::Singleton<inference::anakin::AnakinOpConverter>::Global()
       .ConvertBlockToAnakinEngine(
-          &block_desc_temp, *scope,
+          &block_desc_temp, scope,
           std::vector<std::string>(input_names.begin(), input_names.end()),
           param_set, output_mapping, anakin_engine);
 }
diff --git a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
index 1800f06f2de..69d6ab1022e 100644
--- a/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
+++ b/paddle/fluid/inference/analysis/ir_passes/tensorrt_subgraph_pass.cc
@@ -214,13 +214,16 @@ void TensorRtSubgraphPass::CreateTensorRTOp(
                       std::to_string(0));
 
   // Get "" when there is no cached calibration table data.
-  std::string calibration_data = GetTrtCalibTableData(
-      Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
+  bool load_from_memory = Get<bool>("model_from_memory");
+  std::string calibration_data = "";
+  if (!load_from_memory) {
+    calibration_data = GetTrtCalibTableData(
+        Get<std::string>("model_opt_cache_dir"), engine_key, enable_int8);
+  }
   SetAttr(op_desc->Proto(), "calibration_data", calibration_data);
 
   SetAttr(op_desc->Proto(), "enable_int8", enable_int8);
   SetAttr(op_desc->Proto(), "engine_key", engine_key);
-  bool load_from_memory = Get<bool>("model_from_memory");
   std::string trt_engine_serialized_data = "";
   if (load_from_memory) {
     std::map<std::string, std::string> engine_opt_info =
diff --git a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
index 8360963f736..d13ec7608c3 100644
--- a/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
+++ b/paddle/fluid/inference/analysis/passes/ir_params_sync_among_devices_pass.cc
@@ -30,7 +30,6 @@ void IrParamsSyncAmongDevicesPass::RunImpl(Argument *argument) {
   // The parameters are on the cpu, therefore, synchronization is not necessary.
   if (!argument->use_gpu()) return;
-  return;
 
   auto &graph = argument->main_graph();
   std::vector<std::string> repetitive_params;
diff --git a/paddle/fluid/inference/api/analysis_config.cc b/paddle/fluid/inference/api/analysis_config.cc
index 3c17f49fa35..7bfdada4966 100644
--- a/paddle/fluid/inference/api/analysis_config.cc
+++ b/paddle/fluid/inference/api/analysis_config.cc
@@ -111,6 +111,7 @@ AnalysisConfig::AnalysisConfig(const AnalysisConfig &other) {
 
   CP_MEMBER(use_anakin_);
   CP_MEMBER(anakin_max_batchsize_);
+  CP_MEMBER(anakin_max_input_shape_);
 
   // Ir related.
   CP_MEMBER(enable_ir_optim_);
@@ -355,8 +356,11 @@ void AnalysisConfig::SwitchIrDebug(int x) {
   ir_debug_ = x;
   Update();
 }
-void AnalysisConfig::EnableAnakinEngine(int max_batch_size) {
+void AnalysisConfig::EnableAnakinEngine(
+    int max_batch_size,
+    std::map<std::string, std::vector<int>> max_input_shape) {
   anakin_max_batchsize_ = max_batch_size;
+  anakin_max_input_shape_ = max_input_shape;
   use_anakin_ = true;
   Update();
 }
diff --git a/paddle/fluid/inference/api/analysis_predictor.cc b/paddle/fluid/inference/api/analysis_predictor.cc
index 9c992602e0a..bcae080bc9a 100644
--- a/paddle/fluid/inference/api/analysis_predictor.cc
+++ b/paddle/fluid/inference/api/analysis_predictor.cc
@@ -380,6 +380,7 @@ void AnalysisPredictor::OptimizeInferenceProgram() {
 
   if (config_.use_gpu() && config_.anakin_engine_enabled()) {
     argument_.SetAnakinMaxBatchSize(config_.anakin_max_batchsize_);
+    argument_.SetAnakinMaxInputShape(config_.anakin_max_input_shape_);
     LOG(INFO) << "Anakin subgraph engine is enabled";
   }
 
@@ -835,3 +836,4 @@ USE_ANAKIN_CONVERTER(softmax);
 
 USE_ANAKIN_CONVERTER(detection_out);
 USE_ANAKIN_CONVERTER(density_prior_box);
+USE_ANAKIN_CONVERTER(scale);
diff --git a/paddle/fluid/inference/api/paddle_analysis_config.h b/paddle/fluid/inference/api/paddle_analysis_config.h
index 65dd669c95f..9a29f8f77ed 100644
--- a/paddle/fluid/inference/api/paddle_analysis_config.h
+++ b/paddle/fluid/inference/api/paddle_analysis_config.h
@@ -145,7 +145,9 @@ struct AnalysisConfig {
   /**
    * \brief Turn on the usage of Anakin sub-graph engine.
    */
-  void EnableAnakinEngine(int max_batch_size = 1);
+  void EnableAnakinEngine(
+      int max_batch_size = 1,
+      std::map<std::string, std::vector<int>> max_input_shape = {});
 
   /** A boolean state indicating whether the Anakin sub-graph engine is used.
    */
@@ -271,6 +273,7 @@ struct AnalysisConfig {
   mutable std::unique_ptr<PassStrategy> pass_builder_;
   bool use_anakin_{false};
   int anakin_max_batchsize_;
+  std::map<std::string, std::vector<int>> anakin_max_input_shape_;
   std::map<std::string, std::string> engine_opt_info_;
 };
 
diff --git a/paddle/fluid/inference/api/paddle_pass_builder.cc b/paddle/fluid/inference/api/paddle_pass_builder.cc
index f6d82a57d29..8db636274fb 100644
--- a/paddle/fluid/inference/api/paddle_pass_builder.cc
+++ b/paddle/fluid/inference/api/paddle_pass_builder.cc
@@ -71,7 +71,11 @@ void GpuPassStrategy::EnableMKLDNN() {
 
 // The following passes works for Anakin sub-graph engine.
 const std::vector<std::string> kAnakinSubgraphPasses({
     "infer_clean_graph_pass",                   //
+    "simplify_anakin_detection_pattern_pass5",  //
+    "simplify_anakin_detection_pattern_pass4",  //
     "simplify_anakin_detection_pattern_pass3",  //
+    "simplify_anakin_detection_pattern_pass2",  //
+    "anakin_fillconstant_elementwisemul_fuse",  //
     "fc_fuse_pass",                             //
     "conv_elementwise_add_fuse_pass",           //
     "conv_bn_fuse_pass",                        //
diff --git a/paddle/fluid/operators/anakin/anakin_engine_op.h b/paddle/fluid/operators/anakin/anakin_engine_op.h
index 7a70836652d..bbe9a221b2c 100644
--- a/paddle/fluid/operators/anakin/anakin_engine_op.h
+++ b/paddle/fluid/operators/anakin/anakin_engine_op.h
@@ -97,6 +97,7 @@ class AnakinEngineOp : public framework::OperatorBase {
       if (param_names_.count(x)) continue;
       auto &t =
           inference::analysis::GetFromScope<framework::LoDTensor>(scope, x);
+      /*
       auto t_shape = framework::vectorize(t.dims());
       auto *anakin_input = engine->Net()->get_in(x);
       auto net_shape = anakin_input->shape();
@@ -112,20 +113,16 @@ class AnakinEngineOp : public framework::OperatorBase {
         t.mutable_data<float>(dev_place);
         TensorCopySync(temp_t, dev_place, &t);
       }
+      */
       inputs.insert({x, &t});
     }
 
     std::map<std::string, framework::LoDTensor *> outputs;
     int output_index = 0;
     for (const auto &y : Outputs("Ys")) {
-      // std::vector<int> ddim =
-      //     engine->Net()->get_out(output_maps[output_index])->valid_shape();
-      // we need get the output anakin output shape.
       auto *fluid_v = scope.FindVar(y);
       PADDLE_ENFORCE_NOT_NULL(fluid_v, "no output variable called %s", y);
       auto *fluid_t = fluid_v->GetMutable<framework::LoDTensor>();
-      // fluid_t->Resize(framework::make_ddim(ddim));
-      // fluid_t->mutable_data<float>(boost::get<platform::CUDAPlace>(dev_place));
       outputs.insert({output_maps[output_index], fluid_t});
       output_index += 1;
     }
-- 
GitLab
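
Why the fill_constant + elementwise_mul pair can be rewritten as a single
scale op (the rewrite performed by anakin_fillconstant_elementwisemul_fuse.cc):
elementwise_mul(X, fill_constant(value)) multiplies every element of X by the
constant, while scale computes scale * x + bias with bias_after_scale = true;
with scale = value and bias = 0 the two are elementwise identical. A
standalone check of that identity, independent of Paddle (all names are local
to this sketch):

    #include <cassert>
    #include <vector>

    int main() {
      const float value = 2.5f;                    // fill_constant "value" attr
      std::vector<float> x = {1.0f, -3.0f, 0.5f};  // elementwise_mul input X
      for (float v : x) {
        float mul_out = v * value;           // fill_constant + elementwise_mul
        float scale_out = value * v + 0.0f;  // scale(scale=value, bias=0)
        assert(mul_out == scale_out);
      }
      return 0;
    }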
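
The per-input shape resolution in ConvertBlockToAnakinEngine follows one rule:
the batch dimension always comes from max_batch_size, and dimensions 1..3 come
from the user-supplied max_input_shape when an entry exists, otherwise from
the variable's 4-D shape in the block desc. A hedged standalone restatement of
that rule (function and parameter names are local to this sketch, not Paddle
APIs):

    #include <map>
    #include <stdexcept>
    #include <string>
    #include <vector>

    std::vector<int> ResolveMaxInputShape(
        const std::string &name, int max_batch_size,
        const std::map<std::string, std::vector<int>> &user_max_shapes,
        const std::vector<int> &block_desc_shape) {
      std::vector<int> shape(4);
      shape[0] = max_batch_size;  // batch dim is always the configured maximum
      auto it = user_max_shapes.find(name);
      const std::vector<int> &src =
          it != user_max_shapes.end() ? it->second : block_desc_shape;
      if (src.size() != 4) {
        throw std::invalid_argument("max input shapes must be 4-D");
      }
      for (int i = 1; i < 4; ++i) shape[i] = src[i];  // copy C, H, W
      return shape;
    }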
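
With the new max_input_shape argument, a client enables the Anakin engine
roughly as below. This is a minimal sketch against the API added in this
patch; the model path and the input name "data" with its shape values are
illustrative placeholders. Note that dimension 0 of each entry is taken from
max_batch_size, so only dimensions 1..3 of the supplied shape are used:

    #include <map>
    #include <string>
    #include <vector>
    #include "paddle/fluid/inference/api/paddle_analysis_config.h"
    #include "paddle/fluid/inference/api/paddle_inference_api.h"

    int main() {
      paddle::AnalysisConfig config;
      config.SetModel("/path/to/model");  // hypothetical model directory
      config.EnableUseGpu(100 /*memory pool MB*/, 0 /*device id*/);

      // At Execute() time every real input must fit inside its max shape;
      // the engine now checks this instead of rebuilding the net on resize.
      std::map<std::string, std::vector<int>> max_input_shape;
      max_input_shape["data"] = {8, 3, 224, 224};
      config.EnableAnakinEngine(8 /*max_batch_size*/, max_input_shape);

      auto predictor = paddle::CreatePaddlePredictor(config);
      return 0;
    }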