Unverified commit 5f72a91b, authored by huzhiqiang, committed by GitHub

Tailoring cherrypick (#2263)

* cherry-pick model-tailoring into release/v2.1.0 test=develop
Parent commit: 71684f98
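The user-facing entry point of this change is the new record_info argument of PaddlePredictor::SaveOptimizedModel (see the paddle_api.h and cxx_api_test.cc hunks below). A minimal sketch of calling it through the full C++ API follows; the model directory, valid places, and output path are illustrative assumptions, not part of this commit:

#include <vector>

#include "lite/api/paddle_api.h"          // CxxConfig, CreatePaddlePredictor, Place
#include "lite/api/paddle_use_kernels.h"  // needed when linking the full static lib
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"

int main() {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir("./mobilenet_v1");  // hypothetical model directory
  config.set_valid_places(
      {paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)}});

  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);

  // The third argument is the new record_info flag. When it is true, saving
  // also writes the hidden tailoring files (.tailored_ops_list,
  // .tailored_kernels_list and the matching *_source_list files) next to the
  // optimized model, which a LITE_BUILD_TAILOR=ON build later reads through
  // LITE_OPTMODEL_DIR.
  predictor->SaveOptimizedModel("./mobilenet_v1_opt",
                                paddle::lite_api::LiteModelType::kNaiveBuffer,
                                true);
  return 0;
}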
......@@ -70,6 +70,7 @@ lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
# publish options
lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed later.
if(ANDROID OR IOS OR ARMLINUX)
......
......@@ -241,6 +241,10 @@ set(host_kernels CACHE INTERNAL "host kernels")
set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt")
file(WRITE ${kernels_src_list} "") # clean
if(LITE_BUILD_TAILOR)
set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list")
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA)
# level: one of (basic, extra)
......@@ -252,6 +256,15 @@ function(add_kernel TARGET device level)
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(LITE_BUILD_TAILOR)
foreach(src ${args_SRCS})
list (FIND tailored_kernels_list ${src} _index)
if (${_index} EQUAL -1)
return()
endif()
endforeach()
endif()
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
......@@ -338,6 +351,10 @@ endfunction()
set(ops CACHE INTERNAL "ops")
set(ops_src_list "${CMAKE_BINARY_DIR}/ops_src_list.txt")
file(WRITE ${ops_src_list} "") # clean
if(LITE_BUILD_TAILOR)
set(tailored_ops_list_path "${LITE_OPTMODEL_DIR}/.tailored_ops_source_list")
file(STRINGS ${tailored_ops_list_path} tailored_ops_list)
endif()
# add an operator
# level: one of (basic, extra)
function(add_operator TARGET level)
......@@ -348,16 +365,24 @@ function(add_operator TARGET level)
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
set(ops "${ops};${TARGET}" CACHE INTERNAL "source")
foreach(src ${args_SRCS})
if(LITE_BUILD_TAILOR)
list(FIND tailored_ops_list ${src} _index)
if (${_index} EQUAL -1)
return()
endif()
endif()
file(APPEND ${ops_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
set(ops "${ops};${TARGET}" CACHE INTERNAL "source")
lite_cc_library(${TARGET} SRCS ${args_SRCS}
DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS}
......
......@@ -9,12 +9,17 @@ if (LITE_ON_TINY_PUBLISH)
set(CMAKE_C_FLAGS_RELEASE "-Os -DNDEBUG")
endif()
set(light_lib_DEPS light_api paddle_api paddle_api_light optimizer)
if ((NOT LITE_ON_TINY_PUBLISH) AND (ARM_TARGET_OS STREQUAL "android"))
if ((NOT LITE_ON_TINY_PUBLISH) AND (LITE_WITH_X86 OR ARM_TARGET_OS STREQUAL "android"))
#full api dynamic library
add_library(paddle_full_api_shared SHARED "")
target_sources(paddle_full_api_shared PUBLIC ${__lite_cc_files} paddle_api.cc light_api.cc cxx_api.cc cxx_api_impl.cc light_api_impl.cc)
add_dependencies(paddle_full_api_shared op_list_h kernel_list_h framework_proto)
target_link_libraries(paddle_full_api_shared framework_proto)
add_dependencies(lite_compile_deps paddle_full_api_shared)
if(LITE_WITH_X86)
add_dependencies(paddle_full_api_shared xxhash)
target_link_libraries(paddle_full_api_shared xxhash)
endif()
#light api dynamic library
lite_cc_library(paddle_light_api_shared MODULE
......
......@@ -15,6 +15,7 @@
#include "lite/api/cxx_api.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
......@@ -23,8 +24,16 @@
namespace paddle {
namespace lite {
static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] =
".tailored_ops_source_list";
static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list";
static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
".tailored_kernels_source_list";
static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
void Predictor::SaveModel(const std::string &dir,
lite_api::LiteModelType model_type) {
lite_api::LiteModelType model_type,
bool record_info) {
if (!program_) {
GenRuntimeProgram();
}
......@@ -40,6 +49,83 @@ void Predictor::SaveModel(const std::string &dir,
default:
LOG(FATAL) << "Unknown model type";
}
if (record_info) {
SaveOpKernelInfo(dir);
}
}
void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info;
std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions();
for (auto &node : instructions_) {
// parse op type information
auto op = node.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) +
"," + PrecisionRepr(node.kernel()->precision()) + "," +
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
// get source file name from op type and kernel type
auto op2pathmap = OpKernelInfoCollector::Global().GetOp2PathDict();
auto kernel2pathmap = OpKernelInfoCollector::Global().GetKernel2PathDict();
// write used op and kernel info into files
std::string opf_path = model_dir + "/" + TAILORD_OPS_LIST_NAME;
std::string opf_source_path =
model_dir + "/" + TAILORD_OPS_SOURCE_LIST_FILENAME;
std::string kpf_path = model_dir + "/" + TAILORD_KERNELS_LIST_NAME;
std::string kpf_source_path =
model_dir + "/" + TAILORD_KERNELS_SOURCE_LIST_FILENAME;
std::map<std::string, std::string> op2path;
std::FILE *opf = std::fopen(opf_path.c_str(), "w");
std::FILE *opf_source = std::fopen(opf_source_path.c_str(), "w");
std::FILE *kpf = std::fopen(kpf_path.c_str(), "w");
std::FILE *kpf_source = std::fopen(kpf_source_path.c_str(), "w");
std::vector<std::string> opcompile;
std::vector<std::string> kernelcompile;
if (nullptr == opf || nullptr == opf_source || nullptr == kpf ||
nullptr == kpf_source) {
LOG(FATAL) << "failed to create tailoring info files in: " << model_dir;
}
for (auto op_info = ops_info.begin(); op_info != ops_info.end(); op_info++) {
fputs(op_info->c_str(), opf);
fputc('\n', opf);
std::string op_path = op2pathmap[*op_info];
fputs(op_path.c_str(), opf_source);
fputc('\n', opf_source);
}
std::fclose(opf_source);
std::fclose(opf);
LOG(INFO) << "operators information of tailored model is stored into: "
<< opf_path;
// write Kernel_type and Kernel_path into file
for (auto kernel_info = kernels_info.begin();
kernel_info != kernels_info.end();
kernel_info++) {
fputs(kernel_info->c_str(), kpf);
fputc('\n', kpf);
std::string kernel_path = kernel2pathmap[*kernel_info];
fputs(kernel_path.c_str(), kpf_source);
fputc('\n', kpf_source);
if (kernel_path == "conv_compute.cc") {
fputs(
"conv_depthwise.cc\nconv_direct.cc\nconv_gemmlike.cc\nconv_"
"winograd.cc\n",
kpf_source);
}
}
std::fclose(kpf_source);
std::fclose(kpf);
LOG(INFO) << "kernels information of tailored model is stored into: "
<< kpf_path;
}
lite::Tensor *Predictor::GetInput(size_t offset) {
......@@ -61,7 +147,7 @@ void Predictor::PrepareFeedFetch() {
auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc *> feeds;
std::vector<cpp::OpDesc *> fetchs;
for (int i = 0; i < current_block->OpsSize(); i++) {
for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") {
feeds.push_back(op);
......@@ -71,11 +157,11 @@ void Predictor::PrepareFeedFetch() {
}
input_names_.resize(feeds.size());
output_names_.resize(fetchs.size());
for (int i = 0; i < feeds.size(); i++) {
for (size_t i = 0; i < feeds.size(); i++) {
input_names_[feeds[i]->GetAttr<int>("col")] =
feeds[i]->Output("Out").front();
}
for (int i = 0; i < fetchs.size(); i++) {
for (size_t i = 0; i < fetchs.size(); i++) {
output_names_[fetchs[i]->GetAttr<int>("col")] =
fetchs[i]->Input("X").front();
}
......@@ -191,7 +277,7 @@ lite::Tensor *Predictor::GetInputByName(const std::string &name) {
if (element == input_names_.end()) {
LOG(ERROR) << "Model do not have input named with: [" << name
<< "], model's inputs include:";
for (int i = 0; i < input_names_.size(); i++) {
for (size_t i = 0; i < input_names_.size(); i++) {
LOG(ERROR) << "[" << input_names_[i] << "]";
}
return nullptr;
......
......@@ -89,7 +89,9 @@ class LITE_API Predictor {
// This method is disabled on mobile builds to avoid unnecessary dependencies.
void SaveModel(
const std::string& dir,
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf);
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool record_info = false);
void SaveOpKernelInfo(const std::string& model_dir);
#ifdef LITE_WITH_TRAIN
void Run(const std::vector<framework::Tensor>& tensors) {
......@@ -137,9 +139,10 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
std::unique_ptr<lite_api::Tensor> GetInputByName(
const std::string& name) override;
void SaveOptimizedModel(const std::string& model_dir,
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kProtobuf) override;
void SaveOptimizedModel(
const std::string& model_dir,
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool record_info = false) override;
private:
Predictor raw_predictor_;
......
......@@ -65,8 +65,9 @@ std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName(
}
void CxxPaddleApiImpl::SaveOptimizedModel(const std::string &model_dir,
lite_api::LiteModelType model_type) {
raw_predictor_.SaveModel(model_dir, model_type);
lite_api::LiteModelType model_type,
bool record_info) {
raw_predictor_.SaveModel(model_dir, model_type, record_info);
}
} // namespace lite
......
......@@ -16,7 +16,10 @@
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#endif
// "all_kernel_faked.cc" and "kernel_src_map.h" are created automatically during
// model_optimize_tool's compiling period
#include "all_kernel_faked.cc" // NOLINT
#include "kernel_src_map.h" // NOLINT
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
......@@ -35,6 +38,11 @@ DEFINE_string(
"protobuf",
"store type of the output optimized model. protobuf/naive_buffer");
DEFINE_bool(display_kernels, false, "Display kernel information");
DEFINE_bool(record_tailoring_info,
            false,
            "Record kernel and operator information of the optimized model "
            "for tailored compilation; the information is stored in the "
            "optimized model directory as hidden files");
DEFINE_string(optimize_out, "", "path of the output optimized model");
DEFINE_string(valid_targets,
"arm",
......@@ -104,8 +112,14 @@ void Main() {
} else {
LOG(FATAL) << "Unsupported Model type :" << FLAGS_optimize_out_type;
}
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
predictor->SaveOptimizedModel(FLAGS_optimize_out, model_type);
predictor->SaveOptimizedModel(
FLAGS_optimize_out, model_type, FLAGS_record_tailoring_info);
if (FLAGS_record_tailoring_info) {
LOG(INFO) << "Record the information of tailored model into :"
<< FLAGS_optimize_out;
}
}
} // namespace lite_api
......
......@@ -145,7 +145,8 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); }
void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); }
void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir,
LiteModelType model_type) {
LiteModelType model_type,
bool record_info) {
LOG(FATAL)
<< "The SaveOptimizedModel API is only supported by CxxConfig predictor.";
}
......
......@@ -97,7 +97,8 @@ class LITE_API PaddlePredictor {
/// CxxConfig, and the persisted model can be reused for MobileConfig.
virtual void SaveOptimizedModel(
const std::string& model_dir,
LiteModelType model_type = LiteModelType::kProtobuf);
LiteModelType model_type = LiteModelType::kProtobuf,
bool record_info = false);
virtual ~PaddlePredictor() = default;
};
......
......@@ -64,8 +64,8 @@ TEST(CxxApi, run) {
EXPECT_NEAR(out[1], -28.8729, 1e-3);
predictor->SaveOptimizedModel(FLAGS_model_dir + ".opt2");
predictor->SaveOptimizedModel(FLAGS_model_dir + ".opt2.naive",
LiteModelType::kNaiveBuffer);
predictor->SaveOptimizedModel(
FLAGS_model_dir + ".opt2.naive", LiteModelType::kNaiveBuffer, true);
}
// Demo1 for Mobile Devices :Load model from file and run
......
......@@ -71,6 +71,8 @@ add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py
${kernels_src_list}
${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h
"${LITE_OPTMODEL_DIR}/.tailored_kernels_list"
LITE_BUILD_TAILOR
OUTPUT kernels.h # not a real path to the output to force it execute every time.
)
# A trick to generate the paddle_use_ops.h
......@@ -78,6 +80,8 @@ add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_op_registry.py
${ops_src_list}
${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h
"${LITE_OPTMODEL_DIR}/.tailored_ops_list"
LITE_BUILD_TAILOR
OUTPUT ops.h # not a real path to the output to force it execute every time.
)
# generate fake kernels for memory_optimize_tool
......@@ -85,6 +89,7 @@ add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py
${kernels_src_list}
${CMAKE_BINARY_DIR}/all_kernel_faked.cc
${CMAKE_BINARY_DIR}/kernel_src_map.h
OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time.
)
add_custom_target(op_list_h DEPENDS ops.h)
......
......@@ -16,6 +16,7 @@
#include <memory>
#include <vector>
#include "lite/core/mir/fusion/conv_bn_fuser.h"
#include "lite/core/mir/graph_visualize_pass.h"
#include "lite/core/mir/pass_registry.h"
namespace paddle {
......@@ -23,11 +24,19 @@ namespace lite {
namespace mir {
void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::ConvBNFuser fuser("conv2d");
fuser(graph.get());
// initialize fuser params
std::vector<bool> conv_has_bias_cases{true, false};
std::vector<std::string> conv_type_cases{"conv2d", "depthwise_conv2d"};
fusion::ConvBNFuser fuser2("depthwise_conv2d");
fuser2(graph.get());
// start fusing with the params above
for (auto conv_has_bias : conv_has_bias_cases) {
for (auto conv_type : conv_type_cases) {
VLOG(4) << "conv_has_bias:" << conv_has_bias
<< " conv_type:" << conv_type;
fusion::ConvBNFuser fuser(conv_type, conv_has_bias);
fuser(graph.get());
}
}
}
} // namespace mir
......@@ -35,5 +44,4 @@ void ConvBNFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
} // namespace paddle
REGISTER_MIR_PASS(lite_conv_bn_fuse_pass, paddle::lite::mir::ConvBNFusePass)
.BindTargets({TARGET(kAny)})
.BindKernel("elementwise_add");
.BindTargets({TARGET(kAny)});
......@@ -14,6 +14,7 @@
#include "lite/core/mir/fusion/conv_bn_fuser.h"
#include <memory>
#include <unordered_set>
#include <vector>
namespace paddle {
......@@ -30,7 +31,8 @@ void ConvBNFuser::BuildPattern() {
auto* conv = OpNode("conv2d", conv_type_)->assert_is_op(conv_type_);
auto* conv_out = VarNode("conv_out")
->assert_is_op_output(conv_type_, "Output")
->assert_is_op_input("batch_norm", "X");
->assert_is_op_input("batch_norm", "X")
->AsIntermediate();
auto* bn_scale = VarNode("bn_scale")
->assert_is_op_input("batch_norm", "Scale")
......@@ -61,34 +63,30 @@ void ConvBNFuser::BuildPattern() {
->assert_is_op_output("batch_norm", "SavedVariance")
->AsIntermediate();
conv->LinksFrom({conv_input, conv_weight}).LinksTo({conv_out});
if (conv_has_bias_) {
auto* conv_bias = VarNode("conv_bias")
->assert_is_op_input(conv_type_, "Bias")
->AsInput()
->AsIntermediate();
conv->LinksFrom({conv_input, conv_weight, conv_bias}).LinksTo({conv_out});
} else {
conv->LinksFrom({conv_input, conv_weight}).LinksTo({conv_out});
}
bn->LinksFrom({conv_out, bn_scale, bn_bias, bn_mean, bn_var})
.LinksTo({bn_out, bn_mean_out, bn_saved_mean, bn_saved_var, bn_var_out});
}
void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto eltwise_op = LiteOpRegistry::Global().Create("elementwise_add");
auto conv_instruct = matched.at("conv2d")->stmt();
auto conv_op_desc = conv_instruct->mutable_op_info();
auto conv = conv_instruct->op();
auto* scope = conv->scope();
auto& valid_places = conv->valid_places();
auto conv_weight_t = scope->FindVar(matched.at("conv_weight")->arg()->name)
->GetMutable<lite::Tensor>();
auto conv_weight_dims = conv_weight_t->dims();
size_t weight_num = conv_weight_t->data_size();
// bn
auto bn_scale_t = scope->FindVar(matched.at("bn_scale")->arg()->name)
->GetMutable<lite::Tensor>();
size_t bias_size = bn_scale_t->data_size();
auto bn_scale_d = bn_scale_t->mutable_data<float>();
CHECK_EQ(bias_size, static_cast<size_t>(conv_weight_dims[0]))
<< "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights";
auto bn_mean_t = scope->FindVar(matched.at("bn_mean")->arg()->name)
->GetMutable<lite::Tensor>();
auto bn_mean_d = bn_mean_t->mutable_data<float>();
......@@ -102,59 +100,102 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
auto bn_bias_d = bn_bias_t->mutable_data<float>();
auto eps = matched.at("bn")->stmt()->op_info()->GetAttr<float>("epsilon");
auto conv_op_desc = conv_instruct->mutable_op_info();
// conv
auto conv_weight_t = scope->FindVar(matched.at("conv_weight")->arg()->name)
->GetMutable<lite::Tensor>();
CHECK_EQ(static_cast<size_t>(bn_scale_t->data_size()),
static_cast<size_t>(conv_weight_t->dims()[0]))
<< "The BN bias's size should be equal to the size of the first "
<< "dim size of the conv weights";
size_t weight_num = conv_weight_t->data_size();
bool enable_int8 = conv_op_desc->HasAttr("enable_int8") ? true : false;
// compute BN alpha and beta
Tensor alpha_tensor, beta_tensor;
alpha_tensor.CopyDataFrom(*bn_bias_t);
beta_tensor.CopyDataFrom(*bn_bias_t);
auto alpha_data = alpha_tensor.mutable_data<float>();
auto beta_data = beta_tensor.mutable_data<float>();
int h = bias_size;
int w = weight_num / bias_size;
int h =
bn_scale_t
->data_size(); // h == bias_size == out channel num of conv weight
int w = weight_num /
(bn_scale_t->data_size()); // w = `conv_weight_num` / bias_size = in
// channel num of conv weight
ComputeAlphaAndBeta(
bn_scale_d, bn_mean_d, bn_var_d, alpha_data, beta_data, eps, h, w);
///////////////////////////////////////////////////////////////////////////////
// Compute ConvBNFuser
// Before fusion
//
// conv(x) = kx + z = y
// bn(y) = ay + b
//
// Note: `alpha_data` is a, `beta_data` is b from `ComputeAlphaAndBeta`
//
// After fusion:
//
// bn(conv(x)) = a(kx + z) + b = akx + az + b
//
// Note: h == bias_size == out channel num of conv weight
// w = `conv_weight_num` / bias_size = in channel num of conv weight
// little difference for int8
///////////////////////////////////////////////////////////////////////////////
if (enable_int8) {
PADDLE_ENFORCE(conv_op_desc->HasAttr("weight_scale"),
"INT8 mode: Conv should has weight_scale attr");
auto conv_weight_d = conv_weight_t->mutable_data<int8_t>();
// compute new conv_weight for int8
auto weight_scale =
conv_op_desc->GetAttr<std::vector<float>>("weight_scale");
for (int i = 0; i < h; i++) {
weight_scale[i] *= alpha_data[i];
for (unsigned int i = 0; i < h; ++i) {
weight_scale[i] *= fabsf(alpha_data[i]);
if (alpha_data[i] < 0.f) {
auto ptr_row = conv_weight_d + i * w;
for (unsigned int j = 0; j < w; ++j) {
ptr_row[j] *= -1;
}
}
}
// Interface like this should be abandoned.
conv_op_desc->SetAttr("weight_scale", weight_scale);
auto update_conv_desc = *conv_instruct->mutable_op_info();
conv_instruct->ResetOp(update_conv_desc, graph->valid_places());
} else {
// compute new conv_weight
auto conv_weight_d = conv_weight_t->mutable_data<float>();
for (int i = 0; i < h; i++) {
for (int j = 0; j < w; j++) {
for (unsigned int i = 0; i < h; ++i) { // n: conv2d output channels
for (unsigned int j = 0; j < w; ++j) { // w: conv2d input channels
conv_weight_d[i * w + j] *= alpha_data[i];
}
}
}
for (int i = 0; i < bias_size; i++) {
// compute new conv_bias
if (conv_has_bias_) {
auto conv_bias_t = scope->FindVar(matched.at("conv_bias")->arg()->name)
->GetMutable<lite::Tensor>();
auto conv_bias_d = conv_bias_t->data<float>();
for (unsigned int i = 0; i < bn_bias_t->data_size();
++i) {  // bias_size == h == conv2d output channels
bn_bias_d[i] += alpha_data[i] * conv_bias_d[i];
}
}
for (unsigned int i = 0; i < bn_bias_t->data_size(); ++i) {
bn_bias_d[i] += beta_data[i];
}
eltwise_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(eltwise_op, valid_places);
IR_NODE_LINK_TO(matched.at("conv_out"), new_op_node);
IR_NODE_LINK_TO(matched.at("bn_bias"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("bn_out"));
}
cpp::OpDesc ConvBNFuser::GenOpDesc(const key2nodes_t& matched) {
cpp::OpDesc op_desc;
op_desc.SetType("elementwise_add");
op_desc.SetInput("X", {matched.at("conv_out")->arg()->name});
op_desc.SetInput("Y", {matched.at("bn_bias")->arg()->name});
op_desc.SetOutput("Out", {matched.at("bn_out")->arg()->name});
op_desc.SetAttr("axis", 1);
return op_desc;
conv_op_desc->SetType(conv_type_);
conv_op_desc->SetInput("Input", {matched.at("conv_input")->arg()->name});
conv_op_desc->SetInput("Filter", {matched.at("conv_weight")->arg()->name});
conv_op_desc->SetOutput("Output", {matched.at("bn_out")->arg()->name});
conv_op_desc->SetInput("Bias",
{matched.at("bn_bias")->arg()->name}); // conv_bias
auto update_conv_desc = *conv_instruct->mutable_op_info();
conv_instruct->ResetOp(update_conv_desc, graph->valid_places());
IR_NODE_LINK_TO(matched.at("bn_bias"), matched.at("conv2d"));
IR_OP_VAR_LINK(matched.at("conv2d"), matched.at("bn_out"));
}
} // namespace fusion
......
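A side note on the arithmetic behind the rewritten ConvBNFuser above: with batch norm folded into its per-channel inference-time affine form, the new conv weights and bias follow the algebra below (a sketch consistent with the in-code comment "bn(conv(x)) = a(kx + z) + b", not text taken from the source):

\[
y_i = \sum_j W_{ij}\,x_j + z_i \ \text{(conv)},\qquad
\hat y_i = a_i\,y_i + b_i \ \text{(bn)},\qquad
a_i = \frac{\gamma_i}{\sqrt{\sigma_i^2 + \varepsilon}},\quad
b_i = \beta_i - a_i\,\mu_i
\]
\[
\hat y_i = \sum_j \left(a_i W_{ij}\right) x_j + \left(a_i z_i + b_i\right)
\;\Longrightarrow\;
W'_{ij} = a_i\,W_{ij},\qquad z'_i = a_i z_i + b_i .
\]

Here i runs over the h output channels and j over the w weights belonging to each channel. In the int8 branch the weights stay int8, so |a_i| is folded into weight_scale instead and the i-th weight row is negated when a_i < 0, as the loop in the hunk above does.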
......@@ -27,12 +27,12 @@ namespace fusion {
class ConvBNFuser : public FuseBase {
public:
explicit ConvBNFuser(const std::string& conv_type) : conv_type_(conv_type) {}
explicit ConvBNFuser(const std::string& conv_type, const bool conv_has_bias)
: conv_type_(conv_type), conv_has_bias_(conv_has_bias) {}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
void ComputeAlphaAndBeta(float* scale_d,
float* mean_d,
float* var_d,
......@@ -51,6 +51,7 @@ class ConvBNFuser : public FuseBase {
private:
std::string conv_type_{"conv2d"};
bool conv_has_bias_{false};
};
} // namespace fusion
......
......@@ -23,14 +23,21 @@ namespace lite {
namespace mir {
void ConvElementwiseFusePass::Apply(const std::unique_ptr<SSAGraph>& graph) {
fusion::ConvElementwiseFuser fuser("conv2d");
fuser(graph.get());
// initialize fuser params
// note: the conv_has_bias == true case must be matched as the first pattern
std::vector<bool> conv_has_bias_cases{true, false};
std::vector<std::string> conv_type_cases{
"conv2d", "depthwise_conv2d", "conv2d_transpose"};
fusion::ConvElementwiseFuser depthwise_fuser("depthwise_conv2d");
depthwise_fuser(graph.get());
fusion::ConvElementwiseFuser conv2d_transpose_fuser("conv2d_transpose");
conv2d_transpose_fuser(graph.get());
// start fusing with the params above
for (auto conv_has_bias : conv_has_bias_cases) {
for (auto conv_type : conv_type_cases) {
VLOG(4) << "conv_has_bias:" << conv_has_bias
<< " conv_type:" << conv_type;
fusion::ConvElementwiseFuser fuser(conv_type, conv_has_bias);
fuser(graph.get());
}
}
}
} // namespace mir
......
......@@ -33,8 +33,7 @@ void ConvElementwiseFuser::BuildPattern() {
->assert_is_persistable_var();
// create op nodes
auto* conv2d =
OpNode("conv2d", conv_type_)->assert_is_op(conv_type_)->AsIntermediate();
auto* conv2d = OpNode("conv2d", conv_type_)->assert_is_op(conv_type_);
auto* add = OpNode("add", "elementwise_add")
->assert_is_op("elementwise_add")
->AsIntermediate();
......@@ -51,6 +50,13 @@ void ConvElementwiseFuser::BuildPattern() {
// create topology.
std::vector<PMNode*> conv2d_inputs{filter, input};
// consider a special case: conv with bias
if (conv_has_bias_) {
PMNode* conv_bias = VarNode("conv_bias")
->assert_is_op_input(conv_type_, "Bias")
->AsIntermediate();
conv2d_inputs.emplace_back(conv_bias);
}
std::vector<PMNode*> add_inputs{conv2d_out, bias};
conv2d_inputs >> *conv2d >> *conv2d_out;
add_inputs >> *add >> *add_out;
......@@ -58,44 +64,49 @@ void ConvElementwiseFuser::BuildPattern() {
void ConvElementwiseFuser::InsertNewNode(SSAGraph* graph,
const key2nodes_t& matched) {
auto op_desc = GenOpDesc(matched);
auto conv_op = LiteOpRegistry::Global().Create(conv_type_);
auto conv_old = matched.at("conv2d")->stmt()->op();
auto* scope = conv_old->scope();
auto& valid_places = conv_old->valid_places();
conv_op->Attach(op_desc, scope);
auto* new_op_node = graph->GraphCreateInstructNode(conv_op, valid_places);
auto conv_instruct = matched.at("conv2d")->stmt();
auto conv_op_desc = conv_instruct->mutable_op_info();
auto* scope = conv_instruct->op()->scope();
IR_NODE_LINK_TO(matched.at("input"), new_op_node);
IR_NODE_LINK_TO(matched.at("filter"), new_op_node);
IR_NODE_LINK_TO(matched.at("bias"), new_op_node);
IR_NODE_LINK_TO(new_op_node, matched.at("output"));
}
/////////////////////////////////////////////////////////////////////////////////////
// ConvElementwiseFuser
// If `conv_bias` already exists, it is accumulated into the
// `elementwise_add` bias, and that combined tensor becomes the new conv bias.
// If `conv_bias` does not exist, the `elementwise_add` bias is used directly
// as the new conv bias.
/////////////////////////////////////////////////////////////////////////////////////
cpp::OpDesc ConvElementwiseFuser::GenOpDesc(const key2nodes_t& matched) {
auto* desc = matched.at("conv2d")->stmt()->op_info();
if (conv_has_bias_ == true && conv_op_desc->HasInput("Bias") &&
conv_op_desc->Input("Bias").size() > 0) {
auto conv_bias_var = scope->FindVar(conv_op_desc->Input("Bias").front());
if (conv_bias_var != nullptr) {
// conv bias
auto conv_bias_t = &(conv_bias_var->Get<lite::Tensor>());
auto conv_bias_d = conv_bias_t->data<float>();
cpp::OpDesc op_desc = *desc;
op_desc.SetType(conv_type_);
op_desc.SetInput("Input", {matched.at("input")->arg()->name});
op_desc.SetInput("Filter", {matched.at("filter")->arg()->name});
op_desc.SetInput("Bias", {matched.at("bias")->arg()->name});
op_desc.SetOutput("Output", {matched.at("output")->arg()->name});
// Other inputs. See operators/conv_op.h
std::vector<std::string> input_arg_names = desc->InputArgumentNames();
// elementwise_add bias
auto elementwise_add_bias_t =
scope->FindVar(matched.at("bias")->arg()->name)
->GetMutable<lite::Tensor>();
auto elementwise_add_bias_d =
elementwise_add_bias_t->mutable_data<float>();
if (std::find(input_arg_names.begin(),
input_arg_names.end(),
"ResidualData") != input_arg_names.end()) {
op_desc.SetInput("ResidualData", desc->Input("ResidualData"));
for (unsigned int i = 0; i < conv_bias_t->data_size(); ++i) {
elementwise_add_bias_d[i] += conv_bias_d[i];
}
}
}
// Only consider strides, padding, groups, dilations for now
op_desc.SetAttr("strides", desc->GetAttr<std::vector<int>>("strides"));
op_desc.SetAttr("paddings", desc->GetAttr<std::vector<int>>("paddings"));
op_desc.SetAttr("groups", desc->GetAttr<int>("groups"));
op_desc.SetAttr("dilations", desc->GetAttr<std::vector<int>>("dilations"));
return op_desc;
conv_op_desc->SetType(conv_type_);
conv_op_desc->SetInput("Input", {matched.at("input")->arg()->name});
conv_op_desc->SetInput("Filter", {matched.at("filter")->arg()->name});
conv_op_desc->SetOutput("Output", {matched.at("output")->arg()->name});
conv_op_desc->SetInput("Bias", {matched.at("bias")->arg()->name});
auto update_conv_desc = *conv_instruct->mutable_op_info();
conv_instruct->ResetOp(update_conv_desc, graph->valid_places());
IR_NODE_LINK_TO(matched.at("bias"), matched.at("conv2d"));
IR_OP_VAR_LINK(matched.at("conv2d"), matched.at("output"));
}
} // namespace fusion
......
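The bias handling described in the comment block above reduces to a per-channel sum; as a sketch (notation assumed, not from the source), with an existing conv bias z and the elementwise_add bias e the fused conv ends up with

\[
b'_i \;=\;
\begin{cases}
z_i + e_i, & \text{if the conv already had a Bias input},\\
e_i, & \text{otherwise},
\end{cases}
\]

which is why InsertNewNode accumulates the old conv bias into the elementwise_add bias tensor and then rebinds that tensor as the conv op's Bias input.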
......@@ -25,16 +25,18 @@ namespace fusion {
class ConvElementwiseFuser : public FuseBase {
public:
explicit ConvElementwiseFuser(const std::string& conv_type) {
explicit ConvElementwiseFuser(const std::string& conv_type,
const bool conv_has_bias) {
conv_type_ = conv_type;
conv_has_bias_ = conv_has_bias;
}
void BuildPattern() override;
void InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) override;
private:
cpp::OpDesc GenOpDesc(const key2nodes_t& matched) override;
std::string conv_type_;
std::string conv_type_{"conv2d"};
bool conv_has_bias_{false};
};
} // namespace fusion
......
......@@ -32,6 +32,43 @@
using LiteType = paddle::lite::Type;
class OpKernelInfoCollector {
public:
static OpKernelInfoCollector &Global() {
static auto *x = new OpKernelInfoCollector;
return *x;
}
void AddOp2path(const std::string &op_name, const std::string &op_path) {
size_t index = op_path.find_last_of('/');
if (index != std::string::npos) {
op2path_.insert(std::pair<std::string, std::string>(
op_name, op_path.substr(index + 1)));
}
}
void AddKernel2path(const std::string &kernel_name,
const std::string &kernel_path) {
size_t index = kernel_path.find_last_of('/');
if (index != std::string::npos) {
kernel2path_.insert(std::pair<std::string, std::string>(
kernel_name, kernel_path.substr(index + 1)));
}
}
void SetKernel2path(
const std::map<std::string, std::string> &kernel2path_map) {
kernel2path_ = kernel2path_map;
}
const std::map<std::string, std::string> &GetOp2PathDict() {
return op2path_;
}
const std::map<std::string, std::string> &GetKernel2PathDict() {
return kernel2path_;
}
private:
std::map<std::string, std::string> op2path_;
std::map<std::string, std::string> kernel2path_;
};
namespace paddle {
namespace lite {
......@@ -59,7 +96,6 @@ class OpLiteRegistor : public Registor<OpClass> {
});
}) {}
};
template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
using KernelRegistryForTarget =
Factory<KernelLite<Target, Precision, Layout>, std::unique_ptr<KernelBase>>;
......@@ -287,6 +323,7 @@ class KernelRegistor : public lite::Registor<KernelType> {
static paddle::lite::OpLiteRegistor<OpClass> LITE_OP_REGISTER_INSTANCE( \
op_type__)(#op_type__); \
int touch_op_##op_type__() { \
OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \
return LITE_OP_REGISTER_INSTANCE(op_type__).Touch(); \
}
......@@ -312,6 +349,9 @@ class KernelRegistor : public lite::Registor<KernelType> {
static KernelClass LITE_KERNEL_INSTANCE( \
op_type__, target__, precision__, layout__, alias__); \
int touch_##op_type__##target__##precision__##layout__##alias__() { \
OpKernelInfoCollector::Global().AddKernel2path( \
#op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \
__FILE__); \
LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \
.Touch(); \
return 0; \
......
......@@ -21,7 +21,7 @@ namespace lite {
namespace fluid {
using LoD = std::vector<std::vector<size_t>>;
LoD ToAbsOffset(const LoD &in) {
static LoD ToAbsOffset(const LoD &in) {
// the lowest level stores relative offsets
if (in.empty() || in.size() == 1) return in;
LoD result = in;
......
......@@ -17,6 +17,8 @@ BUILD_EXTRA=OFF
BUILD_JAVA=ON
BUILD_PYTHON=OFF
BUILD_DIR=$(pwd)
OPTMODEL_DIR=""
BUILD_TAILOR=OFF
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
......@@ -94,6 +96,8 @@ function make_tiny_publish_so {
-DLITE_ON_TINY_PUBLISH=ON \
-DANDROID_STL_TYPE=$android_stl \
-DLITE_BUILD_EXTRA=$BUILD_EXTRA \
-DLITE_BUILD_TAILOR=$BUILD_TAILOR \
-DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make publish_inference -j$NUM_PROC
......@@ -133,6 +137,8 @@ function make_full_publish_so {
-DLITE_SHUTDOWN_LOG=ON \
-DANDROID_STL_TYPE=$android_stl \
-DLITE_BUILD_EXTRA=$BUILD_EXTRA \
-DLITE_BUILD_TAILOR=$BUILD_TAILOR \
-DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make publish_inference -j4
......@@ -317,6 +323,14 @@ function main {
BUILD_DIR="${i#*=}"
shift
;;
--opt_model_dir=*)
OPTMODEL_DIR="${i#*=}"
shift
;;
--build_tailor=*)
BUILD_TAILOR="${i#*=}"
shift
;;
tiny_publish)
make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL
shift
......
......@@ -20,6 +20,7 @@ from utils import *
ops_list_path = sys.argv[1]
dest_path = sys.argv[2]
kernelmap_path = sys.argv[3]
out_lines = [
'#pragma once',
......@@ -47,6 +48,31 @@ class %s : public KernelLite<TARGET(%s), PRECISION(%s), DATALAYOUT(%s)> {
} // namespace paddle
'''
# create a .h file to store the kernel-to-source-file mapping
kernel_src_map_lines = [
'''
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
// ATTENTION: this header can only be included in a .cc file.
const std::map<std::string, std::string> kernel2path_map{
'''
]
with open(ops_list_path) as f:
......@@ -99,7 +125,23 @@ with open(ops_list_path) as f:
out_lines.append("")
out_lines.append(gen_use_kernel_statement(k.op_type, k.target, k.precision, k.data_layout, k.alias))
index = path.rindex('/')
filename = path[index + 1:]
map_element = ' {"%s,%s,%s,%s,%s", "%s"},' % (
k.op_type,
k.target,
k.precision,
k.data_layout,
k.alias,
filename.strip()
)
kernel_src_map_lines.append(map_element)
with open(dest_path, 'w') as f:
logging.info("write kernel list to %s" % dest_path)
f.write('\n'.join(out_lines))
with open(kernelmap_path, 'w') as fd:
logging.info("write kernel map to %s" % dest_path)
kernel_src_map_lines.append(' {" ", " "}')
kernel_src_map_lines.append('};')
fd.write('\n'.join(kernel_src_map_lines))
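For reference, the kernel_src_map.h generated by this script maps each comma-joined kernel signature to the source file that registers it. An assumed example of the generated contents (the kernels and file names shown are illustrative only):

// kernel_src_map.h (generated; contents shown here are illustrative)
#pragma once
#include <map>
#include <string>
// ATTENTION: this header can only be included in a .cc file.
const std::map<std::string, std::string> kernel2path_map{
    {"conv2d,kARM,kFloat,kNCHW,def", "conv_compute.cc"},
    {"fc,kARM,kFloat,kNCHW,def", "fc_compute.cc"},
    {" ", " "}  // dummy trailing entry appended by the generator
};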
......@@ -18,14 +18,19 @@ from ast import RegisterLiteKernelParser
ops_list_path = sys.argv[1]
dest_path = sys.argv[2]
minkernels_list_path = sys.argv[3]
tailored = sys.argv[4]
out_lines = [
'#pragma once',
'#include "paddle_lite_factory_helper.h"',
'',
]
minlines = set()
if tailored == "ON":
with open(minkernels_list_path) as fd:
for line in fd:
minlines.add(line.strip())
with open(ops_list_path) as f:
paths = set([path for path in f])
for path in paths:
......@@ -35,6 +40,15 @@ with open(ops_list_path) as f:
kernel_parser.parse()
for k in kernel_parser.kernels:
kernel = "%s, %s, %s, %s, %s" % (
k.op_type,
k.target,
k.precision,
k.data_layout,
k.alias,
)
if tailored == "ON":
if kernel not in minlines: continue
key = "USE_LITE_KERNEL(%s, %s, %s, %s, %s);" % (
k.op_type,
k.target,
......
......@@ -19,7 +19,8 @@ from ast import RegisterLiteOpParser
ops_list_path = sys.argv[1]
dest_path = sys.argv[2]
minops_list_path = sys.argv[3]
tailored = sys.argv[4]
out_lines = [
'#pragma once',
'#include "paddle_lite_factory_helper.h"',
......@@ -30,6 +31,11 @@ paths = set()
for line in open(ops_list_path):
paths.add(line.strip())
if tailored == "ON":
minlines = set()
with open(minops_list_path) as fd:
for line in fd:
minlines.add(line.strip())
for path in paths:
str_info = open(path.strip()).read()
op_parser = RegisterLiteOpParser(str_info)
......@@ -37,6 +43,8 @@ for path in paths:
for op in ops:
if "_grad" in op:
continue
if tailored == "ON":
if op not in minlines: continue
out = "USE_LITE_OP(%s);" % op
out_lines.append(out)
......