Unverified commit b987ee38 authored by: H huzhiqiang, committed by: GitHub

model dynamic library tailoring (#2256)

* add a shell script to automatically build and collect the publish result test=develop
* modify the API interface of model_optimize_tool and add an option for tiny & full publish test=develop
Parent 74a6980e
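For context, a minimal sketch of the intended workflow on the C++ API side (illustrative only, not part of this commit; the output path is a placeholder and "predictor" is assumed to be a CxxConfig-based lite_api predictor created elsewhere):

// Illustrative sketch: save an optimized model and record tailoring info.
predictor->SaveOptimizedModel("./mobilenet_v1_opt",
                              paddle::lite_api::LiteModelType::kNaiveBuffer,
                              true /* record_info */);
// With record_info == true the predictor also writes .tailored_ops_list,
// .tailored_ops_source_list, .tailored_kernels_list and
// .tailored_kernels_source_list into "./mobilenet_v1_opt". That directory is
// then handed to the tailored build through the new --build_tailor=ON and
// --opt_model_dir=<dir> options added in the build script below.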
......@@ -70,6 +70,7 @@ lite_option(LITE_ON_TINY_PUBLISH "Publish tiny predictor lib." OFF)
lite_option(LITE_ON_MODEL_OPTIMIZE_TOOL "Build the model optimize tool" OFF)
# publish options
lite_option(LITE_BUILD_EXTRA "Enable extra algorithm support in Lite, both kernels and operators" OFF)
lite_option(LITE_BUILD_TAILOR "Enable tailoring library according to model" OFF)
# TODO(Superjomn) Remove WITH_ANAKIN option if not needed latter.
if(ANDROID OR IOS OR ARMLINUX)
......
......@@ -241,6 +241,10 @@ set(host_kernels CACHE INTERNAL "host kernels")
set(kernels_src_list "${CMAKE_BINARY_DIR}/kernels_src_list.txt")
file(WRITE ${kernels_src_list} "") # clean
if(LITE_BUILD_TAILOR)
set(tailored_kernels_list_path "${LITE_OPTMODEL_DIR}/.tailored_kernels_source_list")
file(STRINGS ${tailored_kernels_list_path} tailored_kernels_list)
endif()
# add a kernel for some specific device
# device: one of (Host, ARM, X86, NPU, FPGA, OPENCL, CUDA)
# level: one of (basic, extra)
......@@ -252,6 +256,15 @@ function(add_kernel TARGET device level)
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if(LITE_BUILD_TAILOR)
foreach(src ${args_SRCS})
list (FIND tailored_kernels_list ${src} _index)
if (${_index} EQUAL -1)
return()
endif()
endforeach()
endif()
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
......@@ -338,6 +351,10 @@ endfunction()
set(ops CACHE INTERNAL "ops")
set(ops_src_list "${CMAKE_BINARY_DIR}/ops_src_list.txt")
file(WRITE ${ops_src_list} "") # clean
if(LITE_BUILD_TAILOR)
set(tailored_ops_list_path "${LITE_OPTMODEL_DIR}/.tailored_ops_source_list")
file(STRINGS ${tailored_ops_list_path} tailored_ops_list)
endif()
# add an operator
# level: one of (basic, extra)
function(add_operator TARGET level)
......@@ -348,16 +365,24 @@ function(add_operator TARGET level)
ARGS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
if ("${level}" STREQUAL "extra" AND (NOT LITE_BUILD_EXTRA))
return()
endif()
set(ops "${ops};${TARGET}" CACHE INTERNAL "source")
foreach(src ${args_SRCS})
if(LITE_BUILD_TAILOR)
list(FIND tailored_ops_list ${src} _index)
if (${_index} EQUAL -1)
return()
endif()
endif()
file(APPEND ${ops_src_list} "${CMAKE_CURRENT_SOURCE_DIR}/${src}\n")
endforeach()
set(ops "${ops};${TARGET}" CACHE INTERNAL "source")
lite_cc_library(${TARGET} SRCS ${args_SRCS}
DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS}
......
......@@ -15,6 +15,7 @@
#include "lite/api/cxx_api.h"
#include <algorithm>
#include <memory>
#include <set>
#include <string>
#include <utility>
#include <vector>
......@@ -23,8 +24,16 @@
namespace paddle {
namespace lite {
static const char TAILORD_OPS_SOURCE_LIST_FILENAME[] =
".tailored_ops_source_list";
static const char TAILORD_OPS_LIST_NAME[] = ".tailored_ops_list";
static const char TAILORD_KERNELS_SOURCE_LIST_FILENAME[] =
".tailored_kernels_source_list";
static const char TAILORD_KERNELS_LIST_NAME[] = ".tailored_kernels_list";
void Predictor::SaveModel(const std::string &dir,
lite_api::LiteModelType model_type) {
lite_api::LiteModelType model_type,
bool record_info) {
if (!program_) {
GenRuntimeProgram();
}
......@@ -40,6 +49,83 @@ void Predictor::SaveModel(const std::string &dir,
default:
LOG(FATAL) << "Unknown model type";
}
if (record_info) {
SaveOpKernelInfo(dir);
}
}
void Predictor::SaveOpKernelInfo(const std::string &model_dir) {
std::set<std::string> ops_info;
std::set<std::string> kernels_info;
const auto &instructions_ = program_->instructions();
for (auto &node : instructions_) {
// parse op type information
auto op = node.op()->op_info();
ops_info.insert(op->Type());
// parse kernel type information
std::string kernel_type_str =
node.kernel()->op_type() + "," + TargetRepr(node.kernel()->target()) +
"," + PrecisionRepr(node.kernel()->precision()) + "," +
DataLayoutRepr(node.kernel()->layout()) + "," + node.kernel()->alias();
kernels_info.insert(kernel_type_str);
}
// get source file names from the op types and kernel types
auto op2pathmap = OpKernelInfoCollector::Global().GetOp2PathDict();
auto kernel2pathmap = OpKernelInfoCollector::Global().GetKernel2PathDict();
// write used op and kernel info into files
std::string opf_path = model_dir + "/" + TAILORD_OPS_LIST_NAME;
std::string opf_source_path =
model_dir + "/" + TAILORD_OPS_SOURCE_LIST_FILENAME;
std::string kpf_path = model_dir + "/" + TAILORD_KERNELS_LIST_NAME;
std::string kpf_source_path =
model_dir + "/" + TAILORD_KERNELS_SOURCE_LIST_FILENAME;
std::FILE *opf = std::fopen(opf_path.c_str(), "w");
std::FILE *opf_source = std::fopen(opf_source_path.c_str(), "w");
std::FILE *kpf = std::fopen(kpf_path.c_str(), "w");
std::FILE *kpf_source = std::fopen(kpf_source_path.c_str(), "w");
if (nullptr == opf || nullptr == opf_source || nullptr == kpf ||
nullptr == kpf_source) {
LOG(FATAL) << "failed to create tailoring info files in: " << model_dir;
}
for (auto op_info = ops_info.begin(); op_info != ops_info.end(); op_info++) {
fputs(op_info->c_str(), opf);
fputc('\n', opf);
std::string op_path = op2pathmap[*op_info];
fputs(op_path.c_str(), opf_source);
fputc('\n', opf_source);
}
std::fclose(opf_source);
std::fclose(opf);
LOG(INFO) << "operators information of tailored model is stored into: "
<< opf_path;
// write Kernel_type and Kernel_path into file
for (auto kernel_info = kernels_info.begin();
kernel_info != kernels_info.end();
kernel_info++) {
fputs(kernel_info->c_str(), kpf);
fputc('\n', kpf);
std::string kernel_path = kernel2pathmap[*kernel_info];
fputs(kernel_path.c_str(), kpf_source);
fputc('\n', kpf_source);
if (kernel_path == "conv_compute.cc") {
fputs(
"conv_depthwise.cc\nconv_direct.cc\nconv_gemmlike.cc\nconv_"
"winograd.cc\n",
kpf_source);
}
}
std::fclose(kpf_source);
std::fclose(kpf);
LOG(INFO) << "kernels information of tailored model is stored into: "
<< kpf_path;
}
lite::Tensor *Predictor::GetInput(size_t offset) {
......@@ -61,7 +147,7 @@ void Predictor::PrepareFeedFetch() {
auto current_block = program_desc_.GetBlock<cpp::BlockDesc>(0);
std::vector<cpp::OpDesc *> feeds;
std::vector<cpp::OpDesc *> fetchs;
for (int i = 0; i < current_block->OpsSize(); i++) {
for (size_t i = 0; i < current_block->OpsSize(); i++) {
auto op = current_block->GetOp<cpp::OpDesc>(i);
if (op->Type() == "feed") {
feeds.push_back(op);
......@@ -71,11 +157,11 @@ void Predictor::PrepareFeedFetch() {
}
input_names_.resize(feeds.size());
output_names_.resize(fetchs.size());
for (int i = 0; i < feeds.size(); i++) {
for (size_t i = 0; i < feeds.size(); i++) {
input_names_[feeds[i]->GetAttr<int>("col")] =
feeds[i]->Output("Out").front();
}
for (int i = 0; i < fetchs.size(); i++) {
for (size_t i = 0; i < fetchs.size(); i++) {
output_names_[fetchs[i]->GetAttr<int>("col")] =
fetchs[i]->Input("X").front();
}
......@@ -191,7 +277,7 @@ lite::Tensor *Predictor::GetInputByName(const std::string &name) {
if (element == input_names_.end()) {
LOG(ERROR) << "Model do not have input named with: [" << name
<< "], model's inputs include:";
for (int i = 0; i < input_names_.size(); i++) {
for (size_t i = 0; i < input_names_.size(); i++) {
LOG(ERROR) << "[" << input_names_[i] << "]";
}
return nullptr;
......
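Each record written by SaveOpKernelInfo() is a comma-separated five-tuple — op type, target, precision, layout, kernel alias — and the same string later serves as the key into the generated kernel2path_map. A small standalone sketch (the kernel name and source file below are assumed examples; the kARM/kFloat/kNCHW names follow the TargetRepr/PrecisionRepr/DataLayoutRepr convention used above):

#include <iostream>
#include <map>
#include <string>

int main() {
  // Hypothetical excerpt of kernel2path_map for an ARM float conv kernel.
  const std::map<std::string, std::string> kernel2path_map{
      {"conv2d,kARM,kFloat,kNCHW,def", "conv_compute.cc"}};

  // Key format matches the string built in Predictor::SaveOpKernelInfo():
  // op_type , TargetRepr , PrecisionRepr , DataLayoutRepr , alias
  const std::string key = "conv2d,kARM,kFloat,kNCHW,def";
  std::cout << key << " -> " << kernel2path_map.at(key) << std::endl;
  return 0;
}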
......@@ -89,7 +89,9 @@ class LITE_API Predictor {
// This method is disabled in mobile, for unnecessary dependencies required.
void SaveModel(
const std::string& dir,
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf);
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool record_info = false);
void SaveOpKernelInfo(const std::string& model_dir);
#ifdef LITE_WITH_TRAIN
void Run(const std::vector<framework::Tensor>& tensors) {
......@@ -137,9 +139,10 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
std::unique_ptr<lite_api::Tensor> GetInputByName(
const std::string& name) override;
void SaveOptimizedModel(const std::string& model_dir,
lite_api::LiteModelType model_type =
lite_api::LiteModelType::kProtobuf) override;
void SaveOptimizedModel(
const std::string& model_dir,
lite_api::LiteModelType model_type = lite_api::LiteModelType::kProtobuf,
bool record_info = false) override;
private:
Predictor raw_predictor_;
......
......@@ -65,8 +65,9 @@ std::unique_ptr<lite_api::Tensor> CxxPaddleApiImpl::GetInputByName(
}
void CxxPaddleApiImpl::SaveOptimizedModel(const std::string &model_dir,
lite_api::LiteModelType model_type) {
raw_predictor_.SaveModel(model_dir, model_type);
lite_api::LiteModelType model_type,
bool record_info) {
raw_predictor_.SaveModel(model_dir, model_type, record_info);
}
} // namespace lite
......
......@@ -16,7 +16,10 @@
#ifdef PADDLE_WITH_TESTING
#include <gtest/gtest.h>
#endif
// "all_kernel_faked.cc" and "kernel_src_map.h" are created automatically during
// model_optimize_tool's compiling period
#include "all_kernel_faked.cc" // NOLINT
#include "kernel_src_map.h" // NOLINT
#include "lite/api/paddle_api.h"
#include "lite/api/paddle_use_ops.h"
#include "lite/api/paddle_use_passes.h"
......@@ -35,6 +38,11 @@ DEFINE_string(
"protobuf",
"store type of the output optimized model. protobuf/naive_buffer");
DEFINE_bool(display_kernels, false, "Display kernel information");
DEFINE_bool(record_tailoring_info,
false,
"Record kernel and operator information of the optimized model "
"for tailored compiling; the information is stored as hidden files "
"under the optimized model path");
DEFINE_string(optimize_out, "", "path of the output optimized model");
DEFINE_string(valid_targets,
"arm",
......@@ -104,8 +112,14 @@ void Main() {
} else {
LOG(FATAL) << "Unsupported Model type :" << FLAGS_optimize_out_type;
}
OpKernelInfoCollector::Global().SetKernel2path(kernel2path_map);
predictor->SaveOptimizedModel(FLAGS_optimize_out, model_type);
predictor->SaveOptimizedModel(
FLAGS_optimize_out, model_type, FLAGS_record_tailoring_info);
if (FLAGS_record_tailoring_info) {
LOG(INFO) << "Record the information of tailored model into :"
<< FLAGS_optimize_out;
}
}
} // namespace lite_api
......
......@@ -145,7 +145,8 @@ lod_t Tensor::lod() const { return ctensor(raw_tensor_)->lod(); }
void Tensor::SetLoD(const lod_t &lod) { tensor(raw_tensor_)->set_lod(lod); }
void PaddlePredictor::SaveOptimizedModel(const std::string &model_dir,
LiteModelType model_type) {
LiteModelType model_type,
bool record_info) {
LOG(FATAL)
<< "The SaveOptimizedModel API is only supported by CxxConfig predictor.";
}
......
......@@ -97,7 +97,8 @@ class LITE_API PaddlePredictor {
/// CxxConfig, and the persisted model can be reused for MobileConfig.
virtual void SaveOptimizedModel(
const std::string& model_dir,
LiteModelType model_type = LiteModelType::kProtobuf);
LiteModelType model_type = LiteModelType::kProtobuf,
bool record_info = false);
virtual ~PaddlePredictor() = default;
};
......
......@@ -64,8 +64,8 @@ TEST(CxxApi, run) {
EXPECT_NEAR(out[1], -28.8729, 1e-3);
predictor->SaveOptimizedModel(FLAGS_model_dir + ".opt2");
predictor->SaveOptimizedModel(FLAGS_model_dir + ".opt2.naive",
LiteModelType::kNaiveBuffer);
predictor->SaveOptimizedModel(
FLAGS_model_dir + ".opt2.naive", LiteModelType::kNaiveBuffer, true);
}
// Demo 1 for mobile devices: load model from file and run
......
......@@ -71,6 +71,8 @@ add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_kernel_registry.py
${kernels_src_list}
${CMAKE_SOURCE_DIR}/lite/api/paddle_use_kernels.h
"${LITE_OPTMODEL_DIR}/.tailored_kernels_list"
"${LITE_BUILD_TAILOR}"
OUTPUT kernels.h # not a real path to the output to force it execute every time.
)
# A trick to generate the paddle_use_ops.h
......@@ -78,6 +80,8 @@ add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/parse_op_registry.py
${ops_src_list}
${CMAKE_SOURCE_DIR}/lite/api/paddle_use_ops.h
"${LITE_OPTMODEL_DIR}/.tailored_ops_list"
"${LITE_BUILD_TAILOR}"
OUTPUT ops.h # not a real path to the output to force it execute every time.
)
# generate fake kernels for memory_optimize_tool
......@@ -85,6 +89,7 @@ add_custom_command(
COMMAND python ${CMAKE_SOURCE_DIR}/lite/tools/cmake_tools/create_fake_kernel_registry.py
${kernels_src_list}
${CMAKE_BINARY_DIR}/all_kernel_faked.cc
${CMAKE_BINARY_DIR}/kernel_src_map.h
OUTPUT all_kernel_faked.cc # not a real path to the output to force it execute every time.
)
add_custom_target(op_list_h DEPENDS ops.h)
......
......@@ -32,6 +32,43 @@
using LiteType = paddle::lite::Type;
class OpKernelInfoCollector {
public:
static OpKernelInfoCollector &Global() {
static auto *x = new OpKernelInfoCollector;
return *x;
}
void AddOp2path(const std::string &op_name, const std::string &op_path) {
size_t index = op_path.find_last_of('/');
if (index != std::string::npos) {
op2path_.insert(std::pair<std::string, std::string>(
op_name, op_path.substr(index + 1)));
}
}
void AddKernel2path(const std::string &kernel_name,
const std::string &kernel_path) {
size_t index = kernel_path.find_last_of('/');
if (index != std::string::npos) {
kernel2path_.insert(std::pair<std::string, std::string>(
kernel_name, kernel_path.substr(index + 1)));
}
}
void SetKernel2path(
const std::map<std::string, std::string> &kernel2path_map) {
kernel2path_ = kernel2path_map;
}
const std::map<std::string, std::string> &GetOp2PathDict() {
return op2path_;
}
const std::map<std::string, std::string> &GetKernel2PathDict() {
return kernel2path_;
}
private:
std::map<std::string, std::string> op2path_;
std::map<std::string, std::string> kernel2path_;
};
namespace paddle {
namespace lite {
......@@ -59,7 +96,6 @@ class OpLiteRegistor : public Registor<OpClass> {
});
}) {}
};
template <TargetType Target, PrecisionType Precision, DataLayoutType Layout>
using KernelRegistryForTarget =
Factory<KernelLite<Target, Precision, Layout>, std::unique_ptr<KernelBase>>;
......@@ -287,6 +323,7 @@ class KernelRegistor : public lite::Registor<KernelType> {
static paddle::lite::OpLiteRegistor<OpClass> LITE_OP_REGISTER_INSTANCE( \
op_type__)(#op_type__); \
int touch_op_##op_type__() { \
OpKernelInfoCollector::Global().AddOp2path(#op_type__, __FILE__); \
return LITE_OP_REGISTER_INSTANCE(op_type__).Touch(); \
}
......@@ -312,6 +349,9 @@ class KernelRegistor : public lite::Registor<KernelType> {
static KernelClass LITE_KERNEL_INSTANCE( \
op_type__, target__, precision__, layout__, alias__); \
int touch_##op_type__##target__##precision__##layout__##alias__() { \
OpKernelInfoCollector::Global().AddKernel2path( \
#op_type__ "," #target__ "," #precision__ "," #layout__ "," #alias__, \
__FILE__); \
LITE_KERNEL_INSTANCE(op_type__, target__, precision__, layout__, alias__) \
.Touch(); \
return 0; \
......
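The registration macros above pass #op_type__ (or the kernel five-tuple) together with __FILE__ into the collector; AddOp2path()/AddKernel2path() keep only the basename of the registering source file. A simplified standalone sketch of that basename extraction (the op name and path are assumed examples):

#include <iostream>
#include <map>
#include <string>

int main() {
  std::map<std::string, std::string> op2path;
  const std::string op_name = "fc";                       // would come from #op_type__
  const std::string op_path = "lite/operators/fc_op.cc";  // would come from __FILE__
  const size_t index = op_path.find_last_of('/');
  if (index != std::string::npos) {
    op2path.emplace(op_name, op_path.substr(index + 1));
  }
  std::cout << op_name << " -> " << op2path[op_name] << std::endl;  // fc -> fc_op.cc
  return 0;
}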
......@@ -17,6 +17,8 @@ BUILD_EXTRA=OFF
BUILD_JAVA=ON
BUILD_PYTHON=OFF
BUILD_DIR=$(pwd)
OPTMODEL_DIR=""
BUILD_TAILOR=OFF
readonly THIRDPARTY_TAR=https://paddle-inference-dist.bj.bcebos.com/PaddleLite/third-party-05b862.tar.gz
......@@ -94,6 +96,8 @@ function make_tiny_publish_so {
-DLITE_ON_TINY_PUBLISH=ON \
-DANDROID_STL_TYPE=$android_stl \
-DLITE_BUILD_EXTRA=$BUILD_EXTRA \
-DLITE_BUILD_TAILOR=$BUILD_TAILOR \
-DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make publish_inference -j$NUM_PROC
......@@ -133,6 +137,8 @@ function make_full_publish_so {
-DLITE_SHUTDOWN_LOG=ON \
-DANDROID_STL_TYPE=$android_stl \
-DLITE_BUILD_EXTRA=$BUILD_EXTRA \
-DLITE_BUILD_TAILOR=$BUILD_TAILOR \
-DLITE_OPTMODEL_DIR=$OPTMODEL_DIR \
-DARM_TARGET_OS=${os} -DARM_TARGET_ARCH_ABI=${abi} -DARM_TARGET_LANG=${lang}
make publish_inference -j4
......@@ -317,6 +323,14 @@ function main {
BUILD_DIR="${i#*=}"
shift
;;
--opt_model_dir=*)
OPTMODEL_DIR="${i#*=}"
shift
;;
--build_tailor=*)
BUILD_TAILOR="${i#*=}"
shift
;;
tiny_publish)
make_tiny_publish_so $ARM_OS $ARM_ABI $ARM_LANG $ANDROID_STL
shift
......
......@@ -20,6 +20,7 @@ from utils import *
ops_list_path = sys.argv[1]
dest_path = sys.argv[2]
kernelmap_path = sys.argv[3]
out_lines = [
'#pragma once',
......@@ -47,6 +48,31 @@ class %s : public KernelLite<TARGET(%s), PRECISION(%s), DATALAYOUT(%s)> {
} // namespace paddle
'''
# create a .h file that stores the kernel-to-source-file mapping
kernel_src_map_lines = [
'''
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <map>
#include <string>
// ATTENTION: this header can only be included in one .cc file.
const std::map<std::string, std::string> kernel2path_map{
'''
]
with open(ops_list_path) as f:
......@@ -99,7 +125,23 @@ with open(ops_list_path) as f:
out_lines.append("")
out_lines.append(gen_use_kernel_statement(k.op_type, k.target, k.precision, k.data_layout, k.alias))
index = path.rindex('/')
filename = path[index + 1:]
map_element = ' {"%s,%s,%s,%s,%s", "%s"},' % (
k.op_type,
k.target,
k.precision,
k.data_layout,
k.alias,
filename.strip()
)
kernel_src_map_lines.append(map_element)
with open(dest_path, 'w') as f:
logging.info("write kernel list to %s" % dest_path)
f.write('\n'.join(out_lines))
with open(kernelmap_path, 'w') as fd:
logging.info("write kernel map to %s" % dest_path)
kernel_src_map_lines.append(' {" ", " "}')
kernel_src_map_lines.append('};')
fd.write('\n'.join(kernel_src_map_lines))
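Put together, the generated kernel_src_map.h ends up looking roughly like the following (the two entries are assumed examples; the real content depends on kernels_src_list.txt, and the trailing {" ", " "} sentinel avoids a dangling comma):

#pragma once
#include <map>
#include <string>
// ATTENTION: this header can only be included in one .cc file.
const std::map<std::string, std::string> kernel2path_map{
  {"conv2d,kARM,kFloat,kNCHW,def", "conv_compute.cc"},
  {"fc,kARM,kFloat,kNCHW,def", "fc_compute.cc"},
  {" ", " "}
};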
......@@ -18,14 +18,19 @@ from ast import RegisterLiteKernelParser
ops_list_path = sys.argv[1]
dest_path = sys.argv[2]
minkernels_list_path = sys.argv[3]
tailored = sys.argv[4]
out_lines = [
'#pragma once',
'#include "paddle_lite_factory_helper.h"',
'',
]
minlines = set()
if tailored == "ON":
with open(minkernels_list_path) as fd:
for line in fd:
minlines.add(line.strip())
with open(ops_list_path) as f:
paths = set([path for path in f])
for path in paths:
......@@ -35,6 +40,15 @@ with open(ops_list_path) as f:
kernel_parser.parse()
for k in kernel_parser.kernels:
kernel = "%s, %s, %s, %s, %s" % (
k.op_type,
k.target,
k.precision,
k.data_layout,
k.alias,
)
if tailored == "ON":
if kernel not in minlines: continue
key = "USE_LITE_KERNEL(%s, %s, %s, %s, %s);" % (
k.op_type,
k.target,
......
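With LITE_BUILD_TAILOR=ON, only kernels whose five-tuple appears in the model's .tailored_kernels_list survive this filter, so a paddle_use_kernels.h generated for a small model might contain just the following (entries assumed for illustration):

#pragma once
#include "paddle_lite_factory_helper.h"

USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);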
......@@ -19,7 +19,8 @@ from ast import RegisterLiteOpParser
ops_list_path = sys.argv[1]
dest_path = sys.argv[2]
minops_list_path = sys.argv[3]
tailored = sys.argv[4]
out_lines = [
'#pragma once',
'#include "paddle_lite_factory_helper.h"',
......@@ -30,6 +31,11 @@ paths = set()
for line in open(ops_list_path):
paths.add(line.strip())
if tailored == "ON":
minlines = set()
with open(minops_list_path) as fd:
for line in fd:
minlines.add(line.strip())
for path in paths:
str_info = open(path.strip()).read()
op_parser = RegisterLiteOpParser(str_info)
......@@ -37,6 +43,8 @@ for path in paths:
for op in ops:
if "_grad" in op:
continue
if tailored == "ON":
if op not in minlines: continue
out = "USE_LITE_OP(%s);" % op
out_lines.append(out)
......