Commit 7cf536f0 authored by Chunwei

Merge branch 'chunwe/refactor-api' into 'incubate/lite'

refactor api and recover CI cache

See merge request inference/paddlelite!28
......@@ -78,6 +78,7 @@ build:mobile_android:
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- $MOBILE_LITE_CACHE2
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache
script:
......@@ -98,6 +99,7 @@ build:mobile_armlinux:
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- $MOBILE_LITE_CACHE2
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_ccache2
script:
......@@ -107,24 +109,13 @@ build:mobile_armlinux:
dependencies:
- build:server
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
build:mobile_model_mobilenetv1:
tags:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv1
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv1
......@@ -137,6 +128,7 @@ build:mobile_model_mobilenetv1:
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- $MOBILE_LITE_CACHE2
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_mobilenetv1
......@@ -145,12 +137,7 @@ build:mobile_model_mobilenetv2:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv2
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv2
......@@ -163,6 +150,7 @@ build:mobile_model_mobilenetv2:
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- $MOBILE_LITE_CACHE2
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_mobilenetv2
......@@ -171,12 +159,7 @@ build:mobile_model_resnet50:
- lite
stage: build_mobile
image: $MOBILE_LITE_DOCKER_IMAGE
cache:
key: mobile_thirdparty
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- ~/.ccache
script:
- export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_resnet50
- ./paddle/fluid/lite/tools/build.sh build_test_arm_model_resnet50
......@@ -189,6 +172,7 @@ build:mobile_model_resnet50:
paths:
- $MOBILE_LITE_CACHE0
- $MOBILE_LITE_CACHE1
- $MOBILE_LITE_CACHE2
- ~/.ccache
- $CI_PROJECT_DIR/build_mobile_model_resnet50
......
......@@ -24,8 +24,7 @@ function(lite_download_and_uncompress INSTALL_DIR URL FILENAME)
${EXTERNAL_PROJECT_NAME}
${EXTERNAL_PROJECT_LOG_ARGS}
PREFIX ${INSTALL_DIR}
DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} &&
${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME}
DOWNLOAD_DIR ${INSTALL_DIR}
DOWNLOAD_NO_PROGRESS 1
CONFIGURE_COMMAND ""
......@@ -143,6 +142,8 @@ function(lite_cc_binary TARGET)
HVY_DEPS ${args_HVY_DEPS}
)
cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS})
# collect the targets that need to be compiled for lite
add_dependencies(lite_compile_deps ${TARGET})
endfunction()
# Add a unit-test name to the file for later offline manual testing.
......
......@@ -12,7 +12,6 @@ lite_cc_library(lite_api_test_helper SRCS lite_api_test_helper.cc
CUDA_DEPS kernels_cuda
X86_DEPS ${x86_kernels}
)
lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS lite_api_test_helper)
set(light_api_deps
scope_lite target_wrapper_host model_parser_lite program_lite)
......@@ -21,27 +20,34 @@ if(LITE_WITH_CUDA)
set(light_api_deps ${light_api_deps} target_wrapper_cuda)
endif()
lite_cc_library(light_api_lite SRCS light_api.cc
DEPS ${light_api_deps} ${ops_lite} ${host_kernels}
)
message(STATUS "get ops ${ops_lite}")
message(STATUS "get Host kernels ${host_kernels}")
message(STATUS "get ARM kernels ${arm_kernels}")
lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} ${ops_lite} ${host_kernels} program_lite)
lite_cc_library(light_api_lite SRCS light_api.cc
DEPS scope_lite target_wrapper_host model_parser_lite
${light_api_deps} ${ops_lite} ${host_kernels} program_lite
CUDA_DEPS target_wrapper_cuda
X86_DEPS ${x86_kernels} operator
ARM_DEPS ${arm_kernels}
)
include(ExternalProject)
set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
"A path setting inference demo download directories.")
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc
DEPS cxx_api_lite mir_passes
DEPS cxx_api_lite mir_passes lite_api_test_helper
${ops_lite} ${host_kernels} ${x86_kernels}
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz)
endif()
if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
set(lite_model_test_DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${arm_kernels})
......@@ -68,25 +74,20 @@ endif()
# These tests need CLI arguments and are not supported in ARM CI.
# TODO(Superjomn) support them later.
if(NOT LITE_ON_MOBILE)
lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api_lite mir_passes
X86_DEPS ${x86_kernels}
lite_cc_test(test_light_api SRCS light_api_test.cc
DEPS light_api_lite program_lite mir_passes
ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
SERIAL)
if(NOT LITE_ON_MOBILE)
lite_cc_test(test_apis_lite SRCS apis_test.cc
DEPS cxx_api_lite light_api_lite ${ops_lite} mir_passes
X86_DEPS ${x86_kernels}
DEPS cxx_api_lite light_api_lite ${ops_lite}
X86_DEPS ${x86_kernels} operator
ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
--optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
endif()
lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
DEPS
cxx_api_lite
model_parser_lite
target_wrapper_host
mir_passes
${ops_lite} ${host_kernels}
ARM_DEPS ${arm_kernels})
#lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
#X86_DEPS operator
#DEPS light_api_lite model_parser_lite target_wrapper_host mir_passes
#ARM_DEPS ${arm_kernels})
......@@ -39,7 +39,7 @@ void SetConstInput(lite::Tensor* x) {
}
}
bool CompareTensors(const std::string& name, const ExecutorLite& cxx_api,
bool CompareTensors(const std::string& name, const Predictor& cxx_api,
const LightPredictor& light_api) {
const auto* a = cxx_api.GetTensor(name);
const auto* b = light_api.GetTensor(name);
......@@ -48,8 +48,8 @@ bool CompareTensors(const std::string& name, const ExecutorLite& cxx_api,
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(CXXApi_LightApi, save_and_load_model) {
lite::ExecutorLite cxx_api;
lite::LightPredictor light_api;
lite::Predictor cxx_api;
lite::LightPredictor light_api(FLAGS_optimized_model);
// CXXAPi
{
......@@ -69,8 +69,6 @@ TEST(CXXApi_LightApi, save_and_load_model) {
// LightApi
{
light_api.Build(FLAGS_optimized_model);
auto* x = light_api.GetInput(0);
SetConstInput(x);
......
......@@ -17,19 +17,49 @@
#include <string>
#include <utility>
#include <vector>
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
#include "paddle/fluid/platform/port.h"
#endif
#include "paddle/fluid/lite/utils/io.h"
namespace paddle {
namespace lite {
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void ExecutorLite::SaveModel(const std::string &dir) {
MkDirRecursively(dir.c_str());
void Predictor::SaveModel(const std::string &dir) {
#ifndef LITE_WITH_ARM
LOG(INFO) << "Save model to " << dir;
MkDirRecur(dir);
program_->PersistModel(dir, program_desc_);
}
#else
LOG(INFO) << "Save model to ./";
program_->PersistModel("./", program_desc_);
#endif
}
lite::Tensor *Predictor::GetInput(size_t offset) {
auto *_feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto *feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
const lite::Tensor *Predictor::GetOutput(size_t offset) {
auto *_fetch_list = program_->exec_scope()->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto &fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
void Predictor::Build(const std::string &model_path, const Place &prefer_place,
const std::vector<Place> &valid_places) {
LoadModel(model_path, scope_.get(), &program_desc_);
Build(program_desc_, prefer_place, valid_places);
}
const framework::proto::ProgramDesc &Predictor::program_desc() const {
return program_desc_;
}
} // namespace lite
} // namespace paddle
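
For reference, the refactored cxx API above suggests the following end-to-end flow. This is a minimal sketch, not part of the change: the model directory and tensor shape are placeholders, the header path is inferred from the file layout in this diff, and mutable_data<float>() / dims() are assumed Tensor accessors.

```cpp
// Minimal usage sketch of the refactored Predictor (cxx_api), mirroring the
// tests touched in this change. Paths, shapes and values are illustrative.
#include <cstdint>
#include <string>
#include <vector>

#include "paddle/fluid/lite/api/cxx_api.h"       // header path inferred
#include "paddle/fluid/lite/utils/cp_logging.h"

namespace paddle {
namespace lite {

void RunNaiveModel(const std::string& model_dir) {
  Predictor predictor;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                   Place{TARGET(kX86), PRECISION(kFloat)}});
  // Load the model, run the optimizer and build the RuntimeProgram.
  predictor.Build(model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
                  valid_places);

  // GetInput grows the feed list on demand, so offset 0 is always valid.
  Tensor* input = predictor.GetInput(0);
  input->Resize(DDim(std::vector<int64_t>({100, 100})));
  auto* data = input->mutable_data<float>();  // assumed Tensor accessor
  for (int i = 0; i < 100 * 100; i++) data[i] = 1.f;

  predictor.Run();

  // The fetch list is populated by Run(); the offset must be in range.
  const Tensor* output = predictor.GetOutput(0);
  LOG(INFO) << "output dims: " << output->dims();  // dims() assumed
}

}  // namespace lite
}  // namespace paddle
```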
......@@ -26,20 +26,20 @@
namespace paddle {
namespace lite {
struct Config {};
class ExecutorLite {
/*
* Predictor for inference: given a model, it optimizes and executes it.
*/
class Predictor {
public:
ExecutorLite() { scope_ = std::make_shared<Scope>(); }
explicit ExecutorLite(const std::shared_ptr<lite::Scope>& root_scope) {
scope_ = root_scope;
}
// Create an empty predictor.
Predictor() { scope_ = std::make_shared<Scope>(); }
// Create a predictor with the weight variable scope set.
explicit Predictor(const std::shared_ptr<lite::Scope>& root_scope)
: scope_(root_scope) {}
// Build from a model, with places set for hardware config.
void Build(const std::string& model_path, const Place& prefer_place,
const std::vector<Place>& valid_places) {
LoadModel(model_path, scope_.get(), &program_desc_);
Build(program_desc_, prefer_place, valid_places);
}
const std::vector<Place>& valid_places);
void Build(const framework::proto::ProgramDesc& desc,
const Place& prefer_place,
......@@ -55,40 +55,24 @@ class ExecutorLite {
program_ = optimizer_.GenRuntimeProgram();
}
// This method is disabled on mobile, as it would require unnecessary dependencies.
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
void SaveModel(const std::string& dir);
#endif
// Run the predictor for a single batch of data.
void Run() { program_->Run(); }
// Get offset-th col of feed.
lite::Tensor* GetInput(size_t offset) {
auto* _feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto* feed_list = _feed_list->GetMutable<std::vector<lite::Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
// Get offset-th col of feed inputs.
lite::Tensor* GetInput(size_t offset);
const lite::Tensor* GetOutput(size_t offset) {
auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
// Get offset-th col of fetch results.
const lite::Tensor* GetOutput(size_t offset);
// Return the program desc for debug.
const framework::proto::ProgramDesc& program_desc() const;
const lite::Tensor* GetTensor(const std::string& name) const {
auto* var = program_->exec_scope()->FindVar(name);
return &var->Get<lite::Tensor>();
}
void Run() { program_->Run(); }
const framework::proto::ProgramDesc& program_desc() const {
return program_desc_;
}
// This method is disabled on mobile, as it would require unnecessary dependencies.
void SaveModel(const std::string& dir);
private:
Optimizer optimizer_;
......@@ -97,6 +81,7 @@ class ExecutorLite {
std::unique_ptr<RuntimeProgram> program_;
};
#ifdef LITE_WITH_X86
/*
* An executor for training.
*
......@@ -120,13 +105,13 @@ class CXXTrainer {
: scope_(root_scope),
preferred_place_(preferred_place),
valid_places_(valid_places),
main_program_executor_(ExecutorLite(scope_)) {}
main_program_executor_(Predictor(scope_)) {}
// Build the RuntimeProgram cache for the main program. The cache will run
// multiple times across the epochs.
// NOTE Only executing the 0-th block is supported currently.
ExecutorLite& BuildMainProgramExecutor(
const framework::proto::ProgramDesc& desc, int block_id = 0) {
Predictor& BuildMainProgramExecutor(const framework::proto::ProgramDesc& desc,
int block_id = 0) {
main_program_executor_.Build(desc, preferred_place_, valid_places_);
return main_program_executor_;
}
......@@ -134,7 +119,7 @@ class CXXTrainer {
// Run the startup program. It just executes once, no cache needed.
void RunStartupProgram(const framework::proto::ProgramDesc& desc,
int block_id = 0) {
ExecutorLite exe(scope_);
Predictor exe(scope_);
exe.Build(desc, preferred_place_, valid_places_);
exe.Run();
}
......@@ -146,8 +131,9 @@ class CXXTrainer {
std::vector<Place> valid_places_;
// The training program.
ExecutorLite main_program_executor_;
Predictor main_program_executor_;
};
#endif
} // namespace lite
} // namespace paddle
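
The CXXTrainer fragment above only shows the renamed member and methods; the sketch below illustrates how a training loop might drive it, assuming the constructor takes (scope, preferred_place, valid_places) as the member initializer list suggests.

```cpp
// Sketch of a training-side driver for CXXTrainer (X86 builds only). The
// constructor signature is inferred from the member initializer list shown
// above; the ProgramDesc arguments are assumed to come from a loaded program.
#include <memory>
#include <vector>

#include "paddle/fluid/lite/api/cxx_api.h"  // header path inferred

namespace paddle {
namespace lite {

#ifdef LITE_WITH_X86
void TrainSketch(const framework::proto::ProgramDesc& startup_desc,
                 const framework::proto::ProgramDesc& main_desc,
                 int num_batches) {
  auto scope = std::make_shared<Scope>();
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                   Place{TARGET(kX86), PRECISION(kFloat)}});
  CXXTrainer trainer(scope, Place{TARGET(kX86), PRECISION(kFloat)},
                     valid_places);

  // The startup program initializes parameters and runs exactly once.
  trainer.RunStartupProgram(startup_desc);

  // The main program's RuntimeProgram is built once and reused per batch.
  Predictor& exe = trainer.BuildMainProgramExecutor(main_desc);
  for (int i = 0; i < num_batches; i++) {
    // ... fill exe.GetInput(k) with the next batch here ...
    exe.Run();
  }
}
#endif  // LITE_WITH_X86

}  // namespace lite
}  // namespace paddle
```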
......@@ -34,7 +34,7 @@ void Run(const char* model_dir, int repeat, int thread_num) {
DeviceInfo::Init();
DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num);
#endif
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -42,7 +42,7 @@ TEST(CXXApi, test) {
}
TEST(CXXApi, save_model) {
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)},
......
......@@ -30,7 +30,7 @@ namespace lite {
#ifdef LITE_WITH_ARM
TEST(InceptionV4, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -13,3 +13,67 @@
// limitations under the License.
#include "paddle/fluid/lite/api/light_api.h"
namespace paddle {
namespace lite {
void LightPredictor::Build(const std::string& model_dir) {
framework::proto::ProgramDesc desc;
LoadModel(model_dir, scope_.get(), &desc);
BuildRuntimeProgram(desc);
}
Tensor* LightPredictor::GetInput(size_t offset) {
auto* _feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
const Tensor* LightPredictor::GetOutput(size_t offset) {
auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
void LightPredictor::BuildRuntimeProgram(
const framework::proto::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create the Instructions
// Create the kernels for the target places, and pick out the specific
// kernel that matches the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(
kernels.begin(), kernels.end(),
[&](std::unique_ptr<KernelBase>& it) { return it->alias() == alias; });
CHECK(it != kernels.end());
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
}
LightPredictor::LightPredictor(const std::string& model_dir) {
scope_ = std::make_shared<Scope>();
Build(model_dir);
}
} // namespace lite
} // namespace paddle
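
For comparison with the cxx API, the mobile-side flow implied by light_api.cc is sketched below. The model directory must already contain a model produced by Predictor::SaveModel; the path is a placeholder and the Tensor accessors are the same assumptions as in the earlier sketch.

```cpp
// Minimal usage sketch of LightPredictor: load an already-optimized model and
// run it without any MIR passes.
#include <cstdint>
#include <string>
#include <vector>

#include "paddle/fluid/lite/api/light_api.h"
#include "paddle/fluid/lite/utils/cp_logging.h"

namespace paddle {
namespace lite {

void RunOptimizedModel(const std::string& optimized_model_dir) {
  // The constructor loads the model and builds the RuntimeProgram directly
  // from the kernel types recorded by the cxx-side optimizer.
  LightPredictor predictor(optimized_model_dir);

  Tensor* input = predictor.GetInput(0);
  input->Resize(DDim(std::vector<int64_t>({100, 100})));
  auto* data = input->mutable_data<float>();  // assumed Tensor accessor
  for (int i = 0; i < 100 * 100; i++) data[i] = 1.f;

  predictor.Run();
  const Tensor* output = predictor.GetOutput(0);
  LOG(INFO) << "output dims: " << output->dims();  // dims() assumed
}

}  // namespace lite
}  // namespace paddle
```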
......@@ -32,36 +32,21 @@
namespace paddle {
namespace lite {
/*
* The lightweight predictor, mainly for mobile. It loads an optimized model
* and does not depend on the MIR passes or perform further optimization.
*/
class LightPredictor {
public:
LightPredictor() { scope_ = std::make_shared<Scope>(); }
void Build(const std::string& model_dir) {
framework::proto::ProgramDesc desc;
LoadModel(model_dir, scope_.get(), &desc);
BuildRuntimeProgram(desc);
}
explicit LightPredictor(const std::string& model_dir);
void Run() { program_->Run(); }
// Get offset-th col of feed.
Tensor* GetInput(size_t offset) {
auto* _feed_list = program_->exec_scope()->FindVar("feed");
CHECK(_feed_list) << "no feed variable in exec_scope";
auto* feed_list = _feed_list->GetMutable<std::vector<Tensor>>();
if (offset >= feed_list->size()) {
feed_list->resize(offset + 1);
}
return &feed_list->at(offset);
}
// Get offset-th col of feed inputs.
Tensor* GetInput(size_t offset);
const Tensor* GetOutput(size_t offset) {
auto* _fetch_list = program_->exec_scope()->FindVar("fetch");
CHECK(_fetch_list) << "no fatch variable in exec_scope";
auto& fetch_list = *_fetch_list->GetMutable<std::vector<lite::Tensor>>();
CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow";
return &fetch_list.at(offset);
}
// Get offset-th col of fetch outputs.
const Tensor* GetOutput(size_t offset);
const lite::Tensor* GetTensor(const std::string& name) const {
auto* var = program_->exec_scope()->FindVar(name);
......@@ -69,34 +54,8 @@ class LightPredictor {
}
private:
void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog) {
std::vector<Instruction> insts;
// 1. Create op first
Program program(prog, scope_, {});
// 2. Create Instructs
// Create the kernels of the target places, and filter out the specific
// kernel with the target alias.
for (auto& op : program.ops()) {
auto kernel_type = op->op_info()->GetAttr<std::string>(kKernelTypeAttr);
std::string op_type, alias;
Place place;
KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place);
auto kernels = op->CreateKernels({place});
// filter out a kernel
auto it = std::find_if(kernels.begin(), kernels.end(),
[&](std::unique_ptr<KernelBase>& it) {
return it->alias() == alias;
});
CHECK(it != kernels.end());
(*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target()));
insts.emplace_back(op, std::move(*it));
}
program_.reset(new RuntimeProgram(std::move(insts)));
CHECK(program.exec_scope());
program_->set_exec_scope(program.exec_scope());
}
void Build(const std::string& model_dir);
void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog);
private:
std::shared_ptr<Scope> scope_;
......
......@@ -25,8 +25,10 @@ namespace paddle {
namespace lite {
TEST(LightAPI, load) {
LightPredictor predictor;
predictor.Build(FLAGS_optimized_model);
if (FLAGS_optimized_model.empty()) {
FLAGS_optimized_model = "lite_naive_model";
}
LightPredictor predictor(FLAGS_optimized_model);
auto* input_tensor = predictor.GetInput(0);
input_tensor->Resize(DDim(std::vector<int64_t>({100, 100})));
......
......@@ -22,7 +22,7 @@ namespace paddle {
namespace lite {
const lite::Tensor* RunHvyModel() {
lite::ExecutorLite predictor;
lite::Predictor predictor;
#ifndef LITE_WITH_CUDA
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
......
......@@ -30,7 +30,7 @@ namespace lite {
#ifdef LITE_WITH_ARM
TEST(MobileNetV1, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -30,7 +30,7 @@ namespace lite {
#ifdef LITE_WITH_ARM
TEST(MobileNetV2, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -30,7 +30,7 @@ namespace lite {
#ifdef LITE_WITH_ARM
TEST(ResNet50, test) {
DeviceInfo::Init();
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
......
......@@ -25,7 +25,7 @@ cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite)
cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite})
cc_library(cpu_info_lite SRCS cpu_info.cc)
lite_cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite eigen3)
cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite
cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite kernel_lite
cpp_op_desc_lite ${tensor_lite})
cc_library(types_lite SRCS types.cc)
cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite)
......
......@@ -13,6 +13,7 @@
// limitations under the License.
#include "paddle/fluid/lite/core/kernel.h"
#include <cstdlib>
namespace paddle {
namespace lite {
......@@ -49,6 +50,36 @@ std::string KernelBase::GenParamTypeKey() const {
return ss.str();
}
void KernelBase::ParseKernelType(const std::string &kernel_type,
std::string *op_type, std::string *alias,
Place *place) {
std::stringstream ss(kernel_type);
std::getline(ss, *op_type, '/');
std::getline(ss, *alias, '/');
std::string target, precision, layout;
std::getline(ss, target, '/');
std::getline(ss, precision, '/');
std::getline(ss, layout, '/');
place->target = static_cast<TargetType>(std::atoi(target.c_str()));
place->precision = static_cast<PrecisionType>(std::atoi(precision.c_str()));
place->layout = static_cast<DataLayoutType>(std::atoi(layout.c_str()));
}
std::string KernelBase::SerializeKernelType(const std::string &op_type,
const std::string &alias,
const Place &place) {
std::stringstream ss;
ss << op_type << "/";
ss << alias << "/";
// We serialize the place value not the string representation here for
// easier deserialization.
ss << static_cast<int>(place.target) << "/";
ss << static_cast<int>(place.precision) << "/";
ss << static_cast<int>(place.layout);
return ss.str();
}
bool ParamTypeRegistry::KeyCmp::operator()(
const ParamTypeRegistry::key_t &a,
const ParamTypeRegistry::key_t &b) const {
......
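
The serialized kernel type above takes the form op_type/alias/target/precision/layout, with the three place fields stored as integers so they can be parsed back with atoi. A hedged round-trip sketch follows; the DATALAYOUT macro and the concrete enum values are assumptions, not spelled out in this diff.

```cpp
// Round-trip sketch for the kernel-type string used by the light API. The
// numeric values of the place fields depend on the enum definitions and are
// not spelled out here.
#include <string>

#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/utils/cp_logging.h"

namespace paddle {
namespace lite {

void KernelTypeRoundTrip() {
  Place place;
  place.target = TARGET(kARM);
  place.precision = PRECISION(kFloat);
  place.layout = DATALAYOUT(kNCHW);  // DATALAYOUT assumed, by analogy

  // e.g. "fc/def/<target>/<precision>/<layout>" with numeric place fields.
  std::string type = KernelBase::SerializeKernelType("fc", "def", place);

  std::string op_type, alias;
  Place parsed;
  KernelBase::ParseKernelType(type, &op_type, &alias, &parsed);
  CHECK_EQ(op_type, "fc");
  CHECK_EQ(alias, "def");
  CHECK(parsed.target == place.target);
  CHECK(parsed.precision == place.precision);
  CHECK(parsed.layout == place.layout);
}

}  // namespace lite
}  // namespace paddle
```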
......@@ -118,33 +118,11 @@ class KernelBase {
static std::string SerializeKernelType(const std::string& op_type,
const std::string& alias,
const Place& place) {
std::stringstream ss;
ss << op_type << "/";
ss << alias << "/";
// We serialize the place value not the string representation here for
// easier deserialization.
ss << static_cast<int>(place.target) << "/";
ss << static_cast<int>(place.precision) << "/";
ss << static_cast<int>(place.layout);
return ss.str();
}
const Place& place);
static void ParseKernelType(const std::string& kernel_type,
std::string* op_type, std::string* alias,
Place* place) {
std::stringstream ss(kernel_type);
std::getline(ss, *op_type, '/');
std::getline(ss, *alias, '/');
std::string target, precision, layout;
std::getline(ss, target, '/');
std::getline(ss, precision, '/');
std::getline(ss, layout, '/');
place->target = static_cast<TargetType>(std::stoi(target));
place->precision = static_cast<PrecisionType>(std::stoi(precision));
place->layout = static_cast<DataLayoutType>(std::stoi(layout));
}
Place* place);
virtual ~KernelBase() = default;
void Torch() {}
......
......@@ -28,7 +28,7 @@ namespace lite {
namespace mir {
TEST(fc_fuse_pass, fuse_test) {
lite::ExecutorLite predictor;
lite::Predictor predictor;
#ifndef LITE_WITH_CUDA
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
......@@ -69,7 +69,7 @@ TEST(fc_fuse_pass, fuse_test) {
#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
TEST(fc_fuse_pass, save_model_test) {
lite::ExecutorLite predictor;
lite::Predictor predictor;
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kX86), PRECISION(kFloat)}});
predictor.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)},
......
......@@ -51,5 +51,3 @@ set(arm_kernels
)
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include "paddle/fluid/lite/core/op_registry.h"
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
......@@ -12,14 +12,33 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
/*
* ATTENTION this header file can only be included in a .cc file.
*/
#pragma once
#include "paddle/fluid/lite/core/op_registry.h"
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_X86
USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
......@@ -36,21 +55,6 @@ USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
#endif
#ifdef LITE_WITH_CUDA
USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);
......
......@@ -50,4 +50,3 @@ set(x86_kernels
)
set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels")
......@@ -13,9 +13,10 @@
// limitations under the License.
#pragma once
/*
* ATTENTION this header file can only be included in a .cc file.
*/
// ATTENTION This can only be included in a .cc file.
#include "paddle/fluid/lite/core/op_registry.h"
USE_LITE_OP(mul);
USE_LITE_OP(fc);
......
......@@ -85,8 +85,8 @@ function build_test_server {
# test_arm_android <some_test_name> <adb_port_number>
function test_arm_android {
test_name=$1
port=$2
local test_name=$1
local port=$2
if [[ "${test_name}x" == "x" ]]; then
echo "test_name can not be empty"
exit 1
......@@ -99,12 +99,18 @@ function test_arm_android {
echo "test name: ${test_name}"
adb_work_dir="/data/local/tmp"
skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite")
skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite" "test_light_api")
for skip_name in ${skip_list[@]} ; do
[[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return
done
testpath=$(find ./paddle/fluid -name ${test_name})
local testpath=$(find ./paddle/fluid -name ${test_name})
# if [[ "$test_name" == "test_light_api" ]]; then
# local model_path=$(find . -name "lite_naive_model")
# arm_push_necessary_file $port $model_path $adb_work_dir
# fi
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}"
......@@ -204,6 +210,7 @@ function test_arm {
abi=$2
lang=$3
port=$4
if [[ ${os} == "armlinux" ]]; then
# TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf
echo "Skip test arm linux yet. armlinux must in another docker"
......@@ -221,6 +228,7 @@ function test_arm {
return 0
fi
echo "test file: ${TESTS_FILE}"
for _test in $(cat $TESTS_FILE); do
test_arm_android $_test $port
......@@ -242,6 +250,14 @@ function prepare_emulator {
sleep 1m
}
function arm_push_necessary_file {
local port=$1
local testpath=$2
local adb_work_dir=$3
adb -s emulator-${port} push ${testpath} ${adb_work_dir}
}
# We split the arm unittests into several sub-tasks to parallelize them and reduce the overall CI time.
# sub-task1
......@@ -286,20 +302,22 @@ function build_test_arm_subtask_armlinux {
prepare_emulator $port_armv8 $port_armv7
cur=$PWD
# job 5
build_arm "armlinux" "armv8"
test_arm "armlinux" "armv8"
cd -
build_arm "armlinux" "armv8" "gcc" $port_armv8
test_arm "armlinux" "armv8" "gcc" $port_armv8
cd $cur
# job 6
build_arm "armlinux" "armv7"
test_arm "armlinux" "armv7"
cd -
build_arm "armlinux" "armv7" "gcc" $port_armv8
test_arm "armlinux" "armv7" "gcc" $port_armv8
cd $cur
# job 7
build_arm "armlinux" "armv7hf"
test_arm "armlinux" "armv7hf"
cd -
build_arm "armlinux" "armv7hf" "gcc" $port_armv8
test_arm "armlinux" "armv7hf" "gcc" $port_armv8
cd $cur
adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
echo "Done"
......
......@@ -14,15 +14,18 @@
#pragma once
#include <sys/stat.h>
#ifndef LITE_WITH_ARM
#include <bits/stdc++.h>
#endif
#include <fstream>
#include <string>
#include "paddle/fluid/lite/utils/cp_logging.h"
#include "paddle/fluid/lite/utils/string.h"
namespace paddle {
namespace lite {
static bool IsFileExists(const std::string &path) {
static bool IsFileExists(const std::string& path) {
std::ifstream file(path);
bool res = file.is_open();
if (res) {
......@@ -31,5 +34,13 @@ static bool IsFileExists(const std::string &path) {
return res;
}
// ARM mobile does not support mkdir in C++
#ifndef LITE_WITH_ARM
static void MkDirRecur(const std::string& path) {
CHECK_EQ(system(string_format("mkdir -p %s", path.c_str()).c_str()), 0)
<< "Cann't mkdir " << path;
}
#endif
} // namespace lite
} // namespace paddle
......@@ -74,5 +74,15 @@ static std::string Repr(const std::vector<std::string>& v) {
return "{" + Join(tmp, ",") + "}";
}
static std::vector<std::string> Split(const std::string& s, char delim) {
std::stringstream ss(s);
std::string line;
std::vector<std::string> res;
while (std::getline(ss, line, delim)) {
res.push_back(line);
}
return res;
}
} // namespace lite
} // namespace paddle
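
The new Split helper pairs naturally with the '/'-delimited kernel-type strings introduced elsewhere in this change. A small usage sketch; the numeric fields in the example string are illustrative.

```cpp
// Usage sketch for the new Split utility; the input mirrors the serialized
// kernel-type format "op_type/alias/target/precision/layout".
#include <string>
#include <vector>

#include "paddle/fluid/lite/utils/cp_logging.h"
#include "paddle/fluid/lite/utils/string.h"

void SplitExample() {
  std::vector<std::string> fields = paddle::lite::Split("fc/def/4/1/1", '/');
  // fields == {"fc", "def", "4", "1", "1"}  (numbers are illustrative)
  CHECK_EQ(fields.size(), 5u);
}
```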