diff --git a/.gitlab-ci.yml b/.gitlab-ci.yml index f656e065a065ab65d461ba2901a548fcf9b4e42a..7771b24872fc7bac9b0a02c12b103b005da12dbe 100755 --- a/.gitlab-ci.yml +++ b/.gitlab-ci.yml @@ -78,6 +78,7 @@ build:mobile_android: paths: - $MOBILE_LITE_CACHE0 - $MOBILE_LITE_CACHE1 + - $MOBILE_LITE_CACHE2 - ~/.ccache - $CI_PROJECT_DIR/build_mobile_ccache script: @@ -98,6 +99,7 @@ build:mobile_armlinux: paths: - $MOBILE_LITE_CACHE0 - $MOBILE_LITE_CACHE1 + - $MOBILE_LITE_CACHE2 - ~/.ccache - $CI_PROJECT_DIR/build_mobile_ccache2 script: @@ -107,24 +109,13 @@ build:mobile_armlinux: dependencies: - build:server - cache: - key: mobile_thirdparty - paths: - - $MOBILE_LITE_CACHE0 - - $MOBILE_LITE_CACHE1 - - ~/.ccache build:mobile_model_mobilenetv1: tags: - lite stage: build_mobile image: $MOBILE_LITE_DOCKER_IMAGE - cache: - key: mobile_thirdparty - paths: - - $MOBILE_LITE_CACHE0 - - $MOBILE_LITE_CACHE1 - - ~/.ccache + script: - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv1 - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv1 @@ -137,6 +128,7 @@ build:mobile_model_mobilenetv1: paths: - $MOBILE_LITE_CACHE0 - $MOBILE_LITE_CACHE1 + - $MOBILE_LITE_CACHE2 - ~/.ccache - $CI_PROJECT_DIR/build_mobile_model_mobilenetv1 @@ -145,12 +137,7 @@ build:mobile_model_mobilenetv2: - lite stage: build_mobile image: $MOBILE_LITE_DOCKER_IMAGE - cache: - key: mobile_thirdparty - paths: - - $MOBILE_LITE_CACHE0 - - $MOBILE_LITE_CACHE1 - - ~/.ccache + script: - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv2 - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv2 @@ -163,6 +150,7 @@ build:mobile_model_mobilenetv2: paths: - $MOBILE_LITE_CACHE0 - $MOBILE_LITE_CACHE1 + - $MOBILE_LITE_CACHE2 - ~/.ccache - $CI_PROJECT_DIR/build_mobile_model_mobilenetv2 @@ -171,12 +159,7 @@ build:mobile_model_resnet50: - lite stage: build_mobile image: $MOBILE_LITE_DOCKER_IMAGE - cache: - key: mobile_thirdparty - paths: - - $MOBILE_LITE_CACHE0 - - $MOBILE_LITE_CACHE1 - - ~/.ccache + script: - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_resnet50 - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_resnet50 @@ -189,6 +172,7 @@ build:mobile_model_resnet50: paths: - $MOBILE_LITE_CACHE0 - $MOBILE_LITE_CACHE1 + - $MOBILE_LITE_CACHE2 - ~/.ccache - $CI_PROJECT_DIR/build_mobile_model_resnet50 diff --git a/paddle/fluid/lite/CMakeLists.txt b/paddle/fluid/lite/CMakeLists.txt index e2a8984b459ce135a81170bcc3f293deafc61bb6..c43f055cec278fb70f3027ba2044459efebbe663 100644 --- a/paddle/fluid/lite/CMakeLists.txt +++ b/paddle/fluid/lite/CMakeLists.txt @@ -24,8 +24,7 @@ function(lite_download_and_uncompress INSTALL_DIR URL FILENAME) ${EXTERNAL_PROJECT_NAME} ${EXTERNAL_PROJECT_LOG_ARGS} PREFIX ${INSTALL_DIR} - DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && - ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} + DOWNLOAD_COMMAND wget --no-check-certificate -q -O ${INSTALL_DIR}/${FILENAME} ${URL}/${FILENAME} && ${CMAKE_COMMAND} -E tar xzf ${INSTALL_DIR}/${FILENAME} DOWNLOAD_DIR ${INSTALL_DIR} DOWNLOAD_NO_PROGRESS 1 CONFIGURE_COMMAND "" @@ -143,6 +142,8 @@ function(lite_cc_binary TARGET) HVY_DEPS ${args_HVY_DEPS} ) cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS}) + # collect targets need to compile for lite + add_dependencies(lite_compile_deps ${TARGET}) endfunction() # Add a unit-test name to file for latter offline manual test. 
diff --git a/paddle/fluid/lite/api/CMakeLists.txt b/paddle/fluid/lite/api/CMakeLists.txt index 3cac3eeba6d4aef3d7af88979e79ee0cbf5b2efe..4440acd61b8e64bfdeccf455f641cb57eb0cdcdf 100644 --- a/paddle/fluid/lite/api/CMakeLists.txt +++ b/paddle/fluid/lite/api/CMakeLists.txt @@ -12,7 +12,6 @@ lite_cc_library(lite_api_test_helper SRCS lite_api_test_helper.cc CUDA_DEPS kernels_cuda X86_DEPS ${x86_kernels} ) -lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS lite_api_test_helper) set(light_api_deps scope_lite target_wrapper_host model_parser_lite program_lite) @@ -21,27 +20,34 @@ if(LITE_WITH_CUDA) set(light_api_deps ${light_api_deps} target_wrapper_cuda) endif() -lite_cc_library(light_api_lite SRCS light_api.cc - DEPS ${light_api_deps} ${ops_lite} ${host_kernels} - ) - message(STATUS "get ops ${ops_lite}") message(STATUS "get Host kernels ${host_kernels}") message(STATUS "get ARM kernels ${arm_kernels}") +lite_cc_library(cxx_api_lite SRCS cxx_api.cc DEPS ${cxx_api_lite_deps} ${ops_lite} ${host_kernels} program_lite) + +lite_cc_library(light_api_lite SRCS light_api.cc + DEPS scope_lite target_wrapper_host model_parser_lite + ${light_api_deps} ${ops_lite} ${host_kernels} program_lite + CUDA_DEPS target_wrapper_cuda + X86_DEPS ${x86_kernels} operator + ARM_DEPS ${arm_kernels} + ) + include(ExternalProject) set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING "A path setting inference demo download directories.") if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc - DEPS cxx_api_lite mir_passes + DEPS cxx_api_lite mir_passes lite_api_test_helper ${ops_lite} ${host_kernels} ${x86_kernels} ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz) endif() + if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING) set(lite_model_test_DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${arm_kernels}) @@ -68,25 +74,20 @@ endif() # These tests needs CLI arguments, and is not supported in ARM CI. # TODO(Superjomn) support latter. 
-if(NOT LITE_ON_MOBILE) - lite_cc_test(test_light_api SRCS light_api_test.cc - DEPS light_api_lite mir_passes - X86_DEPS ${x86_kernels} - ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt - SERIAL) +lite_cc_test(test_light_api SRCS light_api_test.cc + DEPS light_api_lite program_lite mir_passes + ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt + SERIAL) +if(NOT LITE_ON_MOBILE) lite_cc_test(test_apis_lite SRCS apis_test.cc - DEPS cxx_api_lite light_api_lite ${ops_lite} mir_passes - X86_DEPS ${x86_kernels} + DEPS cxx_api_lite light_api_lite ${ops_lite} + X86_DEPS ${x86_kernels} operator ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL) endif() -lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc - DEPS - cxx_api_lite - model_parser_lite - target_wrapper_host - mir_passes - ${ops_lite} ${host_kernels} - ARM_DEPS ${arm_kernels}) +#lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc + #X86_DEPS operator + #DEPS light_api_lite model_parser_lite target_wrapper_host mir_passes + #ARM_DEPS ${arm_kernels}) diff --git a/paddle/fluid/lite/api/apis_test.cc b/paddle/fluid/lite/api/apis_test.cc index 7dd6a1193754437a32957f081b3be3fd5c1fc403..0b8e9550a104aeda94147ecdb9032424aa0baab1 100644 --- a/paddle/fluid/lite/api/apis_test.cc +++ b/paddle/fluid/lite/api/apis_test.cc @@ -39,7 +39,7 @@ void SetConstInput(lite::Tensor* x) { } } -bool CompareTensors(const std::string& name, const ExecutorLite& cxx_api, +bool CompareTensors(const std::string& name, const Predictor& cxx_api, const LightPredictor& light_api) { const auto* a = cxx_api.GetTensor(name); const auto* b = light_api.GetTensor(name); @@ -48,8 +48,8 @@ bool CompareTensors(const std::string& name, const ExecutorLite& cxx_api, #ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK TEST(CXXApi_LightApi, save_and_load_model) { - lite::ExecutorLite cxx_api; - lite::LightPredictor light_api; + lite::Predictor cxx_api; + lite::LightPredictor light_api(FLAGS_optimized_model); // CXXAPi { @@ -69,8 +69,6 @@ TEST(CXXApi_LightApi, save_and_load_model) { // LightApi { - light_api.Build(FLAGS_optimized_model); - auto* x = light_api.GetInput(0); SetConstInput(x); diff --git a/paddle/fluid/lite/api/cxx_api.cc b/paddle/fluid/lite/api/cxx_api.cc index 1ea8be2c0b588ed58c82a70f4ef9263c46d15654..7c6ffccfa0bdab393c6870283834c76c5d1a2668 100644 --- a/paddle/fluid/lite/api/cxx_api.cc +++ b/paddle/fluid/lite/api/cxx_api.cc @@ -17,19 +17,49 @@ #include #include #include -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -#include "paddle/fluid/platform/port.h" -#endif +#include "paddle/fluid/lite/utils/io.h" namespace paddle { namespace lite { -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK -void ExecutorLite::SaveModel(const std::string &dir) { - MkDirRecursively(dir.c_str()); +void Predictor::SaveModel(const std::string &dir) { +#ifndef LITE_WITH_ARM + LOG(INFO) << "Save model to " << dir; + MkDirRecur(dir); program_->PersistModel(dir, program_desc_); -} +#else + LOG(INFO) << "Save model to ./"; + program_->PersistModel("./", program_desc_); #endif +} + +lite::Tensor *Predictor::GetInput(size_t offset) { + auto *_feed_list = program_->exec_scope()->FindVar("feed"); + CHECK(_feed_list) << "no feed variable in exec_scope"; + auto *feed_list = _feed_list->GetMutable>(); + if (offset >= feed_list->size()) { + feed_list->resize(offset + 1); + } + return &feed_list->at(offset); +} + +const lite::Tensor *Predictor::GetOutput(size_t offset) { + auto *_fetch_list = program_->exec_scope()->FindVar("fetch"); + 
CHECK(_fetch_list) << "no fetch variable in exec_scope"; + auto &fetch_list = *_fetch_list->GetMutable>(); + CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow"; + return &fetch_list.at(offset); +} + +void Predictor::Build(const std::string &model_path, const Place &prefer_place, + const std::vector &valid_places) { + LoadModel(model_path, scope_.get(), &program_desc_); + Build(program_desc_, prefer_place, valid_places); +} + +const framework::proto::ProgramDesc &Predictor::program_desc() const { + return program_desc_; +} } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/api/cxx_api.h b/paddle/fluid/lite/api/cxx_api.h index 915a469a58765f102ff01c28ed9856d185311168..e7b74a04da25ba3d228aba78d9a5ce9d0909d708 100644 --- a/paddle/fluid/lite/api/cxx_api.h +++ b/paddle/fluid/lite/api/cxx_api.h @@ -26,20 +26,20 @@ namespace paddle { namespace lite { -struct Config {}; - -class ExecutorLite { +/* + * Predictor for inference. Given a model, it optimizes and executes it. + */ +class Predictor { public: - ExecutorLite() { scope_ = std::make_shared(); } - explicit ExecutorLite(const std::shared_ptr& root_scope) { - scope_ = root_scope; - } + // Create an empty predictor. + Predictor() { scope_ = std::make_shared(); } + // Create a predictor with the weight variable scope set. + explicit Predictor(const std::shared_ptr& root_scope) + : scope_(root_scope) {} + // Build from a model, with places set for hardware config. void Build(const std::string& model_path, const Place& prefer_place, - const std::vector& valid_places) { - LoadModel(model_path, scope_.get(), &program_desc_); - Build(program_desc_, prefer_place, valid_places); - } + const std::vector& valid_places); void Build(const framework::proto::ProgramDesc& desc, const Place& prefer_place, @@ -55,40 +55,24 @@ class ExecutorLite { program_ = optimizer_.GenRuntimeProgram(); } -// This method is disabled in mobile, or unnecessary dependencies required. -#ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK - void SaveModel(const std::string& dir); -#endif + // Run the predictor for a single batch of data. + void Run() { program_->Run(); } - // Get offset-th col of feed. - lite::Tensor* GetInput(size_t offset) { - auto* _feed_list = program_->exec_scope()->FindVar("feed"); - CHECK(_feed_list) << "no feed variable in exec_scope"; - auto* feed_list = _feed_list->GetMutable>(); - if (offset >= feed_list->size()) { - feed_list->resize(offset + 1); - } - return &feed_list->at(offset); - } + // Get offset-th col of feed inputs. + lite::Tensor* GetInput(size_t offset); - const lite::Tensor* GetOutput(size_t offset) { - auto* _fetch_list = program_->exec_scope()->FindVar("fetch"); - CHECK(_fetch_list) << "no fatch variable in exec_scope"; - auto& fetch_list = *_fetch_list->GetMutable>(); - CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow"; - return &fetch_list.at(offset); - } + // Get offset-th col of fetch results. + const lite::Tensor* GetOutput(size_t offset); + // Return the program desc for debug. + const framework::proto::ProgramDesc& program_desc() const; const lite::Tensor* GetTensor(const std::string& name) const { auto* var = program_->exec_scope()->FindVar(name); return &var->Get(); } - void Run() { program_->Run(); } - - const framework::proto::ProgramDesc& program_desc() const { - return program_desc_; - } + // This method is disabled on mobile because it would pull in unnecessary dependencies.
+ void SaveModel(const std::string& dir); private: Optimizer optimizer_; @@ -97,6 +81,7 @@ class ExecutorLite { std::unique_ptr program_; }; +#ifdef LITE_WITH_X86 /* * An executor for training. * @@ -120,13 +105,13 @@ class CXXTrainer { : scope_(root_scope), preferred_place_(preferred_place), valid_places_(valid_places), - main_program_executor_(ExecutorLite(scope_)) {} + main_program_executor_(Predictor(scope_)) {} // Build the RuntimeProgram cache for the main program. The cache will run // multiple times for the epoches. // NOTE Just support to execute the 0-th block currently. - ExecutorLite& BuildMainProgramExecutor( - const framework::proto::ProgramDesc& desc, int block_id = 0) { + Predictor& BuildMainProgramExecutor(const framework::proto::ProgramDesc& desc, + int block_id = 0) { main_program_executor_.Build(desc, preferred_place_, valid_places_); return main_program_executor_; } @@ -134,7 +119,7 @@ class CXXTrainer { // Run the startup program. It just executes once, no cache needed. void RunStartupProgram(const framework::proto::ProgramDesc& desc, int block_id = 0) { - ExecutorLite exe(scope_); + Predictor exe(scope_); exe.Build(desc, preferred_place_, valid_places_); exe.Run(); } @@ -146,8 +131,9 @@ class CXXTrainer { std::vector valid_places_; // The training program. - ExecutorLite main_program_executor_; + Predictor main_program_executor_; }; +#endif } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/api/cxx_api_bin.cc b/paddle/fluid/lite/api/cxx_api_bin.cc index 58cf5dd785efc5de02e746e0ef1d5609a7c120a5..36f6ed45a10653aec74658a3c4794954d65dd1f5 100644 --- a/paddle/fluid/lite/api/cxx_api_bin.cc +++ b/paddle/fluid/lite/api/cxx_api_bin.cc @@ -34,7 +34,7 @@ void Run(const char* model_dir, int repeat, int thread_num) { DeviceInfo::Init(); DeviceInfo::Global().SetRunMode(LITE_POWER_HIGH, thread_num); #endif - lite::ExecutorLite predictor; + lite::Predictor predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)}}); diff --git a/paddle/fluid/lite/api/cxx_api_test.cc b/paddle/fluid/lite/api/cxx_api_test.cc index 093f8b73055fd0e9a8caed33430460b68cb8fbea..a1a028a5453a25f025bb55a4f81d4b94445480bb 100644 --- a/paddle/fluid/lite/api/cxx_api_test.cc +++ b/paddle/fluid/lite/api/cxx_api_test.cc @@ -42,7 +42,7 @@ TEST(CXXApi, test) { } TEST(CXXApi, save_model) { - lite::ExecutorLite predictor; + lite::Predictor predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kX86), PRECISION(kFloat)}}); predictor.Build(FLAGS_model_dir, Place{TARGET(kCUDA), PRECISION(kFloat)}, diff --git a/paddle/fluid/lite/api/inceptionv4_test.cc b/paddle/fluid/lite/api/inceptionv4_test.cc index b0f0aaf3c13abe9e5fb02c8a47c29a66842008af..7908a8110045c53ad6f0b4f33702dccf58e5b1b5 100644 --- a/paddle/fluid/lite/api/inceptionv4_test.cc +++ b/paddle/fluid/lite/api/inceptionv4_test.cc @@ -30,7 +30,7 @@ namespace lite { #ifdef LITE_WITH_ARM TEST(InceptionV4, test) { DeviceInfo::Init(); - lite::ExecutorLite predictor; + lite::Predictor predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)}}); diff --git a/paddle/fluid/lite/api/light_api.cc b/paddle/fluid/lite/api/light_api.cc index 9d3da3a5919e9cb07fbfc67dc0c7538d96775db4..6a7e20a053c8d355289f51636966f55eb429b897 100644 --- a/paddle/fluid/lite/api/light_api.cc +++ b/paddle/fluid/lite/api/light_api.cc @@ -13,3 +13,67 @@ // limitations under the License. 
#include "paddle/fluid/lite/api/light_api.h" + +namespace paddle { +namespace lite { + +void LightPredictor::Build(const std::string& model_dir) { + framework::proto::ProgramDesc desc; + LoadModel(model_dir, scope_.get(), &desc); + BuildRuntimeProgram(desc); +} + +Tensor* LightPredictor::GetInput(size_t offset) { + auto* _feed_list = program_->exec_scope()->FindVar("feed"); + CHECK(_feed_list) << "no feed variable in exec_scope"; + auto* feed_list = _feed_list->GetMutable>(); + if (offset >= feed_list->size()) { + feed_list->resize(offset + 1); + } + return &feed_list->at(offset); +} + +const Tensor* LightPredictor::GetOutput(size_t offset) { + auto* _fetch_list = program_->exec_scope()->FindVar("fetch"); + CHECK(_fetch_list) << "no fatch variable in exec_scope"; + auto& fetch_list = *_fetch_list->GetMutable>(); + CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow"; + return &fetch_list.at(offset); +} + +void LightPredictor::BuildRuntimeProgram( + const framework::proto::ProgramDesc& prog) { + std::vector insts; + // 1. Create op first + Program program(prog, scope_, {}); + + // 2. Create Instructs + + // Create the kernels of the target places, and filter out the specific + // kernel with the target alias. + for (auto& op : program.ops()) { + auto kernel_type = op->op_info()->GetAttr(kKernelTypeAttr); + std::string op_type, alias; + Place place; + KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); + auto kernels = op->CreateKernels({place}); + // filter out a kernel + auto it = std::find_if( + kernels.begin(), kernels.end(), + [&](std::unique_ptr& it) { return it->alias() == alias; }); + CHECK(it != kernels.end()); + (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); + insts.emplace_back(op, std::move(*it)); + } + program_.reset(new RuntimeProgram(std::move(insts))); + CHECK(program.exec_scope()); + program_->set_exec_scope(program.exec_scope()); +} + +LightPredictor::LightPredictor(const std::string& model_dir) { + scope_ = std::make_shared(); + Build(model_dir); +} + +} // namespace lite +} // namespace paddle diff --git a/paddle/fluid/lite/api/light_api.h b/paddle/fluid/lite/api/light_api.h index 5085909385c94e2e81b2cfa14167e8ce886060a3..bf1d7e95a3d90c8db3090815b150926551f63113 100644 --- a/paddle/fluid/lite/api/light_api.h +++ b/paddle/fluid/lite/api/light_api.h @@ -32,36 +32,21 @@ namespace paddle { namespace lite { +/* + * The light weight predictor, mainly for mobile. It loads an optimized model, + * and will not depend on the MIR or perform latter optimization. + */ class LightPredictor { public: - LightPredictor() { scope_ = std::make_shared(); } - - void Build(const std::string& model_dir) { - framework::proto::ProgramDesc desc; - LoadModel(model_dir, scope_.get(), &desc); - BuildRuntimeProgram(desc); - } + explicit LightPredictor(const std::string& model_dir); void Run() { program_->Run(); } - // Get offset-th col of feed. - Tensor* GetInput(size_t offset) { - auto* _feed_list = program_->exec_scope()->FindVar("feed"); - CHECK(_feed_list) << "no feed variable in exec_scope"; - auto* feed_list = _feed_list->GetMutable>(); - if (offset >= feed_list->size()) { - feed_list->resize(offset + 1); - } - return &feed_list->at(offset); - } + // Get offset-th col of feed inputs. 
+ Tensor* GetInput(size_t offset); - const Tensor* GetOutput(size_t offset) { - auto* _fetch_list = program_->exec_scope()->FindVar("fetch"); - CHECK(_fetch_list) << "no fatch variable in exec_scope"; - auto& fetch_list = *_fetch_list->GetMutable>(); - CHECK_LT(offset, fetch_list.size()) << "offset " << offset << " overflow"; - return &fetch_list.at(offset); - } + // Get offset-th col of fetch outputs. + const Tensor* GetOutput(size_t offset); const lite::Tensor* GetTensor(const std::string& name) const { auto* var = program_->exec_scope()->FindVar(name); @@ -69,34 +54,8 @@ class LightPredictor { } private: - void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog) { - std::vector insts; - // 1. Create op first - Program program(prog, scope_, {}); - - // 2. Create Instructs - - // Create the kernels of the target places, and filter out the specific - // kernel with the target alias. - for (auto& op : program.ops()) { - auto kernel_type = op->op_info()->GetAttr(kKernelTypeAttr); - std::string op_type, alias; - Place place; - KernelBase::ParseKernelType(kernel_type, &op_type, &alias, &place); - auto kernels = op->CreateKernels({place}); - // filter out a kernel - auto it = std::find_if(kernels.begin(), kernels.end(), - [&](std::unique_ptr& it) { - return it->alias() == alias; - }); - CHECK(it != kernels.end()); - (*it)->SetContext(ContextScheduler::Global().NewContext((*it)->target())); - insts.emplace_back(op, std::move(*it)); - } - program_.reset(new RuntimeProgram(std::move(insts))); - CHECK(program.exec_scope()); - program_->set_exec_scope(program.exec_scope()); - } + void Build(const std::string& model_dir); + void BuildRuntimeProgram(const framework::proto::ProgramDesc& prog); private: std::shared_ptr scope_; diff --git a/paddle/fluid/lite/api/light_api_test.cc b/paddle/fluid/lite/api/light_api_test.cc index faf53b8177a4d11fb33017599ecdb9dc650fbc43..d7e58fbe56cee4055c422af9a8881e664cc26605 100644 --- a/paddle/fluid/lite/api/light_api_test.cc +++ b/paddle/fluid/lite/api/light_api_test.cc @@ -25,8 +25,10 @@ namespace paddle { namespace lite { TEST(LightAPI, load) { - LightPredictor predictor; - predictor.Build(FLAGS_optimized_model); + if (FLAGS_optimized_model.empty()) { + FLAGS_optimized_model = "lite_naive_model"; + } + LightPredictor predictor(FLAGS_optimized_model); auto* input_tensor = predictor.GetInput(0); input_tensor->Resize(DDim(std::vector({100, 100}))); diff --git a/paddle/fluid/lite/api/lite_api_test_helper.cc b/paddle/fluid/lite/api/lite_api_test_helper.cc index b82541723308f4748e28c64affa6899bf2d9b727..3c0835bc49b32a336848e9b9e88ea2afa3f1c698 100644 --- a/paddle/fluid/lite/api/lite_api_test_helper.cc +++ b/paddle/fluid/lite/api/lite_api_test_helper.cc @@ -22,7 +22,7 @@ namespace paddle { namespace lite { const lite::Tensor* RunHvyModel() { - lite::ExecutorLite predictor; + lite::Predictor predictor; #ifndef LITE_WITH_CUDA std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kX86), PRECISION(kFloat)}}); diff --git a/paddle/fluid/lite/api/mobilenetv1_test.cc b/paddle/fluid/lite/api/mobilenetv1_test.cc index 527b387a4260b46f8033ce7e8a1b8b5ae91a7928..94935e8699643577b309fb294a18ea848a5ad567 100644 --- a/paddle/fluid/lite/api/mobilenetv1_test.cc +++ b/paddle/fluid/lite/api/mobilenetv1_test.cc @@ -30,7 +30,7 @@ namespace lite { #ifdef LITE_WITH_ARM TEST(MobileNetV1, test) { DeviceInfo::Init(); - lite::ExecutorLite predictor; + lite::Predictor predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kARM), 
PRECISION(kFloat)}}); diff --git a/paddle/fluid/lite/api/mobilenetv2_test.cc b/paddle/fluid/lite/api/mobilenetv2_test.cc index 8a1ccdf4d37755559b80aba08010ec1ae6eb0578..0d615f61f267a612a32e5a0535d6272f2c867769 100644 --- a/paddle/fluid/lite/api/mobilenetv2_test.cc +++ b/paddle/fluid/lite/api/mobilenetv2_test.cc @@ -30,7 +30,7 @@ namespace lite { #ifdef LITE_WITH_ARM TEST(MobileNetV2, test) { DeviceInfo::Init(); - lite::ExecutorLite predictor; + lite::Predictor predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)}}); diff --git a/paddle/fluid/lite/api/resnet50_test.cc b/paddle/fluid/lite/api/resnet50_test.cc index c4c214d6cdb462b7d95cbfd0f1787dab8d359a47..cb63ad83cab40842fe799496ff8881f51ea953ae 100644 --- a/paddle/fluid/lite/api/resnet50_test.cc +++ b/paddle/fluid/lite/api/resnet50_test.cc @@ -30,7 +30,7 @@ namespace lite { #ifdef LITE_WITH_ARM TEST(ResNet50, test) { DeviceInfo::Init(); - lite::ExecutorLite predictor; + lite::Predictor predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kARM), PRECISION(kFloat)}}); diff --git a/paddle/fluid/lite/core/CMakeLists.txt b/paddle/fluid/lite/core/CMakeLists.txt index 1e95668cddc722e32ea784fe2331380ea3a3940e..f6d48c2bea52040a924561812fb092df412a0c15 100644 --- a/paddle/fluid/lite/core/CMakeLists.txt +++ b/paddle/fluid/lite/core/CMakeLists.txt @@ -25,7 +25,7 @@ cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite) cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite}) cc_library(cpu_info_lite SRCS cpu_info.cc) lite_cc_library(context_lite SRCS context.cc DEPS ${tensor_lite} any_lite cpu_info_lite eigen3) -cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite +cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapper_lite kernel_lite cpp_op_desc_lite ${tensor_lite}) cc_library(types_lite SRCS types.cc) cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite) diff --git a/paddle/fluid/lite/core/kernel.cc b/paddle/fluid/lite/core/kernel.cc index 44b00f53d018ffe9431c7b481fb1bc1a6e1f7cdc..0dae1394290c34cddcf8b2f22868fa326f1974fd 100644 --- a/paddle/fluid/lite/core/kernel.cc +++ b/paddle/fluid/lite/core/kernel.cc @@ -13,6 +13,7 @@ // limitations under the License. #include "paddle/fluid/lite/core/kernel.h" +#include namespace paddle { namespace lite { @@ -49,6 +50,36 @@ std::string KernelBase::GenParamTypeKey() const { return ss.str(); } +void KernelBase::ParseKernelType(const std::string &kernel_type, + std::string *op_type, std::string *alias, + Place *place) { + std::stringstream ss(kernel_type); + std::getline(ss, *op_type, '/'); + std::getline(ss, *alias, '/'); + std::string target, precision, layout; + std::getline(ss, target, '/'); + std::getline(ss, precision, '/'); + std::getline(ss, layout, '/'); + + place->target = static_cast(std::atoi(target.c_str())); + place->precision = static_cast(std::atoi(precision.c_str())); + place->layout = static_cast(std::atoi(layout.c_str())); +} + +std::string KernelBase::SerializeKernelType(const std::string &op_type, + const std::string &alias, + const Place &place) { + std::stringstream ss; + ss << op_type << "/"; + ss << alias << "/"; + // We serialize the place value not the string representation here for + // easier deserialization. 
+ ss << static_cast(place.target) << "/"; + ss << static_cast(place.precision) << "/"; + ss << static_cast(place.layout); + return ss.str(); +} + bool ParamTypeRegistry::KeyCmp::operator()( const ParamTypeRegistry::key_t &a, const ParamTypeRegistry::key_t &b) const { diff --git a/paddle/fluid/lite/core/kernel.h b/paddle/fluid/lite/core/kernel.h index d7b296eec12a27281b84701e1daa7ca09829fc47..0ef46b65870b11077dcda2cd1833b3eb67a562fa 100644 --- a/paddle/fluid/lite/core/kernel.h +++ b/paddle/fluid/lite/core/kernel.h @@ -118,33 +118,11 @@ class KernelBase { static std::string SerializeKernelType(const std::string& op_type, const std::string& alias, - const Place& place) { - std::stringstream ss; - ss << op_type << "/"; - ss << alias << "/"; - // We serialize the place value not the string representation here for - // easier deserialization. - ss << static_cast(place.target) << "/"; - ss << static_cast(place.precision) << "/"; - ss << static_cast(place.layout); - return ss.str(); - } + const Place& place); static void ParseKernelType(const std::string& kernel_type, std::string* op_type, std::string* alias, - Place* place) { - std::stringstream ss(kernel_type); - std::getline(ss, *op_type, '/'); - std::getline(ss, *alias, '/'); - std::string target, precision, layout; - std::getline(ss, target, '/'); - std::getline(ss, precision, '/'); - std::getline(ss, layout, '/'); - - place->target = static_cast(std::stoi(target)); - place->precision = static_cast(std::stoi(precision)); - place->layout = static_cast(std::stoi(layout)); - } + Place* place); virtual ~KernelBase() = default; void Torch() {} diff --git a/paddle/fluid/lite/core/mir/fusion/fc_fuse_pass_test.cc b/paddle/fluid/lite/core/mir/fusion/fc_fuse_pass_test.cc index 44189e3d1ed5e58807bb577a477a5ee68ac11a80..9d2c9fbc7dc9d0e7c591b189308795d3f783e112 100644 --- a/paddle/fluid/lite/core/mir/fusion/fc_fuse_pass_test.cc +++ b/paddle/fluid/lite/core/mir/fusion/fc_fuse_pass_test.cc @@ -28,7 +28,7 @@ namespace lite { namespace mir { TEST(fc_fuse_pass, fuse_test) { - lite::ExecutorLite predictor; + lite::Predictor predictor; #ifndef LITE_WITH_CUDA std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kX86), PRECISION(kFloat)}}); @@ -69,7 +69,7 @@ TEST(fc_fuse_pass, fuse_test) { #ifndef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK TEST(fc_fuse_pass, save_model_test) { - lite::ExecutorLite predictor; + lite::Predictor predictor; std::vector valid_places({Place{TARGET(kHost), PRECISION(kFloat)}, Place{TARGET(kX86), PRECISION(kFloat)}}); predictor.Build(FLAGS_model_dir, Place{TARGET(kX86), PRECISION(kFloat)}, diff --git a/paddle/fluid/lite/kernels/arm/CMakeLists.txt b/paddle/fluid/lite/kernels/arm/CMakeLists.txt index 337fd846cbddac2fe53da1faf79b0479a215a576..21d3aa564acae69ecf3d50267fe916e6fc5432c6 100644 --- a/paddle/fluid/lite/kernels/arm/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/arm/CMakeLists.txt @@ -51,5 +51,3 @@ set(arm_kernels ) set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels") - - diff --git a/paddle/fluid/lite/kernels/arm/use_kernels.h b/paddle/fluid/lite/kernels/arm/use_kernels.h deleted file mode 100644 index 1a6583f3f570e688080b1bb1a96217c25ca4bcc9..0000000000000000000000000000000000000000 --- a/paddle/fluid/lite/kernels/arm/use_kernels.h +++ /dev/null @@ -1,25 +0,0 @@ -// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved. -// -// Licensed under the Apache License, Version 2.0 (the "License"); -// you may not use this file except in compliance with the License. 
-// You may obtain a copy of the License at -// -// http://www.apache.org/licenses/LICENSE-2.0 -// -// Unless required by applicable law or agreed to in writing, software -// distributed under the License is distributed on an "AS IS" BASIS, -// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -// See the License for the specific language governing permissions and -// limitations under the License. - -#pragma once -#include "paddle/fluid/lite/core/op_registry.h" - -USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(pool, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(feed, kARM, kAny, kAny, def); -USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def); diff --git a/paddle/fluid/lite/kernels/use_kernels.h b/paddle/fluid/lite/kernels/use_kernels.h index d44069e14e0d6bcaf73c09d41e107d970d8acecb..09395abab523accd0bc4f95c75d0b9b23f1e8999 100644 --- a/paddle/fluid/lite/kernels/use_kernels.h +++ b/paddle/fluid/lite/kernels/use_kernels.h @@ -12,14 +12,33 @@ // See the License for the specific language governing permissions and // limitations under the License. -#pragma once /* * ATTENTION this header file can only include in .cc file. */ +#pragma once +#include "paddle/fluid/lite/core/op_registry.h" + USE_LITE_KERNEL(feed, kHost, kAny, kAny, def); USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def); +#ifdef LITE_WITH_ARM +USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(split, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(transpose, kARM, kFloat, kNCHW, def); +USE_LITE_KERNEL(transpose2, kARM, kFloat, kNCHW, def); +#endif + #ifdef LITE_WITH_X86 USE_LITE_KERNEL(relu, kX86, kFloat, kNCHW, def); USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def); @@ -36,21 +55,6 @@ USE_LITE_KERNEL(depthwise_conv2d, kX86, kFloat, kNCHW, def); USE_LITE_KERNEL(pool2d, kX86, kFloat, kNCHW, def); #endif -#ifdef LITE_WITH_ARM -USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(conv2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(batch_norm, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(relu, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def); -USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def); -#endif - #ifdef LITE_WITH_CUDA USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def); USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device); diff --git a/paddle/fluid/lite/kernels/x86/CMakeLists.txt b/paddle/fluid/lite/kernels/x86/CMakeLists.txt index 
f66818b2e9dacd8e8aae3506a2f4f12b1b117cdb..fb3ea29260480738297d5416aab2d346412b3490 100644 --- a/paddle/fluid/lite/kernels/x86/CMakeLists.txt +++ b/paddle/fluid/lite/kernels/x86/CMakeLists.txt @@ -44,10 +44,9 @@ set(x86_kernels softmax_compute_x86 dropout_compute_x86 concat_compute_x86 - conv_compute_x86 - pool_compute_x86 - batch_norm_compute_x86 + conv_compute_x86 + pool_compute_x86 + batch_norm_compute_x86 ) set(x86_kernels "${x86_kernels}" CACHE INTERNAL "x86 kernels") - diff --git a/paddle/fluid/lite/operators/use_ops.h b/paddle/fluid/lite/operators/use_ops.h index 933b3c849a390c335bd914c476c61636c607aa41..316e08ad4784849865b3d7722dfb7d1935d51247 100644 --- a/paddle/fluid/lite/operators/use_ops.h +++ b/paddle/fluid/lite/operators/use_ops.h @@ -13,9 +13,10 @@ // limitations under the License. #pragma once -/* - * ATTENTION this header file can only include in .cc file. - */ + +// ATTENTION This can only include in a .cc file. + +#include "paddle/fluid/lite/core/op_registry.h" USE_LITE_OP(mul); USE_LITE_OP(fc); diff --git a/paddle/fluid/lite/tools/build.sh b/paddle/fluid/lite/tools/build.sh index 4436d91cdfd782ac6cbed9768c85a7bf01bead71..5094cee5b4504105bf899d08ab420d2833022f9a 100755 --- a/paddle/fluid/lite/tools/build.sh +++ b/paddle/fluid/lite/tools/build.sh @@ -85,8 +85,8 @@ function build_test_server { # test_arm_android function test_arm_android { - test_name=$1 - port=$2 + local test_name=$1 + local port=$2 if [[ "${test_name}x" == "x" ]]; then echo "test_name can not be empty" exit 1 @@ -99,12 +99,18 @@ function test_arm_android { echo "test name: ${test_name}" adb_work_dir="/data/local/tmp" - skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite") + skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite" "test_light_api") for skip_name in ${skip_list[@]} ; do [[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return done - testpath=$(find ./paddle/fluid -name ${test_name}) + local testpath=$(find ./paddle/fluid -name ${test_name}) + + # if [[ "$test_name" == "test_light_api" ]]; then + # local model_path=$(find . -name "lite_naive_model") + # arm_push_necessary_file $port $model_path $adb_work_dir + # fi + adb -s emulator-${port} push ${testpath} ${adb_work_dir} adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}" adb -s emulator-${port} shell "./${adb_work_dir}/${test_name}" @@ -204,6 +210,7 @@ function test_arm { abi=$2 lang=$3 port=$4 + if [[ ${os} == "armlinux" ]]; then # TODO(hongming): enable test armlinux on armv8, armv7 and armv7hf echo "Skip test arm linux yet. armlinux must in another docker" @@ -221,6 +228,7 @@ function test_arm { return 0 fi + echo "test file: ${TESTS_FILE}" for _test in $(cat $TESTS_FILE); do test_arm_android $_test $port @@ -242,6 +250,14 @@ function prepare_emulator { sleep 1m } +function arm_push_necessary_file { + local port=$1 + local testpath=$2 + local adb_work_dir=$3 + + adb -s emulator-${port} push ${testpath} ${adb_work_dir} +} + # We split the arm unittest into several sub-tasks to parallel and reduce the overall CI timetime. 
# sub-task1 @@ -286,20 +302,22 @@ function build_test_arm_subtask_armlinux { prepare_emulator $port_armv8 $port_armv7 + cur=$PWD + # job 5 - build_arm "armlinux" "armv8" - test_arm "armlinux" "armv8" - cd - + build_arm "armlinux" "armv8" "gcc" $port_armv8 + test_arm "armlinux" "armv8" "gcc" $port_armv8 + cd $cur # job 6 - build_arm "armlinux" "armv7" - test_arm "armlinux" "armv7" - cd - + build_arm "armlinux" "armv7" "gcc" $port_armv8 + test_arm "armlinux" "armv7" "gcc" $port_armv8 + cd $cur # job 7 - build_arm "armlinux" "armv7hf" - test_arm "armlinux" "armv7hf" - cd - + build_arm "armlinux" "armv7hf" "gcc" $port_armv8 + test_arm "armlinux" "armv7hf" "gcc" $port_armv8 + cd $cur adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done echo "Done" diff --git a/paddle/fluid/lite/utils/io.h b/paddle/fluid/lite/utils/io.h index 4dba6f984292235d3f947477b09152bc37c2adb9..4e64ee1d4e4b016fadf40167fb96557e96061fba 100644 --- a/paddle/fluid/lite/utils/io.h +++ b/paddle/fluid/lite/utils/io.h @@ -14,15 +14,18 @@ #pragma once -#include +#ifndef LITE_WITH_ARM +#include +#endif #include #include #include "paddle/fluid/lite/utils/cp_logging.h" +#include "paddle/fluid/lite/utils/string.h" namespace paddle { namespace lite { -static bool IsFileExists(const std::string &path) { +static bool IsFileExists(const std::string& path) { std::ifstream file(path); bool res = file.is_open(); if (res) { @@ -31,5 +34,13 @@ static bool IsFileExists(const std::string &path) { return res; } +// ARM mobile does not support mkdir in C++ +#ifndef LITE_WITH_ARM +static void MkDirRecur(const std::string& path) { + CHECK_EQ(system(string_format("mkdir -p %s", path.c_str()).c_str()), 0) + << "Can't mkdir " << path; +} +#endif + } // namespace lite } // namespace paddle diff --git a/paddle/fluid/lite/utils/string.h b/paddle/fluid/lite/utils/string.h index 31b131276bfa220f85a9a7606d504b6d330425b2..5e918bf5f841b3f8d18ccf9ff94721534ec6a698 100644 --- a/paddle/fluid/lite/utils/string.h +++ b/paddle/fluid/lite/utils/string.h @@ -74,5 +74,15 @@ static std::string Repr(const std::vector& v) { return "{" + Join(tmp, ",") + "}"; } +static std::vector Split(const std::string& s, char delim) { + std::stringstream ss(s); + std::string line; + std::vector res; + while (std::getline(ss, line, delim)) { + res.push_back(line); + } + return res; +} + } // namespace lite } // namespace paddle
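Taken together, the API changes above rename `ExecutorLite` to `Predictor` and give `LightPredictor` a constructor that loads the optimized model directly. The sketch below is pieced together from the calls visible in `cxx_api_test.cc`, `fc_fuse_pass_test.cc`, and `light_api_test.cc` in this patch; the model paths, the exact `Place` list, and the tensor-filling helpers (`mutable_data<float>()`, the `int64_t` shape type) are illustrative assumptions, not part of the patch itself.

```cpp
// Minimal usage sketch of the post-patch heavy (Predictor) and light
// (LightPredictor) APIs. Paths and places are placeholders.
#include <vector>

#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/api/light_api.h"

namespace paddle {
namespace lite {

void RunDemo() {
  // 1. Heavy API: load a raw model, optimize it, and persist the result
  //    (SaveModel is the method moved out-of-line into cxx_api.cc above).
  Predictor predictor;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                   Place{TARGET(kX86), PRECISION(kFloat)}});
  predictor.Build("lite_naive_model",  // hypothetical model directory
                  Place{TARGET(kX86), PRECISION(kFloat)}, valid_places);
  predictor.SaveModel("lite_naive_model_opt");

  // 2. Light API: the constructor now takes the optimized model directory,
  //    replacing the separate Build() call removed by this patch.
  LightPredictor light_predictor("lite_naive_model_opt");
  auto* input = light_predictor.GetInput(0);
  input->Resize(DDim(std::vector<int64_t>({100, 100})));
  auto* data = input->mutable_data<float>();  // assumed Tensor helper
  for (int i = 0; i < 100 * 100; ++i) data[i] = 1.f;

  light_predictor.Run();
  const Tensor* output = light_predictor.GetOutput(0);
  (void)output;
}

}  // namespace lite
}  // namespace paddle
```

This mirrors the flow the tests exercise: the heavy predictor runs the MIR optimizations and writes an optimized model, which the lightweight predictor then loads on mobile without depending on MIR at all.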