Unverified commit 59122809, authored by Yan Chunwei, committed by GitHub

code clean - refine ARM compile (#17590)

* code clean - refine ARM

cmake enhancement:

- add lite_cc_library and lite_cc_test

code clean:

- remove ARM feed and fetch kernels, reuse the Host's

remove unnecessary comments
Parent commit: 310fd514
@@ -121,7 +121,8 @@ endif()
# for lite, both server and mobile framework.
option(WITH_LITE "Enable lite framework" OFF)
option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
option(LITE_WITH_X86 "Enable X86 in lite mode" ON)
option(LITE_WITH_ARM "Enable ARM in lite mode" OFF)
option(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK "Enable light-weight framework" OFF)
......
@@ -172,6 +172,10 @@ if (LITE_WITH_X86)
add_definitions("-DLITE_WITH_X86")
endif()
if (LITE_WITH_ARM)
add_definitions("-DLITE_WITH_ARM")
endif()
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
add_definitions("-DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK")
endif()
@@ -427,7 +427,7 @@ function(raw_cc_test TARGET_NAME)
endif()
endfunction(raw_cc_test)
function(lite_cc_test args)
function(_lite_cc_test args)
if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
message(STATUS "building lite raw test: ${args}")
raw_cc_test(${args} ${ARGN})
......
@@ -39,6 +39,10 @@ DEFINE_int32(inner_op_parallelism, 0, "number of threads for inner op");
namespace paddle {
namespace framework {
OpDuppy op_duppy;
Scope scope_duppy;
RuntimeContext runtime_context_duppy({}, {});
std::vector<std::tuple<platform::Place, LibraryType>> kKernelPriority = {
std::make_tuple(platform::CUDAPlace(0), LibraryType::kCUDNN),
std::make_tuple(platform::CUDAPlace(0), LibraryType::kPlain),
......
@@ -239,9 +239,10 @@ class OpDuppy : public OperatorBase {
void RunImpl(const Scope& scope,
const platform::Place& place) const override {}
};
OpDuppy op_duppy;
Scope scope_duppy;
RuntimeContext runtime_context_duppy({}, {});
extern OpDuppy op_duppy;
extern Scope scope_duppy;
extern RuntimeContext runtime_context_duppy;
class ExecutionContext {
public:
@@ -255,7 +256,7 @@ class ExecutionContext {
ctx_(ctx),
kernel_configs_(configs) {}
ExecutionContext(const platform::DeviceContext& device_context)
explicit ExecutionContext(const platform::DeviceContext& device_context)
: op_(op_duppy),
scope_(scope_duppy),
device_context_(device_context),
......
@@ -3,9 +3,10 @@ if (NOT WITH_LITE)
endif()
message(WARNING "Lite enabled!")
message(STATUS "LIGHT_FRAMEWORK: ${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}")
message(STATUS "LITE_WITH_CUDA: ${LITE_WITH_CUDA}")
message(STATUS "LITE_WITH_X86: ${LITE_WITH_X86}")
message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}")
message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}")
message(STATUS "LITE_WITH_X86:\t${LITE_WITH_X86}")
message(STATUS "LITE_WITH_ARM:\t${LITE_WITH_ARM}")
set(LITE_MODEL_DIR "${THIRD_PARTY_PATH}/install")
@@ -29,6 +30,65 @@ function(lite_download_and_uncompress INSTALL_DIR URL FILENAME)
)
endfunction()
function (lite_deps DEPS)
set(options "")
set(oneValueArgs "")
set(multiValueArgs DEPS X86_DEPS CUDA_DEPS ARM_DEPS)
cmake_parse_arguments(lite_deps "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(${DEPS} ${lite_deps_DEPS} PARENT_SCOPE)
if(LITE_WITH_X86)
foreach(var ${lite_deps_X86_DEPS})
set(${DEPS} ${${DEPS}} ${var} PARENT_SCOPE)
endforeach()
endif()
if(LITE_WITH_CUDA)
foreach(var ${lite_deps_CUDA_DEPS})
set(${DEPS} ${${DEPS}} ${var} PARENT_SCOPE)
endforeach()
endif()
if(LITE_WITH_ARM)
foreach(var ${lite_deps_ARM_DEPS})
set(${DEPS} ${${DEPS}} ${var} PARENT_SCOPE)
endforeach()
endif()
endfunction()
function(lite_cc_library TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps "")
lite_deps(deps
DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
ARM_DEPS ${args_ARM_DEPS}
)
cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
endfunction()
function(lite_cc_test TARGET)
set(options "")
set(oneValueArgs "")
set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS)
cmake_parse_arguments(args "${options}" "${oneValueArgs}" "${multiValueArgs}" ${ARGN})
set(deps "")
lite_deps(deps
DEPS ${args_DEPS}
X86_DEPS ${args_X86_DEPS}
CUDA_DEPS ${args_CUDA_DEPS}
ARM_DEPS ${args_ARM_DEPS}
)
_lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
endfunction()
add_subdirectory(core)
add_subdirectory(x86)
@@ -39,4 +99,3 @@ add_subdirectory(kernels)
add_subdirectory(model_parser)
add_subdirectory(utils)
add_subdirectory(api)
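As a usage sketch of the new lite_cc_library / lite_cc_test helpers defined above (the example_* target and file names are made up for illustration, not part of this commit): platform-specific dependencies go into the X86_DEPS / CUDA_DEPS / ARM_DEPS lists and are folded into the final dependency set only when the corresponding LITE_WITH_* option is ON.
# Hypothetical example: the ARM and CUDA deps are linked only when
# LITE_WITH_ARM / LITE_WITH_CUDA are enabled at configure time.
lite_cc_library(example_lite SRCS example.cc
                DEPS memory_lite
                X86_DEPS target_wrapper_x86
                CUDA_DEPS target_wrapper_cuda
                ARM_DEPS example_arm_math)
# The test helper resolves deps the same way, then calls _lite_cc_test.
lite_cc_test(test_example_lite SRCS example_test.cc DEPS example_lite)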
@@ -25,22 +25,8 @@ namespace lite {
void Run(const char* model_dir) {
lite::ExecutorLite predictor;
// #ifndef LITE_WITH_CUDA
// std::vector<Place> valid_places({Place{TARGET(kHost),
// PRECISION(kFloat)}});
// #elif defined(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
// #else
// std::vector<Place> valid_places({
// Place{TARGET(kHost), PRECISION(kFloat), DATALAYOUT(kNCHW)},
// Place{TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW)},
// Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kNCHW)},
// Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kNCHW)},
// Place{TARGET(kCUDA), PRECISION(kAny), DATALAYOUT(kAny)},
// Place{TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny)},
// });
// #endif
std::vector<Place> valid_places({Place{TARGET(kARM), PRECISION(kFloat)}});
std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
Place{TARGET(kARM), PRECISION(kFloat)}});
predictor.Build(model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
valid_places);
@@ -52,8 +38,6 @@ void Run(const char* model_dir) {
data[i] = i;
}
LOG(INFO) << "input " << *input_tensor;
predictor.Run();
auto* out = predictor.GetOutput(0);
@@ -61,7 +45,7 @@ void Run(const char* model_dir) {
LOG(INFO) << "out " << out->data<float>()[0];
LOG(INFO) << "out " << out->data<float>()[1];
LOG(INFO) << "dims " << out->dims();
LOG(INFO) << "out " << *out;
LOG(INFO) << "out data size: " << out->data_size();
}
} // namespace lite
@@ -79,12 +63,18 @@ USE_LITE_OP(fc);
USE_LITE_OP(scale);
USE_LITE_OP(feed);
USE_LITE_OP(fetch);
// USE_LITE_OP(io_copy);
USE_LITE_OP(io_copy);
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(fc, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kARM, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
// USE_LITE_KERNEL(feed, kARM, kAny, kAny, def);
// USE_LITE_KERNEL(fetch, kARM, kAny, kAny, def);
#endif // LITE_WITH_ARM
#ifdef LITE_WITH_CUDA
USE_LITE_KERNEL(mul, kCUDA, kFloat, kNCHW, def);
......
cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest)
cc_library(memory_lite SRCS memory.cc DEPS target_wrapper_lite target_wrapper_host)
cc_library(target_wrapper_lite SRCS target_wrapper.cc)
if (WITH_TESTING)
cc_library(lite_gtest_main SRCS lite_gtest_main.cc DEPS gtest)
endif()
cc_library(memory_lite SRCS memory.cc DEPS target_wrapper_lite)
lite_cc_library(target_wrapper_lite SRCS target_wrapper.cc DEPS target_wrapper_host X86_DEPS target_wrapper_x86 CUDA_DEPS target_wrapper_cuda)
cc_library(lite_tensor SRCS lite_tensor.cc DEPS memory_lite target_wrapper_lite)
if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
cc_library(hvy_tensor SRCS hvy_tensor.cc DEPS lod_tensor)
@@ -40,10 +42,10 @@ cc_library(program_fake_utils SRCS program_fake_utils.cc DEPS mir_ssa_graph
)
lite_cc_test(test_scope_lite SRCS scope_test.cc DEPS scope_lite)
lite_cc_test(test_kernel_lite SRCS kernel_test.cc DEPS kernel_lite target_wrapper_x86)
lite_cc_test(test_kernel_lite SRCS kernel_test.cc DEPS kernel_lite target_wrapper_lite)
lite_cc_test(test_op_lite SRCS op_lite_test.cc DEPS op_lite)
lite_cc_test(test_tensor_lite SRCS lite_tensor_test.cc DEPS lite_tensor)
lite_cc_test(test_type_system SRCS type_system_test.cc DEPS type_system utils_lite)
#lite_cc_test(test_optimizer_lite SRCS optimizer_test.cc DEPS mir_pass_manager program_fake_utils mir_passes optimizer_lite fc_op_lite)
lite_cc_test(test_types_lite SRCS types_test.cc DEPS types_lite)
lite_cc_test(test_memory_lite SRCS memory_test.cc DEPS memory_lite)
@@ -15,5 +15,65 @@
#include "paddle/fluid/lite/core/memory.h"
namespace paddle {
namespace lite {} // namespace lite
namespace lite {
void* TargetMalloc(TargetType target, size_t size) {
void* data{nullptr};
switch (target) {
case TargetType::kHost:
case TargetType::kX86:
case TargetType::kARM:
data = TargetWrapper<TARGET(kHost)>::Malloc(size);
break;
#ifdef LITE_WITH_CUDA
case TargetType::kCUDA:
data =
TargetWrapper<TARGET(kCUDA), cudaStream_t, cudaEvent_t>::Malloc(size);
break;
#endif // LITE_WITH_CUDA
default:
LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
}
return data;
}
void TargetFree(TargetType target, void* data) {
switch (target) {
case TargetType::kHost:
case TargetType::kX86:
case TargetType::kARM:
TargetWrapper<TARGET(kHost)>::Free(data);
break;
#ifdef LITE_WITH_CUDA
case TargetType::kCUDA:
TargetWrapper<TARGET(kCUDA), cudaStream_t, cudaEvent_t>::Free(data);
break;
#endif // LITE_WITH_CUDA
default:
LOG(FATAL) << "Unknown type";
}
}
void TargetCopy(TargetType target, void* dst, const void* src, size_t size) {
switch (target) {
case TargetType::kHost:
case TargetType::kX86:
case TargetType::kARM:
TargetWrapper<TARGET(kHost)>::MemcpySync(dst, src, size,
IoDirection::DtoD);
break;
#ifdef LITE_WITH_CUDA
case TargetType::kCUDA:
TargetWrapper<TARGET(kCUDA)>::MemcpySync(dst, src, size,
IoDirection::DtoD);
break;
#endif
default:
LOG(FATAL) << "unsupported type";
}
}
} // namespace lite
} // namespace paddle
@@ -18,57 +18,16 @@
namespace paddle {
namespace lite {
static void* TargetMalloc(TargetType target, size_t size) {
void* data{nullptr};
switch (target) {
case TargetType::kHost:
#ifdef LITE_WITH_X86
case TargetType::kX86:
#endif
data = TargetWrapper<TARGET(kHost)>::Malloc(size);
break;
#ifdef LITE_WITH_CUDA
case TargetType::kCUDA:
data =
TargetWrapper<TARGET(kCUDA), cudaStream_t, cudaEvent_t>::Malloc(size);
break;
#endif // LITE_WITH_CUDA
default:
LOG(FATAL) << "Unknown supported target " << TargetToStr(target);
}
return data;
}
static void TargetFree(TargetType target, void* data) {
switch (static_cast<int>(target)) {
case static_cast<int>(TargetType::kX86):
TargetWrapper<TARGET(kX86)>::Free(data);
break;
case static_cast<int>(TargetType::kCUDA):
TargetWrapper<TARGET(kX86)>::Free(data);
break;
default:
LOG(FATAL) << "Unknown type";
}
}
// Malloc memory for a specific Target. All the targets should be an element in
// the `switch` here.
void* TargetMalloc(TargetType target, size_t size);
static void TargetCopy(TargetType target, void* dst, const void* src,
size_t size) {
switch (target) {
case TargetType::kX86:
case TargetType::kHost:
TargetWrapper<TARGET(kHost)>::MemcpySync(dst, src, size,
IoDirection::DtoD);
break;
// Free memory for a specific Target. All the targets should be an element in
// the `switch` here.
void TargetFree(TargetType target, void* data);
case TargetType::kCUDA:
TargetWrapper<TARGET(kCUDA)>::MemcpySync(dst, src, size,
IoDirection::DtoD);
break;
default:
LOG(FATAL) << "unsupported type";
}
}
// Copy a buffer from host to another target.
void TargetCopy(TargetType target, void* dst, const void* src, size_t size);
// Memory buffer manager.
class Buffer {
......
@@ -12,4 +12,23 @@
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/host/relu_compute.h"
#include "paddle/fluid/lite/core/memory.h"
#include <gtest/gtest.h>
namespace paddle {
namespace lite {
TEST(memory, test) {
auto* buf = TargetMalloc(TARGET(kX86), 10);
ASSERT_TRUE(buf);
TargetFree(TARGET(kX86), buf);
#ifdef LITE_WITH_CUDA
auto* buf_cuda = TargetMalloc(TARGET(kCUDA), 10);
ASSERT_TRUE(buf_cuda);
TargetFree(TARGET(kCUDA), buf_cuda);
#endif
}
} // namespace lite
} // namespace paddle
@@ -4,4 +4,3 @@ endif()
nv_library(target_wrapper_cuda SRCS target_wrapper.cc)
nv_library(cuda_blas_lite SRCS blas.cc)
cc_library(target_wrapper_host SRCS target_wrapper.cc DEPS target_wrapper_lite)
cc_library(target_wrapper_host SRCS target_wrapper.cc)
if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
if(NOT (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM))
return()
endif()
@@ -9,14 +9,7 @@ cc_library(relu_compute_arm SRCS relu_compute.cc DEPS ${lite_kernel_deps})
cc_library(mul_compute_arm SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
cc_library(scale_compute_arm SRCS scale_compute.cc DEPS ${lite_kernel_deps} eigen3)
cc_library(feed_compute_arm SRCS feed_compute.cc DEPS ${lite_kernel_deps})
cc_library(fetch_compute_arm SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
# lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS ${lite_kernel_deps} fc_compute_arm)
set(arm_kernels
feed_compute_arm
fetch_compute_arm
fc_compute_arm
relu_compute_arm
mul_compute_arm
@@ -24,4 +17,3 @@ set(arm_kernels
)
set(arm_kernels "${arm_kernels}" CACHE INTERNAL "arm kernels")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class FeedCompute
: public KernelLite<TARGET(kARM), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
using param_t = operators::FeedParam;
void Run() override {
auto &param = Param<operators::FeedParam>();
LOG(INFO) << "feed_list.size: " << param.feed_list->size();
LOG(INFO) << "col " << param.col;
const lite::Tensor &feed_item = (*param.feed_list)[0];
param.out->ShareDataWith(feed_item);
LOG(INFO) << "FEED input " << feed_item << " col " << param.col;
LOG(INFO) << "FEED output " << *param.out;
}
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(feed, kARM, kAny, kAny,
paddle::lite::kernels::arm::FeedCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace arm {
class FetchCompute
: public KernelLite<TARGET(kARM), PRECISION(kAny), DATALAYOUT(kAny)> {
public:
using param_t = operators::FeedParam;
void Run() override {
auto& param = Param<operators::FetchParam>();
auto* fetch_list = param.fetch_list;
if (fetch_list->size() <= static_cast<size_t>(param.col)) {
fetch_list->resize(param.col + 1);
}
auto& dst = fetch_list->at(param.col);
dst.ShareDataWith(*param.input);
}
};
} // namespace arm
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(fetch, kARM, kAny, kAny,
paddle::lite::kernels::arm::FetchCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny),
DATALAYOUT(kAny), -1)})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM), PRECISION(kAny),
DATALAYOUT(kAny), -1)})
.Finalize();
@@ -59,9 +59,6 @@ class MulCompute : public KernelLite<TARGET(kARM), PRECISION(kFloat)> {
mul_compute_eigen(param.x->data<float>(), x_shape.x, x_shape.y, //
param.y->data<float>(), y_shape.x, y_shape.y, //
param.output->mutable_data<float>());
LOG(INFO) << "MUL x " << *param.x;
LOG(INFO) << "MUL W " << *param.y;
LOG(INFO) << "MUL out " << *param.output;
}
virtual ~MulCompute() = default;
......
message(STATUS "compile with lite host kernels")
cc_library(fc_compute_host SRCS fc_compute.cc DEPS ${lite_kernel_deps} eigen3)
cc_library(relu_compute_host SRCS relu_compute.cc DEPS ${lite_kernel_deps})
cc_library(mul_compute_host SRCS mul_compute.cc DEPS ${lite_kernel_deps} eigen3)
cc_library(scale_compute_host SRCS scale_compute.cc DEPS ${lite_kernel_deps} eigen3)
cc_library(feed_compute_host SRCS feed_compute.cc DEPS ${lite_kernel_deps})
cc_library(fetch_compute_host SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
@@ -10,11 +6,6 @@ cc_library(fetch_compute_host SRCS fetch_compute.cc DEPS ${lite_kernel_deps})
set(host_kernels
feed_compute_host
fetch_compute_host
fc_compute_host
relu_compute_host
mul_compute_host
scale_compute_host
)
set(host_kernels "${host_kernels}" CACHE INTERNAL "host kernels")
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/host/fc_compute.h"
#include <Eigen/Core>
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
// NOTE should use pure std C++ implementation.
void FcCompute::Run() {
auto& param = this->Param<operators::FcParam>();
CHECK_GE(param.input->dims().size(), 2UL);
CHECK_EQ(param.output->dims().size(), 2UL);
fc_compute_eigen(
param.input->data<float>(), // x
param.input->dims().Slice(0, param.in_num_col_dims).production(),
param.input->dims()
.Slice(param.in_num_col_dims, param.input->dims().size())
.production(),
param.w->data<float>(), // w
param.w->dims()[1], // w_w
param.w->dims()[0], // w_h
param.bias->data<float>(), // b
param.output->mutable_data<float>());
}
// TargetType FcCompute::target() const { return TARGET(kHost); }
// PrecisionType FcCompute::precision() const { return PRECISION(kFloat); }
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(fc, kHost, kFloat, kNCHW,
paddle::lite::kernels::host::FcCompute, def)
.BindInput("Input", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Bias", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("W", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <Eigen/Core>
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/operators/fc_op.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
class FcCompute : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
public:
using param_t = operators::FcParam;
void Run() override;
// TargetType target() const override;
// PrecisionType precision() const override;
virtual ~FcCompute() = default;
};
template <typename T>
void fc_compute_eigen(const T* x, int x_w, int x_h, //
const T* w, int w_w, int w_h, //
const T* b, //
T* out) {
using matrix_t =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
Eigen::Map<const matrix_t> X(x, x_h, x_w);
Eigen::Map<const matrix_t> W(w, w_h, w_w);
Eigen::Map<matrix_t> Out(out, x_h, w_h);
Out = X * W.transpose();
if (b) {
Eigen::Map<const Eigen::Matrix<T, Eigen::Dynamic, 1>> B(b, w_h);
Out = Out.array().rowwise() + B.transpose().array();
}
}
template <typename T>
__attribute__((optimize("unroll-loops"))) //
T dot(const T* x, const T* y, int dim) {
T out{};
for (int i = 0; i < dim; i++) {
out += x[i] * y[i];
}
return out;
}
template <typename T>
void fc_compute_naive(const T* x, int x_w, int x_h, //
const T* w, int w_w, int w_h, //
const T* b, //
T* out) {
CHECK_EQ(x_w, w_w);
// out shape: (x_h, w_w)
memset(out, 0, x_h * w_h * sizeof(T));
for (int r = 0; r < x_h; r++) {
for (int c = 0; c < w_h; c++) {
out[r * w_h + c] = dot(&x[r * x_w], &w[c * w_w], w_w) + b[c];
}
}
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "paddle/fluid/lite/kernels/host/fc_compute.h"
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
TEST(fc_compute_naive, test) {
lite::Tensor x, w, b, out, out1;
const int batch_size = 2;
x.Resize({batch_size, 3});
w.Resize({4, 3});
b.Resize({1, 4});
out.Resize({batch_size, 4});
out1.Resize({batch_size, 4});
auto x_data = x.mutable_data<float>();
auto w_data = w.mutable_data<float>();
auto b_data = b.mutable_data<float>();
auto out_data = out.mutable_data<float>();
auto out_data1 = out1.mutable_data<float>();
for (int i = 0; i < product(x.dims()); i++) x_data[i] = i;
for (int i = 0; i < product(w.dims()); i++) w_data[i] = i;
for (int i = 0; i < product(b.dims()); i++) b_data[i] = i;
fc_compute_naive(x_data, 3, batch_size, //
w_data, 3, 4, //
b_data, out_data);
fc_compute_eigen(x_data, 3, batch_size, //
w_data, 3, 4, //
b_data, out_data1);
for (int i = 0; i < product(out.dims()); i++) {
EXPECT_NEAR(out_data[0], out_data1[0], 1e-6);
}
}
TEST(fc_host, init) {
FcCompute fc;
ASSERT_EQ(fc.precision(), PRECISION(kFloat));
ASSERT_EQ(fc.target(), TARGET(kHost));
}
TEST(fc_host, algorithm) {
using matrix_t = Eigen::Matrix<float, Eigen::Dynamic, Eigen::Dynamic>;
using matrix_map_t = Eigen::Map<matrix_t>;
// dim 10, 20
std::vector<float> input(10 * 20);
std::vector<float> w(20 * 20);
std::vector<float> output(10 * 20);
Eigen::Map<const matrix_t> input_mat(input.data(), 10, 20);
Eigen::Map<const matrix_t> weight_mat(w.data(), 20, 20);
matrix_map_t output_mat(output.data(), 10, 20);
output_mat = weight_mat.transpose() * input_mat;
}
TEST(fc_host, compute) {
FcCompute fc;
operators::FcParam param;
lite::Tensor x;
lite::Tensor w;
lite::Tensor bias;
lite::Tensor output;
x.Resize(DDim(std::vector<int64_t>({1, 10, 20})));
w.Resize(DDim(std::vector<int64_t>({20, 20})));
bias.Resize(DDim(std::vector<int64_t>({1, 10})));
output.Resize(DDim(std::vector<int64_t>({10, 20})));
auto* x_data = x.mutable_data<float>();
auto* w_data = w.mutable_data<float>();
auto* bias_data = bias.mutable_data<float>();
auto* output_data = output.mutable_data<float>();
for (int i = 0; i < 10 * 20; i++) x_data[i] = i;
for (int i = 0; i < 20 * 20; i++) w_data[i] = i;
for (int i = 0; i < 10; i++) bias_data[i] = i;
for (int i = 0; i < 10 * 20; i++) output_data[i] = 0;
param.in_num_col_dims = 2;
param.input = &x;
param.w = &w;
param.bias = &bias;
param.output = &output;
param.in_mat_dims = x.dims();
fc.SetParam(param);
fc.Run();
LOG(INFO) << "x";
for (int i = 0; i < 10 * 20; i++) LOG(INFO) << x_data[i];
LOG(INFO) << "output:";
for (int i = 0; i < 10 * 20; i++) LOG(INFO) << output.data<float>()[i];
}
TEST(fc, retrive_op) {
auto fc =
KernelRegistry::Global().Create<TARGET(kHost), PRECISION(kFloat)>("fc");
ASSERT_TRUE(fc);
}
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
USE_LITE_KERNEL(fc, kHost, kFloat, kNCHW, def);
@@ -27,12 +27,12 @@ class FeedCompute
void Run() override {
auto &param = Param<operators::FeedParam>();
LOG(INFO) << "feed_list.size: " << param.feed_list->size();
LOG(INFO) << "col " << param.col;
VLOG(4) << "feed_list.size: " << param.feed_list->size();
VLOG(4) << "col " << param.col;
const lite::Tensor &feed_item = (*param.feed_list)[0];
param.out->ShareDataWith(feed_item);
LOG(INFO) << "FEED input " << feed_item << " col " << param.col;
LOG(INFO) << "FEED output " << *param.out;
VLOG(4) << "FEED input " << feed_item << " col " << param.col;
VLOG(4) << "FEED output " << *param.out;
}
};
......
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <Eigen/Core>
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
template <typename T>
void mul_compute_eigen(const T* x, int x_h, int x_w, const T* y, int y_h,
int y_w, T* out) {
using matrix_t =
Eigen::Matrix<T, Eigen::Dynamic, Eigen::Dynamic, Eigen::RowMajor>;
Eigen::Map<const matrix_t> X(x, x_h, x_w);
Eigen::Map<const matrix_t> Y(y, y_h, y_w);
Eigen::Map<matrix_t> Out(out, x_h, y_w);
Out = X * Y;
}
class MulCompute : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
void Run() override {
auto& param = Param<operators::MulParam>();
core::dim2 x_shape(
{static_cast<int>(
param.x->dims().Slice(0, param.x_num_col_dims).production()),
static_cast<int>(
param.x->dims()
.Slice(param.x_num_col_dims, param.x->dims().size())
.production())});
core::dim2 y_shape(
{static_cast<int>(
param.y->dims().Slice(0, param.y_num_col_dims).production()),
static_cast<int>(
param.y->dims()
.Slice(param.y_num_col_dims, param.y->dims().size())
.production())});
mul_compute_eigen(param.x->data<float>(), x_shape.x, x_shape.y, //
param.y->data<float>(), y_shape.x, y_shape.y, //
param.output->mutable_data<float>());
LOG(INFO) << "MUL x " << *param.x;
LOG(INFO) << "MUL W " << *param.y;
LOG(INFO) << "MUL out " << *param.output;
}
virtual ~MulCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(mul, kHost, kFloat, kNCHW,
paddle::lite::kernels::host::MulCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindInput("Y", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
class ReluCompute : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
public:
void Run() override {
auto& param = Param<operators::ReluParam>();
auto n = param.input->dims().production();
const float* input = param.input->data<float>();
float* output = param.output->mutable_data<float>();
for (int i = 0; i < n; i++) {
output[i] = std::max(0.f, input[i]);
}
}
TargetType target() const override { return TARGET(kHost); }
PrecisionType precision() const override { return PRECISION(kFloat); }
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(relu, kHost, kFloat, kNCHW,
paddle::lite::kernels::host::ReluCompute, def)
.Finalize();
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <Eigen/Core>
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/types.h"
namespace paddle {
namespace lite {
namespace kernels {
namespace host {
template <typename T>
void scale_compute(const T* x, T* out, int size, float scale, float bias,
bool bias_before) {
if (bias_before) bias *= scale;
for (int i = 0; i < size; i++) {
out[i] = x[i] * scale + bias;
}
}
class ScaleCompute : public KernelLite<TARGET(kHost), PRECISION(kFloat)> {
public:
using param_t = operators::MulParam;
void Run() override {
auto& param = Param<operators::ScaleParam>();
scale_compute(param.x->data<float>(), param.output->mutable_data<float>(),
param.x->dims().production(), param.scale, param.bias,
param.bias_after_scale);
}
virtual ~ScaleCompute() = default;
};
} // namespace host
} // namespace kernels
} // namespace lite
} // namespace paddle
REGISTER_LITE_KERNEL(scale, kHost, kFloat, kNCHW,
paddle::lite::kernels::host::ScaleCompute, def)
.BindInput("X", {LiteType::GetTensorTy(TARGET(kHost))})
.BindOutput("Out", {LiteType::GetTensorTy(TARGET(kHost))})
.Finalize();
@@ -15,8 +15,5 @@
#pragma once
#include "paddle/fluid/lite/core/op_registry.h"
USE_LITE_KERNEL(fc, kHost, kFloat, kNCHW, def);
USE_LITE_KERNEL(mul, kHost, kFloat, kNCHW, def);
USE_LITE_KERNEL(scale, kHost, kFloat, kNCHW, def);
USE_LITE_KERNEL(feed, kHost, kAny, kAny, def);
USE_LITE_KERNEL(fetch, kHost, kAny, kAny, def);
@@ -3,5 +3,4 @@ if(NOT LITE_WITH_X86)
endif()
cc_library(activation_compute SRCS activation_compute.cc DEPS ${lite_kernel_deps} activation_op)
cc_library(elementwise_compute SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} elementwise_op)
cc_library(elementwise_compute SRCS elementwise_compute.cc DEPS ${lite_kernel_deps} elementwise_sub_op)
@@ -21,5 +21,5 @@ set(ops_lite
io_copy_op_lite
PARENT_SCOPE)
lite_cc_test(test_fc_op_lite SRCS fc_op_test.cc DEPS fc_op_lite fc_compute_host)
lite_cc_test(test_fc_op_lite SRCS fc_op_test.cc DEPS fc_op_lite)