Lite/fix mobile combile2 (#18004)

2cee3058 · Yan Chunwei · GitHub · cbf59cbb · 2cee3058 · 2cee3058
12 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
--- a/paddle/fluid/lite/CMakeLists.txt
+++ b/paddle/fluid/lite/CMakeLists.txt
@@ -79,6 +79,10 @@ function (lite_deps TARGET)

 endfunction()

+# Record the names of the lite libraries for later compilation. This name list lets us build
+# only the lite targets instead of the whole fluid project, which speeds up compilation.
+set(offline_lib_registry_file "${CMAKE_BINARY_DIR}/lite_libs.txt")
+file(WRITE ${offline_lib_registry_file} "") # clean
 # cc_library with branch support.
 # The branches:
 #  X86_DEPS: works only when LITE_WITH_X86 is ON.
@@ -106,6 +110,9 @@ function(lite_cc_library TARGET)
            )

    cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS})
+
+    # register a library name.
+    file(APPEND ${offline_lib_registry_file} "${TARGET}\n")
 endfunction()

 function(lite_cc_binary TARGET)
@@ -131,10 +138,6 @@ endfunction()
 # Record each unit-test name in a file for later offline manual testing.
 set(offline_test_registry_file "${CMAKE_BINARY_DIR}/lite_tests.txt")
 file(WRITE ${offline_test_registry_file} "") # clean
-function (register_test_offline TARGET)
-  file(APPEND ${offline_test_registry_file} "${TARGET}\n")
-endfunction()
-
 # Test lite modules.
 function(lite_cc_test TARGET)
    set(options "")
@@ -155,7 +158,7 @@ function(lite_cc_test TARGET)
            HVY_DEPS ${args_HVY_DEPS}
            )
    _lite_cc_test(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ARGS ${args_ARGS})
-    register_test_offline("${TARGET}")
+    file(APPEND ${offline_test_registry_file} "${TARGET}\n")
 endfunction()

 add_subdirectory(core)

--- a/paddle/fluid/lite/core/CMakeLists.txt
+++ b/paddle/fluid/lite/core/CMakeLists.txt
@@ -19,7 +19,7 @@ endif()

 proto_library(framework_proto_lite SRCS framework.proto)

-cc_library(kernel_lite SRCS kernel.cc DEPS type_system target_wrapper_lite any_lite op_params_lite framework_proto_lite)
+cc_library(kernel_lite SRCS kernel.cc DEPS type_system target_wrapper_lite any_lite op_params_lite framework_proto_lite ${tensor_lite})
 cc_library(variable_lite SRCS variable.cc)
 cc_library(op_registry_lite SRCS op_registry.cc DEPS framework_proto_lite)
 cc_library(scope_lite SRCS scope.cc DEPS ${tensor_lite})
@@ -30,7 +30,7 @@ cc_library(op_lite SRCS op_lite.cc DEPS scope_lite op_registry_lite target_wrapp
 cc_library(types_lite SRCS types.cc)
 cc_library(type_system SRCS type_system.cc DEPS ${tensor_lite} target_wrapper_lite)

-cc_library(program_lite SRCS program.cc DEPS op_lite kernel_lite compatible_pb_lite model_parser_lite)
+lite_cc_library(program_lite SRCS program.cc DEPS op_lite kernel_lite compatible_pb_lite model_parser_lite HVY_DEPS framework_proto)
 cc_library(optimizer_lite SRCS optimizer.cc DEPS mir_pass_manager model_parser_lite program_lite)

 add_subdirectory(mir)

--- a/paddle/fluid/lite/core/mir/CMakeLists.txt
+++ b/paddle/fluid/lite/core/mir/CMakeLists.txt
@@ -28,31 +28,34 @@ cc_test(test_ssa_graph SRCS ssa_graph_test.cc DEPS
        mir_pass_manager
        program_fake_utils
        )
-set(test_variable_place_infrence_pass_DEPS
-        mul_op_lite
-        feed_op_lite
-        fetch_op_lite
-        io_copy_op_lite
-        ${host_kernels}
-        mir_passes
-        mir_pass_manager
-        optimizer_lite
-        program_fake_utils
-        target_wrapper_host
-        )
-if (LITE_WITH_CUDA)
-    set(test_variable_place_infrence_pass_DEPS
-            ${test_variable_place_infrence_pass_DEPS} target_wrapper_cuda
-            kernels_cuda
-            )
-endif()
-cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc DEPS
-        ${test_variable_place_infrence_pass_DEPS})
+# lite_cc_test(test_variable_place_infrence_pass SRCS variable_place_inference_pass_test.cc
+#   DEPS
+#       mul_op_lite
+#       feed_op_lite
+#       fetch_op_lite
+#       io_copy_op_lite
+#       ${host_kernels}
+#       mir_passes
+#       mir_pass_manager
+#       optimizer_lite
+#       program_fake_utils
+#       target_wrapper_host
+#   PROFILE_DEPS basic_profiler_lite
+#   CUDA_DEPS target_wrapper_cuda kernels_cuda
+#   ARM_DEPS mul_compute_arm
+#   X86_DEPS mul_compute_x86
+# )
+
+
+lite_cc_library(pattern_matcher_lite SRCS pattern_matcher.cc DEPS mir_node mir_ssa_graph op_lite)
+lite_cc_test(test_pattern_matcher_lite SRCS pattern_matcher_test.cc DEPS pattern_matcher_lite)

-cc_library(pattern_matcher_lite SRCS pattern_matcher.cc DEPS mir_node mir_ssa_graph op_lite)
-cc_test(test_pattern_matcher_lite SRCS pattern_matcher_test.cc DEPS pattern_matcher_lite)
+lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher_lite)

-cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher_lite)
-cc_test(test_pattern_matcher_high_api SRCS pattern_matcher_high_api_test.cc DEPS
+# TODO(wz) replace framework/proto with the lite proto.
+if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+    # Skipped for mobile: it depends on fluid/framework/proto, which is too heavy there.
+    lite_cc_test(test_pattern_matcher_high_api SRCS pattern_matcher_high_api_test.cc DEPS
        pattern_matcher_high_api proto_desc mir_pass_manager fc_op_lite mul_op_lite elementwise_ops_lite
        mir_passes compatible_pb_lite program_lite ${ops_lite})
+endif()
--- a/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc
+++ b/paddle/fluid/lite/core/mir/static_kernel_pick_pass.cc
@@ -37,6 +37,8 @@ void StaticKernelPickPass::Apply(const std::unique_ptr<SSAGraph>& graph) {
    if (!node.IsStmt()) continue;
    auto& instruct = node.AsStmt();
    std::vector<std::pair<size_t, std::unique_ptr<KernelBase>>> scored;
+    CHECK(!instruct.valid_kernels.empty()) << "No kernels found for "
+                                           << instruct.op_type;
    for (auto&& kernel : instruct.valid_kernels) {
      size_t score = KernelGrade(*kernel);
      scored.emplace_back(score, std::move(kernel));

--- a/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc
+++ b/paddle/fluid/lite/core/mir/variable_place_inference_pass_test.cc
@@ -42,6 +42,12 @@ TEST(variable_place_inference_pass, test) {
      Place{
          TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW),
      },
+      Place{
+          TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW),
+      },
+      Place{
+          TARGET(kX86), PRECISION(kAny), DATALAYOUT(kAny),
+      },
  });

  Program program(*desc->Proto(), scope, places);
@@ -58,7 +64,15 @@ TEST(variable_place_inference_pass, test) {
  });

  Place prefered_place{
+// Pick the preferred place with the same LITE_WITH_* macros that gate the
+// kernel registrations at the end of this file, so the preferred target
+// always matches a kernel that is actually linked in.
+#if defined(LITE_WITH_CUDA)
      TARGET(kCUDA), PRECISION(kFloat), DATALAYOUT(kNCHW),
+#elif defined(LITE_WITH_ARM)
+      TARGET(kARM), PRECISION(kFloat), DATALAYOUT(kNCHW),
+#else  // X86
+      TARGET(kX86), PRECISION(kFloat), DATALAYOUT(kNCHW),
+#endif
  };
  optimizer.KernelPickPreferPlace(prefered_place);
  optimizer.Run(std::move(program), places, factor, passes);
@@ -72,3 +86,16 @@ USE_LITE_OP(mul);
 USE_LITE_OP(feed);
 USE_LITE_OP(fetch);
 USE_LITE_OP(io_copy);
+
+#ifdef LITE_WITH_X86
+USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
+#endif
+
+#ifdef LITE_WITH_ARM
+USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
+#endif
+
+#ifdef LITE_WITH_CUDA
+USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, host_to_device);
+USE_LITE_KERNEL(io_copy, kCUDA, kAny, kAny, device_to_host);
+#endif
--- a/paddle/fluid/lite/core/program.h
+++ b/paddle/fluid/lite/core/program.h
@@ -55,8 +55,14 @@ struct Program {

  const std::list<std::string>& weights() const { return weights_; }
  const std::list<std::string>& tmp_vars() const { return tmp_vars_; }
+  std::list<std::string>* mutable_weights() { return &weights_; }
+  std::list<std::string>* mutable_tmp_vars() { return &tmp_vars_; }
+
  const std::list<std::shared_ptr<OpLite>>& ops() const { return ops_; }
+  std::list<std::shared_ptr<OpLite>>* mutable_ops() { return &ops_; }
+
  lite::Scope* exec_scope() { return exec_scope_; }
+  lite::Scope* scope() { return scope_.get(); }

 private:
  // Build from a program and scope.

--- a/paddle/fluid/lite/core/program_fake_utils.h
+++ b/paddle/fluid/lite/core/program_fake_utils.h
@@ -33,11 +33,11 @@ Program FakeProgram() {
    std::string w1 = "w" + std::to_string(id);
    std::string b1 = "b" + std::to_string(id);
    std::string out1 = "out" + std::to_string(id);
-    auto w1v = program.scope_->Var(w1)->GetMutable<lite::Tensor>();
-    auto b1v = program.scope_->Var(b1)->GetMutable<lite::Tensor>();
-    auto out1v = program.scope_->Var(out1)->GetMutable<lite::Tensor>();
+    auto w1v = program.scope()->Var(w1)->GetMutable<lite::Tensor>();
+    auto b1v = program.scope()->Var(b1)->GetMutable<lite::Tensor>();
+    auto out1v = program.scope()->Var(out1)->GetMutable<lite::Tensor>();

-    lite::OpDesc desc;
+    cpp::OpDesc desc;
    desc.SetInput("Input", {x});
    desc.SetInput("W", {w1});
    desc.SetInput("Bias", {b1});
@@ -46,12 +46,12 @@ Program FakeProgram() {
    desc.SetAttr("in_num_col_dims", 1);

    // add to input
-    program.tmp_vars_.push_back(w1);
-    program.tmp_vars_.push_back(b1);
+    program.mutable_tmp_vars()->push_back(w1);
+    program.mutable_tmp_vars()->push_back(b1);

    auto fc_op = LiteOpRegistry::Global().Create("fc");
-    fc_op->Attach(desc, program.scope_.get());
-    program.ops_.emplace_back(std::move(fc_op));
+    fc_op->Attach(desc, program.scope());
+    program.mutable_ops()->emplace_back(std::move(fc_op));

    w1v->Resize(DDimHvy(std::vector<int64_t>({100, 100})));
    b1v->Resize(DDimHvy(std::vector<int64_t>({100, 1})));
@@ -64,8 +64,8 @@ Program FakeProgram() {
  // out1, w2, b2 -fc-> out2

  std::string x = "x";
-  program.tmp_vars_.push_back(x);
-  auto* xv = program.scope_->Var(x)->GetMutable<lite::Tensor>();
+  program.mutable_tmp_vars()->push_back(x);
+  auto* xv = program.scope()->Var(x)->GetMutable<lite::Tensor>();
  xv->Resize(DDimHvy(std::vector<int64_t>({100, 100})));

  for (int i = 0; i < 3; i++) {

--- a/paddle/fluid/lite/gen_code/CMakeLists.txt
+++ b/paddle/fluid/lite/gen_code/CMakeLists.txt
 lite_cc_library(gen_code_lite SRCS gen_code.cc
-        DEPS program_lite op_lite scope
+        DEPS program_lite op_lite scope_lite
        cpp_op_desc_lite
        HVY_DEPS operator)
 lite_cc_library(paddle_infer_gencode SRCS paddle_infer.cc DEPS program_lite utils_lite)

-lite_cc_test(test_gen_code_lite SRCS gen_code_test.cc DEPS gen_code_lite ${tensor_lite}
+if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
+    lite_cc_test(test_gen_code_lite SRCS gen_code_test.cc DEPS gen_code_lite ${tensor_lite}
            mul_op_lite
            compatible_pb_lite
            model_parser_lite
            X86_DEPS mul_compute_x86
+            ARM_DEPS mul_compute_arm
            ARGS --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)

-if (NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    lite_cc_library(__generated_code__
        SRCS ${CMAKE_BINARY_DIR}/paddle/fluid/lite/gen_code/__generated_code__.cc
        DEPS scope_lite op_lite kernel_lite paddle_infer_gencode

--- a/paddle/fluid/lite/gen_code/gen_code_test.cc
+++ b/paddle/fluid/lite/gen_code/gen_code_test.cc
@@ -136,4 +136,10 @@ TEST(gen_code, optimized_program) {
 }  // namespace paddle

 USE_LITE_OP(mul);
+#ifdef LITE_WITH_X86
 USE_LITE_KERNEL(mul, kX86, kFloat, kNCHW, def);
+#endif
+
+#ifdef LITE_WITH_ARM
+USE_LITE_KERNEL(mul, kARM, kFloat, kNCHW, def);
+#endif
--- a/paddle/fluid/lite/kernels/host/CMakeLists.txt
+++ b/paddle/fluid/lite/kernels/host/CMakeLists.txt
@@ -12,4 +12,4 @@ set(host_kernels
    reshape_compute_host
    )

-set(host_kernels "${host_kernels}" CACHE INTERNAL "host kernels")
+set(host_kernels "${host_kernels}" CACHE INTERNAL "host kernels")  # INTERNAL (not GLOBAL) is a valid cache type and implies FORCE, so the list is refreshed on every configure.
--- a/paddle/fluid/lite/tools/build.sh
+++ b/paddle/fluid/lite/tools/build.sh
@@ -2,6 +2,7 @@
 set -ex

 TESTS_FILE="./lite_tests.txt"
+LIBS_FILE="./lite_libs.txt"

 readonly common_flags="-DWITH_LITE=ON -DLITE_WITH_LIGHT_WEIGHT_FRAMEWORK=OFF -DWITH_PYTHON=OFF -DWITH_TESTING=ON -DLITE_WITH_ARM=OFF"

@@ -42,18 +43,21 @@ function cmake_arm {
        -DARM_TARGET_OS=$1 -DARM_TARGET_ARCH_ABI=$2
 }

-# function build {
-#     file=$1
-#     for _test in $(cat $file); do
-#         make $_test -j$(expr $(nproc) - 2)
-#     done
-# }
+function build {
+    file=$1
+    for _test in $(cat $file); do
+        make $_test -j$(expr $(nproc) - 2)
+    done
+}

 # It will eagerly test all lite related unittests.
 function test_lite {
    local file=$1
    echo "file: ${file}"
+
    for _test in $(cat $file); do
+        # We move the build phase here to make the 'gen_code' test compiles after the
+        # corresponding test is executed and the C++ code generates.
        make $_test -j$(expr $(nproc) - 2)
        ctest -R $_test -V
    done
@@ -98,8 +102,10 @@ function build_test_server {
    cd ./build
    export LD_LIBRARY_PATH="$LD_LIBRARY_PATH:/paddle/build/third_party/install/mklml/lib"
    cmake_x86_for_CI
-    #build $TESTS_FILE
+    # compile the tests and execute them.
    test_lite $TESTS_FILE
+    # build the remaining libraries to check compiling error.
+    build $LIBS_FILE
 }

 # Build the code and run lite server tests. This is executed in the CI system.
@@ -129,7 +135,6 @@ function build_test_arm {
            build_dir=build.lite.${os}.${abi}
            mkdir -p $build_dir
            cd $build_dir
-
            cmake_arm ${os} ${abi}
            build $TESTS_FILE

@@ -177,10 +182,11 @@ function main {
                TESTS_FILE="${i#*=}"
                shift
                ;;
-            # build)
-            #     build $TESTS_FILE
-            #     shift
-            #     ;;
+            build)
+                build $TESTS_FILE
+                build $LIBS_FILE
+                shift
+                ;;
            cmake_x86)
                cmake_x86
                shift