add full_api_static target and fix building errors, test=develop (#2064)

* add full_api_static target and fix building errors, test=develop
* fix build errors, test=develop
* fix code style, test=develop
* fix lite/model_parser/pb/var_desc.cc, test=develop
* fix building errors, test=develop
* modify lite/tools/debug/CMakeLists.txt, test=develop

add full_api_static target and fix building errors, test=develop (#2064)
* add full_api_static target and fix building errors, test=develop
* fix build errors, test=develop
* fix code style, test=develop
* fix lite/model_parser/pb/var_desc.cc, test=develop
* fix building errors, test=develop
* modify lite/tools/debug/CMakeLists.txt, test=develop
4a948cfc · 石晓伟 · GitHub · 26925ab9 · 4a948cfc · 4a948cfc
65 changed files
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -176,6 +176,7 @@ include(generic)            # simplify cmake module
 include(ccache)             # set ccache for compilation
 include(util)               # set unittest and link libs
 include(version)            # set PADDLE_VERSION
+include(flags)
 set(CMAKE_CXX_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")
 set(CMAKE_C_FLAGS_RELWITHDEBINFO "-O3 -g -DNDEBUG")

--- a/cmake/generic.cmake
+++ b/cmake/generic.cmake
@@ -105,8 +105,8 @@ set_property(GLOBAL PROPERTY FLUID_MODULES "")
 function(find_fluid_modules TARGET_NAME)
  get_filename_component(__target_path ${TARGET_NAME} ABSOLUTE)
  string(REGEX REPLACE "^${PADDLE_SOURCE_DIR}/" "" __target_path ${__target_path})
-  string(FIND "${__target_path}" "fluid" pos)
+  string(FIND "${__target_path}" "lite" pos)
-  if(pos GREATER 1)
+  if((pos GREATER 0) OR (pos EQUAL 0))
    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
    set(fluid_modules ${fluid_modules} ${TARGET_NAME})
    set_property(GLOBAL PROPERTY FLUID_MODULES "${fluid_modules}")
@@ -369,6 +369,7 @@ function(cc_binary TARGET_NAME)
  endif()
  get_property(os_dependency_modules GLOBAL PROPERTY OS_DEPENDENCY_MODULES)
  target_link_libraries(${TARGET_NAME} ${os_dependency_modules})
+  find_fluid_modules(${TARGET_NAME})
 endfunction(cc_binary)
 function(cc_test TARGET_NAME)

--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
@@ -126,12 +126,12 @@ function(lite_cc_library TARGET)
            )
    if (args_SHARED OR ARGS_shared)
-        cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS} SHARED)
+        cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} SHARED)
    elseif (args_MODULE OR ARGS_module)
        add_library(${TARGET} MODULE ${args_SRCS})
        add_dependencies(${TARGET} ${deps} ${args_DEPS})
    else()
-        cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS})
+        cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
    endif()
    target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
@@ -163,7 +163,7 @@ function(lite_cc_binary TARGET)
            LIGHT_DEPS ${args_LIGHT_DEPS}
            HVY_DEPS ${args_HVY_DEPS}
            )
-    cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS})
+    cc_binary(${TARGET} SRCS ${args_SRCS} DEPS ${deps})
    target_compile_options(${TARGET} BEFORE PRIVATE -Wno-ignored-qualifiers)
    if (NOT APPLE)
        # strip binary target to reduce size

--- a/lite/api/CMakeLists.txt
+++ b/lite/api/CMakeLists.txt
@@ -210,6 +210,8 @@ if (NOT LITE_ON_TINY_PUBLISH)
      FPGA_DEPS ${fpga_kernels})
    # The final inference library for just MobileConfig.
    bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
+    get_property(fluid_modules GLOBAL PROPERTY FLUID_MODULES)
+    cc_library(api_full_static SRCS DEPS paddle_api_full cxx_api paddle_api light_api  ${cxx_api_deps} ${ops} ${host_kernels} ${cuda_kernels} program tensor memory naive_buffer types ${fluid_modules} protobuf)
 endif()
 bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api)
 #-----------------------------------------------------------------------------------------------------

--- a/lite/api/benchmark.cc
+++ b/lite/api/benchmark.cc
@@ -18,9 +18,6 @@
 #include <string>
 #include <vector>
 #include "lite/api/paddle_api.h"
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
-#include "lite/api/paddle_use_passes.h"
 #include "lite/api/test_helper.h"
 #include "lite/core/device_info.h"
 #include "lite/utils/cp_logging.h"

--- a/lite/api/cxx_api_impl.cc
+++ b/lite/api/cxx_api_impl.cc
@@ -15,6 +15,7 @@
 #include "lite/api/cxx_api.h"
 #include <string>
 #include "lite/api/paddle_api.h"
+#include "lite/core/device_info.h"
 #include "lite/core/version.h"
 namespace paddle {
@@ -49,6 +50,9 @@ class CxxPaddleApiImpl : public lite_api::PaddlePredictor {
 CxxPaddleApiImpl::CxxPaddleApiImpl() {}
 void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
+#ifdef LITE_WITH_CUDA
+  Env<TARGET(kCUDA)>::Init();
+#endif
  auto places = config.valid_places();
  places.emplace_back(TARGET(kHost), PRECISION(kAny), DATALAYOUT(kAny));
  raw_predictor_.Build(config, places);

--- a/lite/api/model_test.cc
+++ b/lite/api/model_test.cc
@@ -16,9 +16,6 @@
 #include <string>
 #include <vector>
 #include "lite/api/paddle_api.h"
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
-#include "lite/api/paddle_use_passes.h"
 #include "lite/api/test_helper.h"
 #include "lite/core/device_info.h"
 #include "lite/utils/cp_logging.h"

--- a/lite/backends/cuda/math/cudnn_conv.cc
+++ b/lite/backends/cuda/math/cudnn_conv.cc
@@ -286,7 +286,6 @@ bool CudnnConv2DInt8<Ptype_out>::create(const operators::ConvParam& param,
    }
  }
  this->scale_.Resize({oc});
-  auto* scale_data = this->scale_.template mutable_data<float>(TARGET(kCUDA));
  this->scale_.template Assign<float, lite::DDim, TARGET(kCUDA)>(
      weight_scale.data(), this->scale_.dims());

--- a/lite/backends/cuda/math/cudnn_conv.h
+++ b/lite/backends/cuda/math/cudnn_conv.h
@@ -32,17 +32,17 @@ class CudnnConv2DBase {
 public:
  CudnnConv2DBase()
      : handle_(NULL),
-        workspace_data_(NULL),
+        fwd_algo_((cudnnConvolutionFwdAlgo_t)0),
-        workspace_(NULL),
-        conv_desc_(NULL),
        input_desc_(NULL),
        output_desc_(NULL),
+        bias_desc_(NULL),
        filter_desc_(NULL),
+        conv_desc_(NULL),
        act_desc_(NULL),
-        bias_desc_(NULL),
+        workspace_data_(NULL),
+        workspace_(NULL),
        workspace_fwd_sizes_(0),
-        workspace_size_inbytes_(0),
+        workspace_size_inbytes_(0) {}
-        fwd_algo_((cudnnConvolutionFwdAlgo_t)0) {}
  ~CudnnConv2DBase() {
    if (conv_desc_) {
@@ -85,10 +85,10 @@ class CudnnConv2DBase {
  cudnnActivationDescriptor_t act_desc_;
  bool with_relu_act_{true};
-  size_t workspace_fwd_sizes_;
-  size_t workspace_size_inbytes_;  // size of underlying storage
  void* workspace_data_;  // underlying storage
  void* workspace_;       // aliases into _workspaceData
+  size_t workspace_fwd_sizes_;
+  size_t workspace_size_inbytes_;  // size of underlying storage
  const bool use_tensor_core_ = true;
  const size_t workspace_limit_bytes_ = 4 * 1024 * 1024;
@@ -104,6 +104,7 @@ template <PrecisionType Ptype_out>
 class CudnnConv2D : public CudnnConv2DBase<Ptype_out> {
 public:
  CudnnConv2D() : CudnnConv2DBase<Ptype_out>() {}
+  virtual ~CudnnConv2D() = default;
  virtual bool init(const operators::ConvParam& param,
                    Context<TARGET(kCUDA)>* ctx);
@@ -117,6 +118,7 @@ template <PrecisionType Ptype_out>
 class CudnnConv2DInt8 : CudnnConv2DBase<Ptype_out> {
 public:
  CudnnConv2DInt8() : CudnnConv2DBase<Ptype_out>() {}
+  virtual ~CudnnConv2DInt8() = default;
  virtual bool init(const operators::ConvParam& param,
                    Context<TARGET(kCUDA)>* ctx);

--- a/lite/backends/x86/dynamic_loader.cc
+++ b/lite/backends/x86/dynamic_loader.cc
@@ -153,7 +153,8 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
      dso_handle = GetDsoHandleFromDefaultPath(dlPath, dynload_flags);
    }
  }
-  auto error_msg =
+/*
+auto error_msg =
    "Failed to find dynamic library: %s ( %s ) \n Please specify "
    "its path correctly using following ways: \n Method. set "
    "environment variable LD_LIBRARY_PATH on Linux or "
@@ -161,8 +162,9 @@ static inline void* GetDsoHandleFromSearchPath(const std::string& search_root,
    "export LD_LIBRARY_PATH=... \n Note: After Mac OS 10.11, "
    "using the DYLD_LIBRARY_PATH is impossible unless System "
    "Integrity Protection (SIP) is disabled.";
+*/
 #if !defined(_WIN32)
-  auto errorno = dlerror();
+// auto errorno = dlerror();
 #else
  auto errorno = GetLastError();
 #endif  // !_WIN32

--- a/lite/backends/x86/math/beam_search.cc
+++ b/lite/backends/x86/math/beam_search.cc
@@ -49,6 +49,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
                                        end_id,
                                        is_accumulated);
    auto selected_items = ToMap(items, high_level.back());
+    /*
    if (FLAGS_v == 3) {
      VLOG(3) << "selected_items:";
      for (size_t i = 0; i < selected_items.size(); ++i) {
@@ -58,6 +59,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
        }
      }
    }
+    */
    PruneEndBeams(pre_ids, abs_lod, &selected_items, level, end_id);
    // calculate the output tensor's height
@@ -69,7 +71,8 @@ class BeamSearchFunctor<TARGET(kX86), T> {
    // the output tensor shape should be [num_instances, 1]
    // auto dims = framework::make_ddim(
    //     std::vector<int64_t>({static_cast<int>(num_instances), 1}));
-    lite::DDim dims(std::vector<int64_t>({num_instances, 1L}));
+    lite::DDim dims(
+        std::vector<int64_t>({static_cast<int>(num_instances), 1L}));
    selected_ids->Resize(dims);
    auto *selected_ids_data = selected_ids->mutable_data<int64_t>(TARGET(kX86));
@@ -296,7 +299,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
      result.emplace_back(top_beam);
    }
+    /*
    if (FLAGS_v == 3) {
      VLOG(3) << "SelectTopBeamSizeItems result size " << result.size();
      for (auto &items : result) {
@@ -306,7 +309,7 @@ class BeamSearchFunctor<TARGET(kX86), T> {
        }
      }
    }
+    */
    return result;
  }
 };

--- a/lite/backends/x86/math/detail/activation_functions.h
+++ b/lite/backends/x86/math/detail/activation_functions.h
@@ -48,6 +48,7 @@ inline ActivationType GetActivationType(const std::string &type) {
  LOG(ERROR) << "Not support type " << type;
  // PADDLE_ENFORCE(false, "Not support type %s", type);
  // PADDLE_THROW("Not support type %s.", type);
+  return ActivationType();
 }
 namespace forward {

--- a/lite/backends/x86/math/tree2col.cc
+++ b/lite/backends/x86/math/tree2col.cc
@@ -107,7 +107,8 @@ class Tree2ColFunctor<lite::TargetType::kX86, T> {
    //    patch->mutable_data<T>({static_cast<int64_t>(patch_size),
    //                            static_cast<int64_t>(patch_elem_size)},
    //                           cpu_place);
-    patch->Resize({static_cast<int64_t>(patch_size, patch_elem_size)});
+    patch->Resize({static_cast<int64_t>(patch_size),
+                   static_cast<int64_t>(patch_elem_size)});
    auto *patch_data = patch->mutable_data<T>(lite::TargetType::kX86);
    constant(context, patch, 0);
    const T *features = node_features.data<T>();

--- a/lite/core/arena/framework.h
+++ b/lite/core/arena/framework.h
@@ -42,6 +42,7 @@ class TestCase {
      : place_(place), scope_(new Scope), alias_(alias) {
    ctx_ = ContextScheduler::Global().NewContext(place_.target);
  }
+  virtual ~TestCase() {}
  void Prepare() {
    PrepareScopes();
@@ -138,20 +139,18 @@ class TestCase {
  }
 private:
+  Place place_;
  std::shared_ptr<Scope> scope_;
+  std::string alias_;
  // The workspace for the Instruction.
  Scope* inst_scope_{};
  // The workspace for the baseline implementation.
  Scope* base_scope_{};
  std::unique_ptr<cpp::OpDesc> op_desc_;
  std::unique_ptr<Instruction> instruction_;
-  Place place_;
-  std::string alias_;
 };
 class Arena {
-  float abs_error_{};
 public:
  Arena(std::unique_ptr<TestCase>&& tester,
        const Place& place,
@@ -203,12 +202,14 @@ class Arena {
      default:
        LOG(FATAL) << "not support type " << PrecisionToStr(type->precision());
+        return false;
    }
  }
 private:
  std::unique_ptr<TestCase> tester_;
  Place place_;
+  float abs_error_;
 };
 template <typename T>

--- a/lite/core/context.h
+++ b/lite/core/context.h
@@ -192,10 +192,10 @@ class Context<TargetType::kCUDA> {
    ctx->cublas_fp32_ = cublas_fp32_;
  }
-  const cudaStream_t exec_stream() { return exec_stream_; }
+  const cudaStream_t& exec_stream() const { return exec_stream_; }
  void SetExecStream(cudaStream_t stream) { exec_stream_ = stream; }
-  const cudaStream_t io_stream() { return io_stream_; }
+  const cudaStream_t& io_stream() const { return io_stream_; }
  void SetIoStream(cudaStream_t stream) { io_stream_ = stream; }
  std::shared_ptr<cuda::Blas<float>> cublas_fp32() { return cublas_fp32_; }

--- a/lite/core/device_info.h
+++ b/lite/core/device_info.h
@@ -167,7 +167,7 @@ class Device<TARGET(kCUDA)> {
  int id() { return idx_; }
  int max_stream() { return max_stream_; }
-  int SetId(int idx) { idx_ = idx; }
+  void SetId(int idx) { idx_ = idx; }
  std::string name() { return device_prop_.name; }
  int core_num() { return device_prop_.multiProcessorCount; }
  float max_memory() { return device_prop_.totalGlobalMem / 1048576.; }
@@ -186,8 +186,8 @@ class Device<TARGET(kCUDA)> {
  void GetInfo();
 private:
-  int max_stream_;
  int idx_{0};
+  int max_stream_;
  cudaDeviceProp device_prop_;
  std::string device_name_;
  float max_memory_;

--- a/lite/core/mir/pass_utils.cc
+++ b/lite/core/mir/pass_utils.cc
@@ -46,7 +46,7 @@ std::set<T> Types<T>::ValidSet(const T& element) const {
  return std::set<T>({element});
 }
-bool ExpandPlaces(std::set<Place>* places, const Place& place) {
+void ExpandPlaces(std::set<Place>* places, const Place& place) {
  static const Types<TargetType> target_set({TARGET(kHost),
                                             TARGET(kX86),
                                             TARGET(kCUDA),

--- a/lite/core/mir/subgraph/subgraph_program_pass.cc
+++ b/lite/core/mir/subgraph/subgraph_program_pass.cc
@@ -278,19 +278,21 @@ int SubgraphProgramPass::FuseSubgraphID(
    const std::unique_ptr<SSAGraph>& graph) {
  int sub_id = 1;  // id start from 1 not 0
  for (auto& item : graph->StmtTopologicalOrder()) {
-    bool inputvar = 0;
+    // bool inputvar = false;
    if (!item->IsStmt()) continue;
    auto& stmt = item->AsStmt();
+    /*
    if (stmt.subgraph_id() == -1) {
      for (auto& i : item->outlinks) {
        for (auto& j : i->outlinks) {
          if (j->IsStmt()) {
            auto& jstmt = j->AsStmt();
-            if (jstmt.subgraph_id() == 0) inputvar = 1;
+            if (jstmt.subgraph_id() == 0) inputvar = true;
          }
        }
      }
    }
+    */
    if (stmt.subgraph_id() != 0) continue;
    ChangeAllOutConnectedID(item, sub_id);
    sub_id++;

--- a/lite/core/mir/subgraph/subgraph_program_pass_test.cc
+++ b/lite/core/mir/subgraph/subgraph_program_pass_test.cc
@@ -214,7 +214,6 @@ TEST(SubGraphTest, SimpleNet) {
  auto* pass = new mir::subgraph::SubgraphProgramPass;
  ASSERT_EQ(pass->FuseSubgraph(graph, supported_op_types), 1);
-  const int num_nodes = graph->nodes().size();
  ASSERT_EQ(graph->nodes().size(), 9);
  // LOG(INFO) << "After NPU Pass \n" << Visualize(graph.get());
 }

--- a/lite/core/mir/type_layout_cast_pass.cc
+++ b/lite/core/mir/type_layout_cast_pass.cc
@@ -110,7 +110,6 @@ void TypeLayoutTransformPass::AddLayoutInst(
  bool is_found = false;
  for (auto& kernel : kernels) {
    const Type* in_arg_ty = kernel->GetInputDeclType("Input");
-    const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
    if (TypeCompatible(*in_arg_ty, from)) {
      is_found = true;
      selected_kernels.emplace_back(std::move(kernel));

--- a/lite/core/mir/type_target_cast_pass.cc
+++ b/lite/core/mir/type_target_cast_pass.cc
@@ -114,7 +114,6 @@ void TypeTargetTransformPass::AddIoCopyInst(
  std::vector<std::unique_ptr<KernelBase>> selected_kernels;
  for (auto& kernel : kernels) {
    const Type* in_arg_ty = kernel->GetInputDeclType("Input");
-    const Type* out_arg_ty = kernel->GetOutputDeclType("Out");
    if (TypeCompatible(*in_arg_ty, from)) {
      is_found = true;
      selected_kernels.emplace_back(std::move(kernel));

--- a/lite/core/op_registry.h
+++ b/lite/core/op_registry.h
@@ -209,7 +209,7 @@ class KernelRegistry final {
    ss << "Count of kernel kinds: ";
    int count = 0;
    for (auto &item : kernel_info_map_) {
-      for (auto &kernel : item.second) ++count;
+      count += item.second.size();
    }
    ss << count << "\n";

--- a/lite/fluid/data_type.cc
+++ b/lite/fluid/data_type.cc
@@ -68,6 +68,7 @@ framework::proto::VarType::Type ToDataType(std::type_index type) {
    return it->second;
  }
  PADDLE_THROW("Not support %s as tensor type", type.name());
+  return static_cast<framework::proto::VarType::Type>(-1);
 }
 std::type_index ToTypeIndex(framework::proto::VarType::Type type) {
@@ -77,6 +78,7 @@ std::type_index ToTypeIndex(framework::proto::VarType::Type type) {
  }
  PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type",
               static_cast<int>(type));
+  return std::type_index(typeid(void));
 }
 std::string DataTypeToString(const framework::proto::VarType::Type type) {
@@ -86,6 +88,7 @@ std::string DataTypeToString(const framework::proto::VarType::Type type) {
  }
  PADDLE_THROW("Not support framework::proto::VarType::Type(%d) as tensor type",
               static_cast<int>(type));
+  return std::string();
 }
 size_t SizeOfType(framework::proto::VarType::Type type) {
@@ -93,7 +96,8 @@ size_t SizeOfType(framework::proto::VarType::Type type) {
  if (it != gDataTypeMap().proto_to_size_.end()) {
    return it->second;
  }
-  PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type));
+  PADDLE_THROW("Not support %s as tensor type", DataTypeToString(type).c_str());
+  return 0;
 }
 }  // namespace fluid

--- a/lite/fluid/selected_rows.cc
+++ b/lite/fluid/selected_rows.cc
@@ -163,7 +163,7 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key,
  if (iter == id_to_index_.end()) {
    rwlock_->UNLock();
    if (!auto_grown) {
-      PADDLE_THROW("key %d not found", key);
+      PADDLE_THROW("key %ld not found", key);
    }
    rwlock_->WRLock();
    auto map_size = id_to_index_.size();
@@ -171,7 +171,7 @@ int64_t SelectedRows::AutoGrownIndex(int64_t key,
    if (map_size != vector_size) {
      rwlock_->UNLock();
      PADDLE_THROW(
-          "id_to_index_ size %d should have the same size with rows_ %d",
+          "id_to_index_ size %lu should have the same size with rows_ %lu",
          map_size,
          vector_size);
    }

--- a/lite/fluid/selected_rows.h
+++ b/lite/fluid/selected_rows.h
@@ -82,7 +82,7 @@ class SelectedRows {
  int64_t Index(int64_t key) const {
    auto it = std::find(rows_.begin(), rows_.end(), key);
    if (it == rows_.end()) {
-      PADDLE_THROW("id %s not in table", key);
+      PADDLE_THROW("id %ld not in table", key);
    }
    return static_cast<int64_t>(std::distance(rows_.begin(), it));
  }

--- a/lite/kernels/cuda/calib_compute_cuda_test.cc
+++ b/lite/kernels/cuda/calib_compute_cuda_test.cc
@@ -75,7 +75,6 @@ TEST(calib_cuda, int8_to_fp32) {
  output.Resize({n, c, h, w});
  output_cpu.Resize({n, c, h, w});
  // initialize the data of input tensors
-  auto* x_data = x.mutable_data<int8_t>(TARGET(kCUDA));
  auto* x_cpu_data = x_cpu.mutable_data<int8_t>();
  for (int i = 0; i < x.dims().production(); i++) {
    float sign = i % 3 == 0 ? -1.0f : 1.0f;
@@ -131,7 +130,6 @@ TEST(calib_cuda, fp32_to_int8) {
  output.Resize({n, c, h, w});
  output_cpu.Resize({n, c, h, w});
  // initialize the data of input tensors
-  auto* x_data = x.mutable_data<float>(TARGET(kCUDA));
  auto* x_cpu_data = x_cpu.mutable_data<float>();
  for (int i = 0; i < x.dims().production(); i++) {
    float sign = i % 3 == 0 ? -1.0f : 1.0f;

--- a/lite/kernels/cuda/conv_compute_test.cc
+++ b/lite/kernels/cuda/conv_compute_test.cc
@@ -53,7 +53,6 @@ TEST(conv_compute, fp32) {
  y_cpu.Resize({n, c_o, h_o, w_o});
  bias_cpu.Resize({c_o});
-  auto* x_data = x.mutable_data<float>(TARGET(kCUDA));
  auto* y_data = y.mutable_data<float>(TARGET(kCUDA));
  float* x_cpu_data = x_cpu.mutable_data<float>();
  float* filter_cpu_data = filter_cpu.mutable_data<float>();
@@ -127,7 +126,6 @@ TEST(conv_compute, int8) {
  y_cpu.Resize({1, 1, 1, c});
  bias_cpu.Resize({c});
-  auto* x_data = x.mutable_data<int8_t>(TARGET(kCUDA));
  auto* y_data = y.mutable_data<float>(TARGET(kCUDA));
  auto* x_cpu_data = x_cpu.mutable_data<int8_t>();
  auto* filter_cpu_data = filter_cpu.mutable_data<int8_t>();
@@ -194,7 +192,6 @@ TEST(conv_compute, int8_int8_out) {
  y_cpu.Resize({1, 1, 1, c});
  bias_cpu.Resize({c});
-  auto* x_data = x.mutable_data<int8_t>(TARGET(kCUDA));
  auto* y_data = y.mutable_data<int8_t>(TARGET(kCUDA));
  auto* x_cpu_data = x_cpu.mutable_data<int8_t>();
  auto* filter_cpu_data = filter_cpu.mutable_data<int8_t>();

--- a/lite/kernels/cuda/elementwise_add_compute_test.cc
+++ b/lite/kernels/cuda/elementwise_add_compute_test.cc
@@ -56,8 +56,6 @@ TEST(elementwise_add, normal) {
  y_ref.Resize({n, c, h, w});
  out_ref.Resize({n, c, h, w});
-  auto* x_data = x.mutable_data<float>(TARGET(kCUDA));
-  auto* y_data = y.mutable_data<float>(TARGET(kCUDA));
  auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
  auto* x_cpu_data = x_cpu.mutable_data<float>();

--- a/lite/kernels/cuda/leaky_relu_compute_test.cc
+++ b/lite/kernels/cuda/leaky_relu_compute_test.cc
@@ -35,7 +35,6 @@ TEST(leaky_relu, normal) {
  x_cpu.Resize({h, w});
  y_cpu.Resize({h, w});
-  auto* x_data = x.mutable_data<float>(TARGET(kCUDA));
  auto* y_data = y.mutable_data<float>(TARGET(kCUDA));
  float* x_cpu_data = x_cpu.mutable_data<float>();
  float* y_cpu_data = x_cpu.mutable_data<float>();

--- a/lite/kernels/cuda/nearest_interp_compute_test.cc
+++ b/lite/kernels/cuda/nearest_interp_compute_test.cc
@@ -80,7 +80,6 @@ TEST(nearest_interp, normal) {
  Tensor x_ref, osz_ref, out_ref;
  int n = 1, c = 3, in_h = 40, in_w = 40;
-  int in_chw = c * in_h * in_w;
  int out_h = 80, out_w = 80;
  float scale = 2.0;
@@ -101,8 +100,6 @@ TEST(nearest_interp, normal) {
  osz_ref.Resize({2});
  out_ref.Resize({n, c, out_h, out_w});
-  auto* x_data = x.mutable_data<float>(TARGET(kCUDA));
-  auto* osz_data = osz.mutable_data<float>(TARGET(kCUDA));
  auto* out_data = out.mutable_data<float>(TARGET(kCUDA));
  float* x_cpu_data = x_cpu.mutable_data<float>();

--- a/lite/kernels/cuda/transpose_compute_test.cc
+++ b/lite/kernels/cuda/transpose_compute_test.cc
@@ -41,7 +41,6 @@ void nchw2nhwc_ref(lite::Tensor* input,
  int input_c = input->dims()[1];
  int input_h = input->dims()[2];
  int input_w = input->dims()[3];
-  int output_n = output->dims()[0];
  int output_c = output->dims()[1];
  int output_h = output->dims()[2];
  int output_w = output->dims()[3];
@@ -75,7 +74,6 @@ void nhwc2nchw_ref(lite::Tensor* input,
  int input_h = input->dims()[1];
  int input_w = input->dims()[2];
  int input_c = input->dims()[3];
-  int output_n = output->dims()[0];
  int output_h = output->dims()[1];
  int output_w = output->dims()[2];
  int output_c = output->dims()[3];
@@ -145,7 +143,6 @@ TEST(transpose_nchw, normal) {
  x_ref.Resize({N, C, H, W});
  out_ref.Resize({N, H, W, C});
-  auto* x_data = x.mutable_data<float>(TARGET(kCUDA));
  auto* x_cpu_data = x_cpu.mutable_data<float>();
  auto* out_cpu_data = out_cpu.mutable_data<float>();
  auto* x_ref_data = x_ref.mutable_data<float>();
@@ -200,7 +197,6 @@ TEST(transpose_nhwc, normal) {
  x_ref.Resize({N, H, W, C});
  out_ref.Resize({N, C, H, W});
-  auto* x_data = x.mutable_data<float>(TARGET(kCUDA));
  auto* x_cpu_data = x_cpu.mutable_data<float>();
  auto* out_cpu_data = out_cpu.mutable_data<float>();
  auto* x_ref_data = x_ref.mutable_data<float>();
@@ -253,7 +249,6 @@ TEST(transpose, normal) {
  x_ref.Resize({C, H, W});
  out_ref.Resize({W, C, H});
-  auto* x_data = x.mutable_data<float>(TARGET(kCUDA));
  auto* x_cpu_data = x_cpu.mutable_data<float>();
  auto* out_cpu_data = out_cpu.mutable_data<float>();
  auto* x_ref_data = x_ref.mutable_data<float>();

--- a/lite/kernels/cuda/yolo_box_compute_test.cc
+++ b/lite/kernels/cuda/yolo_box_compute_test.cc
@@ -180,8 +180,6 @@ TEST(yolo_box, normal) {
  boxes_ref.Resize({n, m, 4});
  scores_ref.Resize({n, cls, m});
-  auto* x_data = x.mutable_data<float>(TARGET(kCUDA));
-  auto* sz_data = sz.mutable_data<float>(TARGET(kCUDA));
  auto* boxes_data = boxes.mutable_data<float>(TARGET(kCUDA));
  auto* scores_data = scores.mutable_data<float>(TARGET(kCUDA));

--- a/lite/kernels/x86/activation_compute.h
+++ b/lite/kernels/x86/activation_compute.h
@@ -64,6 +64,7 @@ bool Activate(const lite::Tensor* X, lite::Tensor* Out) {
  auto x = lite::fluid::EigenVector<T>::Flatten(*X);
  auto out = lite::fluid::EigenVector<T>::Flatten(*Out);
  Functor()(place, x, out);
+  return true;
 }
 // square(x) = x^2

--- a/lite/model_parser/naive_buffer/naive_buffer_wrapper_test.cc
+++ b/lite/model_parser/naive_buffer/naive_buffer_wrapper_test.cc
@@ -293,7 +293,7 @@ TEST(NaiveBufferWrapper, ProgramDesc) {
  // Set ProgramDesc
  nb_desc0.SetVersion(1);
  for (int i = 0; i < 3; ++i) {
-    auto* item = nb_desc0.AddBlock<proto::BlockDesc>();
+    nb_desc0.AddBlock<proto::BlockDesc>();
  }
  // Save model

--- a/lite/model_parser/naive_buffer/op_desc.h
+++ b/lite/model_parser/naive_buffer/op_desc.h
@@ -130,6 +130,7 @@ class OpDesc : public OpDescAPI {
      DEF_ONE(LONGS);
      default:
        LOG(FATAL) << "Unknown attribute type";
+        return static_cast<AttrType>(-1);
    }
 #undef DEF_ONE
  }

--- a/lite/model_parser/naive_buffer/param_desc.cc
+++ b/lite/model_parser/naive_buffer/param_desc.cc
@@ -97,6 +97,7 @@ VarDescAPI::VarDataType ParamDesc::GetDataType() const {
    default:
      LOG(FATAL) << "Unknown var data type";
  }
+  return VarDescAPI::VarDataType();
 #undef GET_DATA_TYPE_CASE_ITEM
 }

--- a/lite/model_parser/naive_buffer/var_desc.cc
+++ b/lite/model_parser/naive_buffer/var_desc.cc
@@ -51,6 +51,7 @@ VarDescAPI::Type VarDesc::GetType() const {
    GET_TYPE_CASE_ITEM(READER);
    default:
      LOG(FATAL) << "Unknown var type";
+      return VarDescAPI::Type();
  }
 #undef GET_TYPE_CASE_ITEM
 }

--- a/lite/model_parser/pb/op_desc.h
+++ b/lite/model_parser/pb/op_desc.h
@@ -121,6 +121,7 @@ class OpDesc : public OpDescAPI {
      DEF_ONE(LONGS);
      default:
        LOG(FATAL) << "Unknown attribute type";
+        return static_cast<AttrType>(-1);
    }
 #undef DEF_ONE
  }

--- a/lite/model_parser/pb/var_desc.cc
+++ b/lite/model_parser/pb/var_desc.cc
@@ -39,6 +39,7 @@ VarDescAPI::Type VarDesc::GetType() const {
    GET_TYPE_CASE_ITEM(READER);
    default:
      LOG(FATAL) << "Unknown var type";
+      return VarDescAPI::Type();
  }
 #undef GET_TYPE_CASE_ITEM
 }

--- a/lite/operators/gru_unit_op.cc
+++ b/lite/operators/gru_unit_op.cc
@@ -32,7 +32,6 @@ bool GRUUnitOpLite::CheckShape() const {
  auto hidden_prev_dims = param_.hidden_prev->dims();
  auto weight_dims = param_.weight->dims();
-  int batch_size = input_dims[0];
  int input_size = input_dims[1];
  int frame_size = hidden_prev_dims[1];
  int weight_height = weight_dims[0];

--- a/lite/operators/im2sequence_op.cc
+++ b/lite/operators/im2sequence_op.cc
@@ -29,7 +29,6 @@ bool Im2SequenceOp::CheckShape() const { return true; }
 bool Im2SequenceOp::InferShape() const {
  CHECK_OR_FALSE(param_.Out);
  // TODO(Superjomn) Enable data sharing.
-  auto inputs = param_.X;
  auto input_dims = param_.X->dims();
  int img_num = input_dims[0];
  int img_channels = input_dims[1];

--- a/lite/operators/is_empty_op.cc
+++ b/lite/operators/is_empty_op.cc
@@ -21,7 +21,7 @@ namespace operators {
 bool IsEmptyOp::CheckShape() const { return true; }
-bool IsEmptyOp::InferShape() const {}
+bool IsEmptyOp::InferShape() const { return true; }
 bool IsEmptyOp::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
  param_.X =

--- a/lite/operators/range_op.cc
+++ b/lite/operators/range_op.cc
@@ -33,7 +33,7 @@ template <typename T>
 void GetSize(T start, T end, T step, int64_t* size) {
  CHECK(!std::equal_to<T>()(step, 0))
      << "The step of range op should not be 0.";
-  CHECK(((start < end) && (step > 0)) || (start > end) && (step < 0))
+  CHECK(((start < end) && (step > 0)) || ((start > end) && (step < 0)))
      << "The step should be greater than 0 while start < end. And the "
         "step should be less than 0 while start > end.";
  *size = std::is_integral<T>::value

--- a/lite/operators/yolo_box_op.cc
+++ b/lite/operators/yolo_box_op.cc
@@ -48,7 +48,6 @@ bool YoloBoxOp::CheckShape() const {
 bool YoloBoxOp::InferShape() const {
  auto* X = param_.X;
-  auto* ImgSize = param_.ImgSize;
  auto anchors = param_.anchors;
  int anchor_num = anchors.size() / 2;
  auto class_num = param_.class_num;

--- a/lite/tests/kernels/affine_channel_compute_test.cc
+++ b/lite/tests/kernels/affine_channel_compute_test.cc
@@ -64,8 +64,6 @@ class AffineChannelComputeTester : public arena::TestCase {
    if (data_layout_ == "NCHW") {
      int channel = x_dims_[1];
-      int height = x_dims_[2];
-      int width = x_dims_[3];
      int size = x_dims_[2] * x_dims_[3];
      int in_channel = channel * size;
      for (int n = 0; n < num; n++) {

--- a/lite/tests/kernels/box_coder_compute_test.cc
+++ b/lite/tests/kernels/box_coder_compute_test.cc
@@ -121,16 +121,10 @@ class BoxCoderComputeTester : public arena::TestCase {
    auto* output_box = scope->NewTensor(output_box_);
    CHECK(output_box);
    output_box->Resize(target_box_dims_);
-    auto* output_box_data = output_box->mutable_data<float>();
    auto* prior_box = scope->FindTensor(prior_box_);
-    const auto* prior_box_data = prior_box->data<float>();
    auto* prior_box_var = scope->FindTensor(prior_box_var_);
-    const auto* prior_box_var_data = prior_box_var->data<float>();
    auto* target_box = scope->FindTensor(target_box_);
-    const auto* target_box_data = target_box->data<float>();
    box_coder_ref(output_box,
                  prior_box,

--- a/lite/tests/kernels/cast_compute_test.cc
+++ b/lite/tests/kernels/cast_compute_test.cc
@@ -45,10 +45,6 @@ class CastComputeTester : public arena::TestCase {
      auto* output_data = out->mutable_data<float>();
      auto* x = scope->FindTensor(input_);
      auto* x_data = x->data<char>();
-      int num = x_dims_[0];
-      int channel = x_dims_[1];
-      int size = x_dims_[2] * x_dims_[3];
-      int in_channel = channel * size;
      auto* output_data_tmp = output_data;
      auto* x_data_tmp = x_data;
      for (int i = 0; i < x_dims_.production(); i++) {
@@ -60,10 +56,6 @@ class CastComputeTester : public arena::TestCase {
      auto* output_data = out->mutable_data<float>();
      auto* x = scope->FindTensor(input_);
      auto* x_data = x->data<int32_t>();
-      int num = x_dims_[0];
-      int channel = x_dims_[1];
-      int size = x_dims_[2] * x_dims_[3];
-      int in_channel = channel * size;
      auto* output_data_tmp = output_data;
      auto* x_data_tmp = x_data;
      for (int i = 0; i < x_dims_.production(); i++) {

--- a/lite/tests/kernels/conv2d_transpose_compute_test.cc
+++ b/lite/tests/kernels/conv2d_transpose_compute_test.cc
@@ -190,7 +190,6 @@ bool deconv_basic(const Dtype1* din,
  auto* workspace_ptr = workspace_tensor.mutable_data<Dtype2>();
  int group_size_in = win * hin * chin / group;
-  int group_size_out = wout * hout * chout / group;
  int group_size_coldata = m * n;
  int group_size_weights = chin * chout * kernel_w * kernel_h / (group * group);
  bool flag_1x1s1p1 = (kernel_w == 1) && (kernel_h == 1) && (stride_h == 1) &&

--- a/lite/tests/kernels/elementwise_compute_test.cc
+++ b/lite/tests/kernels/elementwise_compute_test.cc
@@ -43,7 +43,6 @@ class ElementwiseComputeTester : public arena::TestCase {
    auto* x = scope->FindTensor(inputx_);
    const auto* x_data = x->data<float>();
-    auto* y = scope->FindTensor(inputy_);
    const auto* y_data = x->data<float>();
    for (int i = 0; i < dims_.production(); i++) {
@@ -94,7 +93,6 @@ class ElementwiseSubComputeTester : public arena::TestCase {
    auto* x = scope->FindTensor(inputx_);
    const auto* x_data = x->data<float>();
-    auto* y = scope->FindTensor(inputy_);
    const auto* y_data = x->data<float>();
    for (int i = 0; i < dims_.production(); i++) {
@@ -145,7 +143,6 @@ class ElementwiseMulComputeTester : public arena::TestCase {
    auto* x = scope->FindTensor(inputx_);
    const auto* x_data = x->data<float>();
-    auto* y = scope->FindTensor(inputy_);
    const auto* y_data = x->data<float>();
    for (int i = 0; i < dims_.production(); i++) {
@@ -196,7 +193,6 @@ class ElementwiseMaxComputeTester : public arena::TestCase {
    auto* x = scope->FindTensor(inputx_);
    const auto* x_data = x->data<float>();
-    auto* y = scope->FindTensor(inputy_);
    const auto* y_data = x->data<float>();
    for (int i = 0; i < dims_.production(); i++) {
@@ -249,7 +245,6 @@ class FusionElementwiseAddActivationComputeTester : public arena::TestCase {
    auto* x = scope->FindTensor(inputx_);
    const auto* x_data = x->data<float>();
-    auto* y = scope->FindTensor(inputy_);
    const auto* y_data = x->data<float>();
    for (int i = 0; i < dims_.production(); i++) {
@@ -308,7 +303,6 @@ class FusionElementwiseSubActivationComputeTester : public arena::TestCase {
    auto* x = scope->FindTensor(inputx_);
    const auto* x_data = x->data<float>();
-    auto* y = scope->FindTensor(inputy_);
    const auto* y_data = x->data<float>();
    for (int i = 0; i < dims_.production(); i++) {
@@ -367,7 +361,6 @@ class FusionElementwiseMulActivationComputeTester : public arena::TestCase {
    auto* x = scope->FindTensor(inputx_);
    const auto* x_data = x->data<float>();
-    auto* y = scope->FindTensor(inputy_);
    const auto* y_data = x->data<float>();
    for (int i = 0; i < dims_.production(); i++) {
@@ -426,7 +419,6 @@ class FusionElementwiseMaxActivationComputeTester : public arena::TestCase {
    auto* x = scope->FindTensor(inputx_);
    const auto* x_data = x->data<float>();
-    auto* y = scope->FindTensor(inputy_);
    const auto* y_data = x->data<float>();
    for (int i = 0; i < dims_.production(); i++) {

--- a/lite/tests/kernels/fc_compute_test.cc
+++ b/lite/tests/kernels/fc_compute_test.cc
@@ -51,10 +51,10 @@ class FcOPTest : public arena::TestCase {
  std::string weight_ = "w";
  std::string bias_ = "b";
  std::string out_ = "out";
-  int in_num_col_dims_{1};
  DDim dims_{{1, 128}};
  DDim wdims_{{128, 4}};
  DDim bdims_{{4}};
+  int in_num_col_dims_{1};
 public:
  FcOPTest(const Place& place,

--- a/lite/tests/kernels/gru_unit_test.cc
+++ b/lite/tests/kernels/gru_unit_test.cc
@@ -243,11 +243,11 @@ class GRUUnitTester : public arena::TestCase {
  std::string reset_hidden_prev_ = "reset_hidden_prev";
  std::string hidden_ = "hidden";
-  DDim dims_{{16, 256 * 3}};
  // 0: identity; 1: sigmoid; 2: tanh; 3: relu
  int gate_activation_{1};
  int activation_{2};
  bool origin_mode_{false};
+  DDim dims_{{16, 256 * 3}};
 public:
  GRUUnitTester(const Place& place,

--- a/lite/tests/kernels/lrn_compute_test.cc
+++ b/lite/tests/kernels/lrn_compute_test.cc
@@ -123,7 +123,6 @@ class LrnComputeTester : public arena::TestCase {
    int H = dims_[2];
    int W = dims_[3];
-    int pre_pad = (local_size_ - 1) / 2;
    int offset_num = 0;
    int offset_within_channel = 0;
    int dst_id;

--- a/lite/tests/kernels/matmul_compute_test.cc
+++ b/lite/tests/kernels/matmul_compute_test.cc
@@ -120,12 +120,12 @@ class MatMulComputeTester : public arena::TestCase {
  // common attributes for this op.
  std::string x_ = "X";
  std::string y_ = "Y";
-  std::string out_ = "Out";
-  DDim x_dims_;
-  DDim y_dims_;
  bool x_transpose_;
  bool y_transpose_;
  float alpha_;
+  std::string out_ = "Out";
+  DDim x_dims_;
+  DDim y_dims_;
 public:
  MatMulComputeTester(const Place& place,

--- a/lite/tests/kernels/pad2d_compute_test.cc
+++ b/lite/tests/kernels/pad2d_compute_test.cc
@@ -26,8 +26,8 @@ class Pad2dComputeTester : public arena::TestCase {
  std::string input_ = "X";
  std::string output_ = "Out";
  DDim dims_{{1, 1, 14, 14}};
-  std::vector<int> paddings_;
  std::string mode_{"constant"};
+  std::vector<int> paddings_;
  float pad_value_ = 0.f;
  std::string data_format_{"NCHW"};

--- a/lite/tests/kernels/prior_box_compute_test.cc
+++ b/lite/tests/kernels/prior_box_compute_test.cc
@@ -125,7 +125,6 @@ void prior_box_compute_ref(const lite::Tensor* input,
      if (fixed_size_.size() > 0) {
        for (int s = 0; s < fixed_size_.size(); ++s) {
          int fixed_size = fixed_size_[s];
-          int com_idx = 0;
          box_width = fixed_size;
          box_height = fixed_size;

--- a/lite/tests/kernels/reduce_max_compute_test.cc
+++ b/lite/tests/kernels/reduce_max_compute_test.cc
@@ -28,7 +28,7 @@ void reduce_n(const float* src,
              int width_in) {
  int hw_size = height_in * width_in;
  int chw_size = channel_in * hw_size;
-  int data_index, src_index, src_index0;
+  int data_index, src_index;
  for (int c = 0; c < channel_in; ++c) {
    for (int h = 0; h < height_in; ++h) {
      for (int w = 0; w < width_in; ++w) {
@@ -196,9 +196,9 @@ class ReduceMaxComputeTester : public arena::TestCase {
  std::string input_ = "x";
  std::string output_ = "out";
  std::vector<int> dim_{0};
-  DDim x_dims_{{3, 2, 3, 4}};
  bool keep_dim_ = false;
  bool reduce_all_ = false;
+  DDim x_dims_{{3, 2, 3, 4}};
 public:
  ReduceMaxComputeTester(const Place& place,

--- a/lite/tests/kernels/reduce_mean_compute_test.cc
+++ b/lite/tests/kernels/reduce_mean_compute_test.cc
@@ -28,7 +28,7 @@ void reduce_mean_n(const float* src,
                   int width_in) {
  int hw_size = height_in * width_in;
  int chw_size = channel_in * hw_size;
-  int data_index, src_index, src_index0;
+  int data_index, src_index;
  for (int c = 0; c < channel_in; ++c) {
    for (int h = 0; h < height_in; ++h) {
      for (int w = 0; w < width_in; ++w) {
@@ -195,8 +195,8 @@ class ReduceMeanComputeTester : public arena::TestCase {
  std::string input_ = "x";
  std::string output_ = "out";
  std::vector<int> dim_{0};
-  DDim x_dims_{{3, 2, 3, 4}};
  bool keep_dim_ = false;
+  DDim x_dims_{{3, 2, 3, 4}};
  bool reduce_all_ = false;
 public:

--- a/lite/tests/kernels/sequence_expand_compute_test.cc
+++ b/lite/tests/kernels/sequence_expand_compute_test.cc
@@ -25,10 +25,10 @@ class SequenceExpandComputeTester : public arena::TestCase {
  const std::string input_x_ = "x";
  const std::string input_y_ = "y";
  const std::string output_ = "out";
-  int ref_level_ = -1;
-  DDim dims_{{4, 1}};
  LoD lod_x_{{0, 2, 4}};
  LoD lod_y_{{0, 1, 4}};
+  int ref_level_ = -1;
+  DDim dims_{{4, 1}};
 public:
  SequenceExpandComputeTester(const Place& place,
@@ -50,7 +50,6 @@ class SequenceExpandComputeTester : public arena::TestCase {
    const auto* x_data = x->data<float>();
    (x->mutable_lod())->clear();
    (x->mutable_lod())->push_back(lod_x_[0]);
-    int x_rank = dims_.size();
    auto width = x->numel() / dims_[0];
    auto lod_x = x->lod();
@@ -59,7 +58,6 @@ class SequenceExpandComputeTester : public arena::TestCase {
    for (int i = 0; i < lod_y_.size(); i++) {
      (y->mutable_lod())->push_back(lod_y_[i]);
    }
-    const auto* y_data = y->data<float>();
    if (ref_level_ == -1) {
      ref_level_ = lod_y_.size() - 1;
    }

--- a/lite/tests/kernels/sequence_pool_compute_test.cc
+++ b/lite/tests/kernels/sequence_pool_compute_test.cc
@@ -25,9 +25,9 @@ class SequencePoolComputeTester : public arena::TestCase {
  // common attributes for this op.
  std::string input_ = "x";
  std::string output_ = "out";
-  DDim dims_{{5, 1}};
  LoD lod_{{0, 2, 5}};
  std::string pool_type_ = "SUM";
+  DDim dims_{{5, 1}};
 public:
  SequencePoolComputeTester(const Place& place,

--- a/lite/tests/math/gemm_int8_compute_test.cc
+++ b/lite/tests/math/gemm_int8_compute_test.cc
@@ -60,10 +60,6 @@ bool test_gemm_int8(bool tra,
  Tensor tc_basic_fp32;
  Tensor tbias;
-  int lda = tra ? m : k;
-  int ldb = trb ? k : n;
-  int ldc = n;
  ta.Resize({m, k});
  tb.Resize({k, n});
  tc_int8.Resize({m, n});
@@ -94,6 +90,16 @@ bool test_gemm_int8(bool tra,
    scale_merge_int8[j] = scale_merge_fp32[j] / scale_c[0];
  }
+  LOG(INFO) << "gemm_int8 M: " << m << ", N: " << n << ", K: " << k
+            << ", transA: " << (tra ? "true" : "false")
+            << ", transB: " << (trb ? "true" : "false")
+            << ", relu: " << (has_relu ? "true" : "false")
+            << ", bias: " << (has_bias ? "true" : "false");
+#ifdef LITE_WITH_ARM
+  int lda = tra ? m : k;
+  int ldb = trb ? k : n;
+  int ldc = n;
  auto da = ta.mutable_data<int8_t>();
  auto db = tb.mutable_data<int8_t>();
  auto dc_int8 = tc_int8.mutable_data<int8_t>();
@@ -102,12 +108,6 @@ bool test_gemm_int8(bool tra,
  auto dc_basic_fp32 = tc_basic_fp32.mutable_data<float>();
  auto dbias = tbias.mutable_data<float>();
-  LOG(INFO) << "gemm_int8 M: " << m << ", N: " << n << ", K: " << k
-            << ", transA: " << (tra ? "true" : "false")
-            << ", transB: " << (trb ? "true" : "false")
-            << ", relu: " << (has_relu ? "true" : "false")
-            << ", bias: " << (has_bias ? "true" : "false");
-#ifdef LITE_WITH_ARM
  if (FLAGS_check_result) {
    Tensor ta_fp32;
    Tensor tb_fp32;

--- a/lite/tools/ci_build.sh
+++ b/lite/tools/ci_build.sh
@@ -42,7 +42,7 @@ function prepare_workspace {
    cp ../${DEBUG_TOOL_PATH_PREFIX}/analysis_tool.py ./${DEBUG_TOOL_PATH_PREFIX}/
    # clone submodule
-    #git submodule update --init --recursive
+    # git submodule update --init --recursive
    prepare_thirdparty
 }

--- a/lite/tools/debug/CMakeLists.txt
+++ b/lite/tools/debug/CMakeLists.txt
 lite_cc_library(debug_utils SRCS debug_utils.cc DEPS op_params model_parser)
-lite_cc_binary(lite_model_debug_tool SRCS model_debug_tool.cc
+if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK OR LITE_ON_MODEL_OPTIMIZE_TOOL)
+  lite_cc_binary(lite_model_debug_tool SRCS model_debug_tool.cc
    DEPS
    cxx_api
    debug_utils
    target_wrapper_host
    mir_passes
    gflags
+    logging
    ${ops} ${host_kernels}
    X86_DEPS ${x86_kernels}
    ARM_DEPS ${arm_kernels}
    NPU_DEPS ${npu_kernels}
    FPGA_DEPS ${fpga_kernels}
    CL_DEPS ${opencl_kernels})
+endif()
--- a/lite/tools/debug/model_debug_tool.cc
+++ b/lite/tools/debug/model_debug_tool.cc
@@ -16,9 +16,6 @@
 #include <string>
 #include <vector>
 #include "lite/api/cxx_api.h"
-#include "lite/api/paddle_use_kernels.h"
-#include "lite/api/paddle_use_ops.h"
-#include "lite/api/paddle_use_passes.h"
 #include "lite/core/op_registry.h"
 #include "lite/model_parser/model_parser.h"
 #include "lite/model_parser/pb/program_desc.h"
@@ -47,6 +44,9 @@ void Run(DebugConfig* conf) {
 #endif
 #ifdef LITE_WITH_FPGA
      Place{TARGET(kFPGA), PRECISION(kFloat)},
+#endif
+#ifdef LITE_WITH_CUDA
+      Place{TARGET(kCUDA), PRECISION(kFloat)},
 #endif
  });
@@ -68,6 +68,12 @@ void Run(DebugConfig* conf) {
 #endif
 #ifdef LITE_WITH_X86
                  Place{TARGET(kX86), PRECISION(kFloat)},
+#endif
+#ifdef LITE_WITH_FPGA
+                  Place{TARGET(kFPGA), PRECISION(kFloat)},
+#endif
+#ifdef LITE_WITH_CUDA
+                  Place{TARGET(kCUDA), PRECISION(kFloat)},
 #endif
                  valid_places,
                  passes);

--- a/lite/utils/logging.h
+++ b/lite/utils/logging.h
@@ -18,6 +18,9 @@
 */
 #pragma once
+#ifndef _LOGGING_H_
+#define _LOGGING_H_
 #include <assert.h>
 #include <sys/time.h>
 #include <sys/types.h>
@@ -183,3 +186,4 @@ class VoidifyFatal : public Voidify {
 }  // namespace lite
 }  // namespace paddle
+#endif
--- a/lite/utils/paddle_enforce.h
+++ b/lite/utils/paddle_enforce.h
@@ -35,5 +35,5 @@
  CHECK_GT((a), (b)) << paddle::lite::string_format("" __VA_ARGS__);
 #ifndef PADDLE_THROW
-#define PADDLE_THROW
+#define PADDLE_THROW(...) printf("" __VA_ARGS__);
 #endif