diff --git a/CMakeLists.txt b/CMakeLists.txt
index 6a84745c1ffb1a892467286a2681aac6eb5a6c37..7475fed772b96fb373c7fe03c6a6fb650181849c 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -1,10 +1,11 @@
 cmake_minimum_required(VERSION 3.0.0)
 
 option(USE_OPENMP "openmp support" ON)
-option(DEBUGING "enable debug mode" OFF)
-option(USE_EXCEPTION "use std exception" OFF)
+option(DEBUGING "enable debug mode" ON)
+option(USE_EXCEPTION "use std exception" ON)
 option(SYMBOL_HIDDEN "symbol hidden" OFF) # on when use jni or ios io
 option(LOG_PROFILE "log profile" OFF)
+
 # select the platform to build
 option(CPU "armv7 with neon" ON)
 option(GPU_MALI "mali gpu" OFF)
@@ -15,7 +16,6 @@ if(FPGA)
     option(FPGAV2 "fpga v2" OFF)
 endif()
 
-
 project(paddle-mobile)
 
 file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
@@ -247,5 +247,3 @@ elseif(FPGA)
     add_subdirectory(test)
 endif()
 
-add_subdirectory(test)
-
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 85fcc44a360a35e8100f6a8af6d0977fb577c7c0..bd463b0bd2aba2ef031a3fc5182ff92910c18f5b 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -95,12 +95,13 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
 }
 
 template <typename Dtype>
-void LoadMemInternal(void **data, framework::LoDTensor *tensor) {
+static void LoadMemInternal(void **data, framework::LoDTensor *tensor,
+                            bool quant_uint8 = false) {
   char **data_buf = reinterpret_cast<char **>(data);
   int64_t size = tensor->numel();
   Dtype *tensor_data = tensor->mutable_data<Dtype>();
-  if (0) {
-    // TODO(hjchen2) should be moved into operator init function
+  if (quant_uint8) {
+    // TODO: this quant_uint8 handling should be moved into the operator init function
     float min_value;
     float max_value;
     memory::Copy(&min_value, data_buf, sizeof(float));
@@ -156,7 +157,8 @@ void Executor<Dtype, P>::LoadMemory(
   // parse tensor from stream
   switch (tensor_desc.DataType()) {
     case framework::VARTYPE_TYPE_FP32:
-      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor);
+      LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor,
+                             program_.quantification);
       break;
     case framework::VARTYPE_TYPE_INT8:
       LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
@@ -263,7 +265,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
   framework::Variable *g_feed_value = program_.scope->Var("feed");
   framework::Tensor *feed_tensor =
       g_feed_value->GetMutable<framework::LoDTensor>();
-  DLOG << "feed_tensor dim: " << feed_tensor->dims();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
   std::shared_ptr<framework::BlockDesc> to_predict_block =
@@ -298,15 +299,7 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
   for (int i = 0; i < profile.size(); i++) {
     const auto &pInfo = profile[i];
     uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
-    if (ops[i]->Type() == "conv2d") {
-      auto inputs = ops[i]->Inputs();
-      auto *filter = framework::GetVarValue<framework::LoDTensor>(
-          "Filter", inputs, *(program_.scope));
-      int kernel_size = filter->dims()[2];
-      _tp[ops[i]->Type() + "_" + std::to_string(kernel_size)] += timeCost;
-    } else {
-      _tp[ops[i]->Type()] += timeCost;
-    }
+    _tp[ops[i]->Type()] += timeCost;
   }
   printf("====================[ profile ]======================\n");
   using prof_t = std::pair<std::string, uint64_t>;
@@ -376,14 +369,6 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
   for (int i = 0; i < profile.size(); i++) {
     const auto &pInfo = profile[i];
     uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
-    if (ops[i]->Type() == "conv2d") {
-      auto inputs = ops[i]->Inputs();
-      auto input_keys = ops[i]->GetInputKeys();
-      auto *filter = framework::GetVarValue<framework::LoDTensor>(
-          input_keys[1], inputs, *(program_.scope));
-      int kernel_size = filter->dims()[2];
-      printf("kernel size: %d\n", kernel_size);
-    }
     _tp[ops[i]->Type()] += timeCost;
   }
   printf("====================[ profile ]======================\n");
diff --git a/tools/toolchains/arm-android-neon.cmake b/tools/toolchains/arm-android-neon.cmake
index 33a70d82bd78c15dd28ea7574d6df324f8cc64aa..5e431059a974810b2fd0481e0942447f57bf1286 100644
--- a/tools/toolchains/arm-android-neon.cmake
+++ b/tools/toolchains/arm-android-neon.cmake
@@ -3,4 +3,3 @@ set(ANDROID_PIE TRUE)
 set(ANDROID_STL "c++_static")
 set(ANDROID_PLATFORM "android-22")
 include("${CMAKE_CURRENT_LIST_DIR}/../android-cmake/android.toolchain.cmake")
-#include("/Users/chenhoujiang/Project/android-ndk-r16b/build/cmake/android.toolchain.cmake")