Skip to content
体验新版
项目
组织
正在加载...
登录
切换导航
打开侧边栏
PaddlePaddle
Paddle-Lite
提交
b5c77670
P
Paddle-Lite
项目概览
PaddlePaddle
/
Paddle-Lite
通知
332
Star
4
Fork
1
代码
文件
提交
分支
Tags
贡献者
分支图
Diff
Issue
271
列表
看板
标记
里程碑
合并请求
78
Wiki
0
Wiki
分析
仓库
DevOps
项目成员
Pages
P
Paddle-Lite
项目概览
项目概览
详情
发布
仓库
仓库
文件
提交
分支
标签
贡献者
分支图
比较
Issue
271
Issue
271
列表
看板
标记
里程碑
合并请求
78
合并请求
78
Pages
分析
分析
仓库分析
DevOps
Wiki
0
Wiki
成员
成员
收起侧边栏
关闭侧边栏
动态
分支图
创建新Issue
提交
Issue看板
提交
b5c77670
编写于
4月 26, 2020
作者:
D
DannyIsFunny
浏览文件
操作
浏览文件
下载
差异文件
test=develop
上级
7126524d
880a2636
变更
35
隐藏空白更改
内联
并排
Showing
35 changed file
with
1580 addition
and
74 deletion
+1580
-74
lite/CMakeLists.txt
lite/CMakeLists.txt
+2
-0
lite/api/_paddle_use_ops.h
lite/api/_paddle_use_ops.h
+1
-0
lite/api/cxx_api_impl.cc
lite/api/cxx_api_impl.cc
+5
-4
lite/api/lite_multithread_test.cc
lite/api/lite_multithread_test.cc
+20
-9
lite/api/test_classify_lite_bm.cc
lite/api/test_classify_lite_bm.cc
+29
-10
lite/backends/x86/math/math_function.cc
lite/backends/x86/math/math_function.cc
+1
-2
lite/core/mir/fusion/conv_bn_fuser.cc
lite/core/mir/fusion/conv_bn_fuser.cc
+6
-3
lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7
lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7
+97
-0
lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8
lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8
+97
-0
lite/demo/cxx/test_libs/classification_full.cc
lite/demo/cxx/test_libs/classification_full.cc
+185
-0
lite/demo/cxx/test_libs/classification_light.cc
lite/demo/cxx/test_libs/classification_light.cc
+129
-0
lite/demo/cxx/test_libs/prepare.sh
lite/demo/cxx/test_libs/prepare.sh
+30
-0
lite/demo/cxx/test_libs/run.sh
lite/demo/cxx/test_libs/run.sh
+75
-0
lite/demo/cxx/test_libs/test_helper.cc
lite/demo/cxx/test_libs/test_helper.cc
+131
-0
lite/demo/cxx/test_libs/test_helper.h
lite/demo/cxx/test_libs/test_helper.h
+38
-0
lite/demo/cxx/test_libs/yolov3_full.cc
lite/demo/cxx/test_libs/yolov3_full.cc
+182
-0
lite/demo/cxx/test_libs/yolov3_light.cc
lite/demo/cxx/test_libs/yolov3_light.cc
+128
-0
lite/kernels/bm/bridges/CMakeLists.txt
lite/kernels/bm/bridges/CMakeLists.txt
+1
-0
lite/kernels/bm/bridges/act_op.cc
lite/kernels/bm/bridges/act_op.cc
+5
-0
lite/kernels/bm/bridges/graph.cc
lite/kernels/bm/bridges/graph.cc
+5
-0
lite/kernels/bm/bridges/graph.h
lite/kernels/bm/bridges/graph.h
+3
-0
lite/kernels/bm/bridges/paddle_use_bridges.h
lite/kernels/bm/bridges/paddle_use_bridges.h
+2
-0
lite/kernels/bm/bridges/pool_op.cc
lite/kernels/bm/bridges/pool_op.cc
+69
-25
lite/kernels/bm/subgraph_compute.cc
lite/kernels/bm/subgraph_compute.cc
+2
-0
lite/kernels/x86/fc_compute.h
lite/kernels/x86/fc_compute.h
+4
-4
lite/kernels/x86/mul_compute.h
lite/kernels/x86/mul_compute.h
+8
-6
lite/kernels/x86/sequence_reshape_compute.h
lite/kernels/x86/sequence_reshape_compute.h
+4
-6
lite/operators/CMakeLists.txt
lite/operators/CMakeLists.txt
+1
-0
lite/operators/max_pool_with_index_op.cc
lite/operators/max_pool_with_index_op.cc
+76
-0
lite/operators/max_pool_with_index_op.h
lite/operators/max_pool_with_index_op.h
+87
-0
lite/tools/build.bat
lite/tools/build.bat
+5
-5
lite/tools/untar.py
lite/tools/untar.py
+35
-0
mobile/src/operators/op_param.h
mobile/src/operators/op_param.h
+1
-0
mobile/test/CMakeLists.txt
mobile/test/CMakeLists.txt
+3
-0
mobile/test/net/test_inference_imfix.cpp
mobile/test/net/test_inference_imfix.cpp
+113
-0
未找到文件。
lite/CMakeLists.txt
浏览文件 @
b5c77670
...
...
@@ -369,6 +369,8 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND LITE_WITH_ARM)
COMMAND cp
"
${
CMAKE_SOURCE_DIR
}
/lite/demo/cxx/makefiles/test_cv/Makefile.
${
ARM_TARGET_OS
}
.
${
ARM_TARGET_ARCH_ABI
}
"
"
${
INFER_LITE_PUBLISH_ROOT
}
/demo/cxx/test_cv/Makefile"
COMMAND cp -r
"
${
CMAKE_SOURCE_DIR
}
/lite/demo/cxx/mask_detection"
"
${
INFER_LITE_PUBLISH_ROOT
}
/demo/cxx"
COMMAND cp
"
${
CMAKE_SOURCE_DIR
}
/lite/demo/cxx/makefiles/mask_detection/Makefile.
${
ARM_TARGET_OS
}
.
${
ARM_TARGET_ARCH_ABI
}
"
"
${
INFER_LITE_PUBLISH_ROOT
}
/demo/cxx/mask_detection/Makefile"
COMMAND cp -r
"
${
CMAKE_SOURCE_DIR
}
/lite/demo/cxx/test_libs"
"
${
INFER_LITE_PUBLISH_ROOT
}
/demo/cxx"
COMMAND cp
"
${
CMAKE_SOURCE_DIR
}
/lite/demo/cxx/makefiles/test_libs/Makefile.
${
ARM_TARGET_OS
}
.
${
ARM_TARGET_ARCH_ABI
}
"
"
${
INFER_LITE_PUBLISH_ROOT
}
/demo/cxx/test_libs/Makefile"
)
add_dependencies
(
publish_inference_android_cxx_demos logging gflags
)
add_dependencies
(
publish_inference_cxx_lib publish_inference_android_cxx_demos
)
...
...
lite/api/_paddle_use_ops.h
浏览文件 @
b5c77670
...
...
@@ -48,6 +48,7 @@ USE_LITE_OP(concat)
USE_LITE_OP
(
conv2d
)
USE_LITE_OP
(
depthwise_conv2d
)
USE_LITE_OP
(
pool2d
)
USE_LITE_OP
(
max_pool2d_with_index
)
USE_LITE_OP
(
batch_norm
)
USE_LITE_OP
(
fusion_elementwise_sub_activation
)
USE_LITE_OP
(
transpose
)
...
...
lite/api/cxx_api_impl.cc
浏览文件 @
b5c77670
...
...
@@ -71,10 +71,11 @@ void CxxPaddleApiImpl::Init(const lite_api::CxxConfig &config) {
threads_
=
config
.
threads
();
#if (defined LITE_WITH_X86) && (defined PADDLE_WITH_MKLML) && \
!(defined LITE_ON_MODEL_OPTIMIZE_TOOL)
// set_thread_by input is disabled here, because this inference is proved unstable
// int num_threads = config.x86_math_library_num_threads();
// int real_num_threads = num_threads > 1 ? num_threads : 1;
int
real_num_threads
=
1
;
// set_thread_by input is disabled here, because this inference is proved
// unstable
// int num_threads = config.x86_math_library_num_threads();
// int real_num_threads = num_threads > 1 ? num_threads : 1;
int
real_num_threads
=
1
;
paddle
::
lite
::
x86
::
MKL_Set_Num_Threads
(
real_num_threads
);
omp_set_num_threads
(
real_num_threads
);
VLOG
(
3
)
<<
"set_x86_math_library_math_threads() is set successfully and the "
...
...
lite/api/lite_multithread_test.cc
浏览文件 @
b5c77670
...
...
@@ -36,7 +36,7 @@ DEFINE_string(model_dir_0, "", "model_dir_0");
DEFINE_string
(
input_shape_0
,
"1,3,224,224"
,
"input shapes another, separated by colon and comma"
);
DEFINE_string
(
target
,
"arm"
,
"main target for Predictor: arm, opencl"
);
DEFINE_bool
(
use_optimize_nb
,
false
,
"optimized & naive buffer model for mobile devices"
);
...
...
@@ -51,9 +51,19 @@ void OutputOptModel(const std::string& load_model_dir,
const
std
::
vector
<
std
::
vector
<
int64_t
>>&
input_shapes
)
{
lite_api
::
CxxConfig
config
;
config
.
set_model_dir
(
load_model_dir
);
config
.
set_valid_places
({
Place
{
TARGET
(
kARM
),
PRECISION
(
kFloat
)},
});
if
(
FLAGS_target
==
"arm"
)
{
config
.
set_valid_places
({
Place
{
TARGET
(
kARM
),
PRECISION
(
kFloat
)},
});
}
else
if
(
FLAGS_target
==
"opencl"
)
{
config
.
set_valid_places
({
Place
{
TARGET
(
kOpenCL
),
PRECISION
(
kFP16
),
DATALAYOUT
(
kImageDefault
)},
Place
{
TARGET
(
kOpenCL
),
PRECISION
(
kFloat
),
DATALAYOUT
(
kNCHW
)},
Place
{
TARGET
(
kOpenCL
),
PRECISION
(
kAny
),
DATALAYOUT
(
kImageDefault
)},
Place
{
TARGET
(
kOpenCL
),
PRECISION
(
kAny
),
DATALAYOUT
(
kNCHW
)},
Place
{
TARGET
(
kARM
)},
// enable kARM CPU kernel when no opencl kernel
});
}
auto
predictor
=
lite_api
::
CreatePaddlePredictor
(
config
);
// delete old optimized model
...
...
@@ -78,7 +88,7 @@ void Run(const std::vector<std::vector<int64_t>>& input_shapes,
int
tid
,
const
int
warmup_times
=
5
)
{
lite_api
::
MobileConfig
config
;
config
.
set_model_
dir
(
model_dir
);
config
.
set_model_
from_file
(
model_dir
+
".nb"
);
config
.
set_power_mode
(
power_mode
);
config
.
set_threads
(
thread_num
);
...
...
@@ -197,7 +207,7 @@ void RunTestType_10(const std::vector<std::vector<int64_t>>& input_shapes,
const
int
repeat
,
int
warmup
=
5
)
{
lite_api
::
MobileConfig
config
;
config
.
set_model_
dir
(
model_dir
);
config
.
set_model_
from_file
(
model_dir
+
".nb"
);
config
.
set_power_mode
(
power_mode
);
config
.
set_threads
(
thread_num
);
...
...
@@ -218,13 +228,13 @@ void RunTestType_11(const std::vector<std::vector<int64_t>>& input_shapes,
const
int
repeat
,
int
warmup
=
5
)
{
lite_api
::
MobileConfig
config
;
config
.
set_model_
dir
(
model_dir
);
config
.
set_model_
from_file
(
model_dir
+
".nb"
);
config
.
set_power_mode
(
power_mode
);
config
.
set_threads
(
thread_num
);
auto
predictor
=
lite_api
::
CreatePaddlePredictor
(
config
);
config
.
set_model_
dir
(
model_dir_0
);
config
.
set_model_
from_file
(
model_dir_0
+
".nb"
);
auto
predictor_0
=
lite_api
::
CreatePaddlePredictor
(
config
);
for
(
int
i
=
0
;
i
<
2
*
repeat
;
i
+=
2
)
{
...
...
@@ -246,7 +256,8 @@ int main(int argc, char** argv) {
gflags
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
if
(
FLAGS_model_dir
==
""
)
{
LOG
(
INFO
)
<<
"usage: "
<<
"--model_dir /path/to/your/model"
;
<<
"--model_dir /path/to/your/model --model_dir_0 "
"/path/to/your/model0 --target `arm` or `opencl`"
;
exit
(
0
);
}
std
::
string
save_optimized_model_dir
=
""
;
...
...
lite/api/test_classify_lite_bm.cc
浏览文件 @
b5c77670
...
...
@@ -15,6 +15,7 @@
#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <fstream>
#include <thread> //NOLINT
#include <vector>
#include "lite/api/cxx_api.h"
#include "lite/api/paddle_use_kernels.h"
...
...
@@ -30,14 +31,18 @@ DEFINE_string(input_img_txt_path,
namespace
paddle
{
namespace
lite
{
void
TestModel
(
const
std
::
vector
<
Place
>&
valid_places
)
{
const
int
g_batch_size
=
1
;
const
int
g_thread_num
=
1
;
void
instance_run
()
{
lite
::
Predictor
predictor
;
std
::
vector
<
std
::
string
>
passes
;
std
::
vector
<
Place
>
valid_places
({
Place
{
TARGET
(
kBM
),
PRECISION
(
kFloat
)},
Place
{
TARGET
(
kX86
),
PRECISION
(
kFloat
)}});
predictor
.
Build
(
FLAGS_model_dir
,
""
,
""
,
valid_places
,
passes
);
auto
*
input_tensor
=
predictor
.
GetInput
(
0
);
input_tensor
->
Resize
(
DDim
(
std
::
vector
<
DDim
::
value_type
>
({
1
,
3
,
FLAGS_im_height
,
FLAGS_im_width
})));
input_tensor
->
Resize
(
DDim
(
std
::
vector
<
DDim
::
value_type
>
(
{
g_batch_size
,
3
,
FLAGS_im_height
,
FLAGS_im_width
})));
auto
*
data
=
input_tensor
->
mutable_data
<
float
>
();
auto
item_size
=
input_tensor
->
dims
().
production
();
if
(
FLAGS_input_img_txt_path
.
empty
())
{
...
...
@@ -45,12 +50,15 @@ void TestModel(const std::vector<Place>& valid_places) {
data
[
i
]
=
1
;
}
}
else
{
std
::
fstream
fs
(
FLAGS_input_img_txt_path
,
std
::
ios
::
in
);
if
(
!
fs
.
is_open
())
{
LOG
(
FATAL
)
<<
"open input_img_txt error."
;
}
for
(
int
i
=
0
;
i
<
item_size
;
i
++
)
{
fs
>>
data
[
i
];
for
(
int
j
=
0
;
j
<
g_batch_size
;
j
++
)
{
std
::
fstream
fs
(
FLAGS_input_img_txt_path
,
std
::
ios
::
in
);
if
(
!
fs
.
is_open
())
{
LOG
(
FATAL
)
<<
"open input_img_txt error."
;
}
for
(
int
i
=
0
;
i
<
item_size
/
g_batch_size
;
i
++
)
{
fs
>>
data
[
i
];
}
data
+=
j
*
item_size
/
g_batch_size
;
}
}
for
(
int
i
=
0
;
i
<
FLAGS_warmup
;
++
i
)
{
...
...
@@ -72,6 +80,7 @@ void TestModel(const std::vector<Place>& valid_places) {
FILE
*
fp
=
fopen
(
"result.txt"
,
"wb"
);
for
(
int
i
=
0
;
i
<
out
.
size
();
i
++
)
{
auto
*
out_data
=
out
[
i
]
->
data
<
float
>
();
LOG
(
INFO
)
<<
out
[
i
]
->
numel
();
for
(
int
j
=
0
;
j
<
out
[
i
]
->
numel
();
j
++
)
{
fprintf
(
fp
,
"%f
\n
"
,
out_data
[
j
]);
}
...
...
@@ -79,6 +88,16 @@ void TestModel(const std::vector<Place>& valid_places) {
fclose
(
fp
);
}
void
TestModel
(
const
std
::
vector
<
Place
>&
valid_places
)
{
std
::
vector
<
std
::
unique_ptr
<
std
::
thread
>>
instances_vec
;
for
(
int
i
=
0
;
i
<
g_thread_num
;
++
i
)
{
instances_vec
.
emplace_back
(
new
std
::
thread
(
&
instance_run
));
}
for
(
int
i
=
0
;
i
<
g_thread_num
;
++
i
)
{
instances_vec
[
i
]
->
join
();
}
}
TEST
(
Classify
,
test_bm
)
{
std
::
vector
<
Place
>
valid_places
({
Place
{
TARGET
(
kBM
),
PRECISION
(
kFloat
)},
Place
{
TARGET
(
kX86
),
PRECISION
(
kFloat
)}});
...
...
lite/backends/x86/math/math_function.cc
浏览文件 @
b5c77670
...
...
@@ -128,8 +128,7 @@ struct RowwiseAdd<lite::TargetType::kX86, T> {
T
*
output_data
=
output
->
template
mutable_data
<
T
>();
for
(
int64_t
i
=
0
;
i
<
in_dims
[
0
];
++
i
)
{
for
(
int64_t
j
=
0
;
j
<
size
;
++
j
)
{
output_data
[
i
*
size
+
j
]
=
input_data
[
i
*
size
+
j
]
+
vector_data
[
j
];
output_data
[
i
*
size
+
j
]
=
input_data
[
i
*
size
+
j
]
+
vector_data
[
j
];
}
}
}
...
...
lite/core/mir/fusion/conv_bn_fuser.cc
浏览文件 @
b5c77670
...
...
@@ -103,9 +103,12 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
std
::
string
conv_weight_name
=
matched
.
at
(
"conv_weight"
)
->
arg
()
->
name
;
auto
conv_weight_t
=
scope
->
FindVar
(
conv_weight_name
)
->
GetMutable
<
lite
::
Tensor
>
();
auto
groups
=
conv_op_desc
->
GetAttr
<
int
>
(
"groups"
);
bool
depthwise
=
false
;
if
(
conv_type_
==
"conv2d_transpose"
)
{
depthwise
=
(
conv_weight_t
->
dims
()[
0
]
==
conv_weight_t
->
dims
()[
1
]
*
groups
);
CHECK_EQ
(
static_cast
<
size_t
>
(
bn_scale_t
->
data_size
()),
static_cast
<
size_t
>
(
conv_weight_t
->
dims
()[
1
]))
static_cast
<
size_t
>
(
conv_weight_t
->
dims
()[
1
]
*
groups
))
<<
"The BN bias's size should be equal to the size of the first "
<<
"dim size of the conv weights"
;
}
else
{
...
...
@@ -159,7 +162,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
// compute new conv_weight for int8
auto
weight_scale
=
conv_op_desc
->
GetAttr
<
std
::
vector
<
float
>>
(
"weight_scale"
);
if
(
conv_type_
==
"conv2d_transpose"
)
{
if
(
conv_type_
==
"conv2d_transpose"
&&
!
depthwise
)
{
int
c_size
=
conv_weight_t
->
dims
()[
1
]
*
conv_weight_t
->
dims
()[
2
]
*
conv_weight_t
->
dims
()[
3
];
int
hw
=
conv_weight_t
->
dims
()[
2
]
*
conv_weight_t
->
dims
()[
3
];
...
...
@@ -199,7 +202,7 @@ void ConvBNFuser::InsertNewNode(SSAGraph* graph, const key2nodes_t& matched) {
}
else
{
// compute new conv_weight
auto
conv_weight_d
=
conv_weight_t
->
mutable_data
<
float
>
();
if
(
conv_type_
==
"conv2d_transpose"
)
{
if
(
conv_type_
==
"conv2d_transpose"
&&
!
depthwise
)
{
int
c_size
=
conv_weight_t
->
dims
()[
1
]
*
conv_weight_t
->
dims
()[
2
]
*
conv_weight_t
->
dims
()[
3
];
int
hw
=
conv_weight_t
->
dims
()[
2
]
*
conv_weight_t
->
dims
()[
3
];
...
...
lite/demo/cxx/makefiles/test_libs/Makefile.android.armv7
0 → 100644
浏览文件 @
b5c77670
ARM_ABI
=
arm7
export
ARM_ABI
include
../Makefile.def
LITE_ROOT
=
../../../
THIRD_PARTY_DIR
=
${LITE_ROOT}
/third_party
OPENCV_VERSION
=
opencv4.1.0
OPENCV_LIBS
=
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/libs/libopencv_imgcodecs.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/libs/libopencv_imgproc.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/libs/libopencv_core.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/3rdparty/libs/libtegra_hal.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/3rdparty/libs/liblibjpeg-turbo.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/3rdparty/libs/liblibwebp.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/3rdparty/libs/liblibpng.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/3rdparty/libs/liblibjasper.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/3rdparty/libs/liblibtiff.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/3rdparty/libs/libIlmImf.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/3rdparty/libs/libtbb.a
\
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE
=
-I
../../../third_party/
${OPENCV_VERSION}
/armeabi-v7a/include
CXX_INCLUDES
=
$(INCLUDES)
${OPENCV_INCLUDE}
-I
$(LITE_ROOT)
/cxx/include
-I
${THIRD_PARTY_DIR}
/gflags/include
CXX_LIBS
=
${OPENCV_LIBS}
${THIRD_PARTY_DIR}
/gflags/lib/libgflags.a
$(SYSTEM_LIBS)
LITE_FULL_SHAPRED_LIBS
=
-L
$(LITE_ROOT)
/cxx/lib/
-lpaddle_full_api_shared
LITE_FULL_STATIC_LIBS
=
$(LITE_ROOT)
/cxx/lib/libpaddle_api_full_bundled.a
LITE_LIGHT_SHAPRED_LIBS
=
-L
$(LITE_ROOT)
/cxx/lib/
-lpaddle_light_api_shared
LITE_LIGHT_STATIC_LIBS
=
$(LITE_ROOT)
/cxx/lib/libpaddle_api_light_bundled.a
##########
fetch_opencv
:
@
test
-d
${THIRD_PARTY_DIR}
||
mkdir
${THIRD_PARTY_DIR}
@
test
-e
${THIRD_PARTY_DIR}
/
${OPENCV_VERSION}
.tar.gz
||
\
(
echo
"fetch opencv libs"
&&
\
wget
-P
${THIRD_PARTY_DIR}
https://paddle-inference-dist.bj.bcebos.com/
${OPENCV_VERSION}
.tar.gz
)
@
test
-d
${THIRD_PARTY_DIR}
/
${OPENCV_VERSION}
||
\
tar
-zxvf
${THIRD_PARTY_DIR}
/
${OPENCV_VERSION}
.tar.gz
-C
${THIRD_PARTY_DIR}
test_helper.o
:
test_helper.cc
$(CC)
$(SYSROOT_COMPLILE)
$(CXX_DEFINES)
$(CXX_INCLUDES)
$(CXX_FLAGS)
-o
test_helper.o
-c
test_helper.cc
classification_full.o
:
classification_full.cc
$(CC)
$(SYSROOT_COMPLILE)
$(CXX_DEFINES)
$(CXX_INCLUDES)
$(CXX_FLAGS)
-o
classification_full.o
-c
classification_full.cc
classification_light.o
:
classification_light.cc
$(CC)
$(SYSROOT_COMPLILE)
$(CXX_DEFINES)
$(CXX_INCLUDES)
$(CXX_FLAGS)
-o
classification_light.o
-c
classification_light.cc
classification_full_shared
:
fetch_opencv classification_full.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
classification_full.o test_helper.o
-o
classification_full_shared
$(CXX_LIBS)
$(LDFLAGS)
${LITE_FULL_SHAPRED_LIBS}
classification_full_static
:
fetch_opencv classification_full.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
classification_full.o test_helper.o
-o
classification_full_static
${LITE_FULL_STATIC_LIBS}
$(CXX_LIBS)
$(LDFLAGS)
classification_light_shared
:
fetch_opencv classification_light.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
classification_light.o test_helper.o
-o
classification_light_shared
$(CXX_LIBS)
$(LDFLAGS)
${LITE_LIGHT_SHAPRED_LIBS}
classification_light_static
:
fetch_opencv classification_light.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
classification_light.o test_helper.o
-o
classification_light_static
${LITE_LIGHT_STATIC_LIBS}
$(CXX_LIBS)
$(LDFLAGS)
######
yolov3_full.o
:
yolov3_full.cc
$(CC)
$(SYSROOT_COMPLILE)
$(CXX_DEFINES)
$(CXX_INCLUDES)
$(CXX_FLAGS)
-o
yolov3_full.o
-c
yolov3_full.cc
yolov3_light.o
:
yolov3_light.cc
$(CC)
$(SYSROOT_COMPLILE)
$(CXX_DEFINES)
$(CXX_INCLUDES)
$(CXX_FLAGS)
-o
yolov3_light.o
-c
yolov3_light.cc
yolov3_full_shared
:
fetch_opencv yolov3_full.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
yolov3_full.o test_helper.o
-o
yolov3_full_shared
$(CXX_LIBS)
$(LDFLAGS)
${LITE_FULL_SHAPRED_LIBS}
yolov3_full_static
:
fetch_opencv yolov3_full.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
yolov3_full.o test_helper.o
-o
yolov3_full_static
${LITE_FULL_STATIC_LIBS}
$(CXX_LIBS)
$(LDFLAGS)
yolov3_light_shared
:
fetch_opencv yolov3_light.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
yolov3_light.o test_helper.o
-o
yolov3_light_shared
$(CXX_LIBS)
$(LDFLAGS)
${LITE_LIGHT_SHAPRED_LIBS}
yolov3_light_static
:
fetch_opencv yolov3_full.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
yolov3_light.o test_helper.o
-o
yolov3_light_static
${LITE_LIGHT_STATIC_LIBS}
$(CXX_LIBS)
$(LDFLAGS)
#####
all
:
classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static
clean
:
rm
-f
*
.o
rm
-f
classification_full_shared
rm
-r
classification_full_static
rm
-r
classification_light_shared
rm
-f
classification_light_static
rm
-f
yolov3_full_shared
rm
-f
yolov3_full_static
rm
-f
yolov3_light_shared
rm
-f
yolov3_light_static
lite/demo/cxx/makefiles/test_libs/Makefile.android.armv8
0 → 100644
浏览文件 @
b5c77670
ARM_ABI
=
arm8
export
ARM_ABI
include
../Makefile.def
LITE_ROOT
=
../../../
THIRD_PARTY_DIR
=
${LITE_ROOT}
/third_party
OPENCV_VERSION
=
opencv4.1.0
OPENCV_LIBS
=
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/libs/libopencv_imgcodecs.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/libs/libopencv_imgproc.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/libs/libopencv_core.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/3rdparty/libs/libtegra_hal.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/3rdparty/libs/liblibjpeg-turbo.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/3rdparty/libs/liblibwebp.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/3rdparty/libs/liblibpng.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/3rdparty/libs/liblibjasper.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/3rdparty/libs/liblibtiff.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/3rdparty/libs/libIlmImf.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/3rdparty/libs/libtbb.a
\
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/3rdparty/libs/libcpufeatures.a
OPENCV_INCLUDE
=
-I
../../../third_party/
${OPENCV_VERSION}
/arm64-v8a/include
CXX_INCLUDES
=
$(INCLUDES)
${OPENCV_INCLUDE}
-I
$(LITE_ROOT)
/cxx/include
-I
${THIRD_PARTY_DIR}
/gflags/include
CXX_LIBS
=
${OPENCV_LIBS}
${THIRD_PARTY_DIR}
/gflags/lib/libgflags.a
$(SYSTEM_LIBS)
LITE_FULL_SHAPRED_LIBS
=
-L
$(LITE_ROOT)
/cxx/lib/
-lpaddle_full_api_shared
LITE_FULL_STATIC_LIBS
=
$(LITE_ROOT)
/cxx/lib/libpaddle_api_full_bundled.a
LITE_LIGHT_SHAPRED_LIBS
=
-L
$(LITE_ROOT)
/cxx/lib/
-lpaddle_light_api_shared
LITE_LIGHT_STATIC_LIBS
=
$(LITE_ROOT)
/cxx/lib/libpaddle_api_light_bundled.a
##########
fetch_opencv
:
@
test
-d
${THIRD_PARTY_DIR}
||
mkdir
${THIRD_PARTY_DIR}
@
test
-e
${THIRD_PARTY_DIR}
/
${OPENCV_VERSION}
.tar.gz
||
\
(
echo
"fetch opencv libs"
&&
\
wget
-P
${THIRD_PARTY_DIR}
https://paddle-inference-dist.bj.bcebos.com/
${OPENCV_VERSION}
.tar.gz
)
@
test
-d
${THIRD_PARTY_DIR}
/
${OPENCV_VERSION}
||
\
tar
-zxvf
${THIRD_PARTY_DIR}
/
${OPENCV_VERSION}
.tar.gz
-C
${THIRD_PARTY_DIR}
test_helper.o
:
test_helper.cc
$(CC)
$(SYSROOT_COMPLILE)
$(CXX_DEFINES)
$(CXX_INCLUDES)
$(CXX_FLAGS)
-o
test_helper.o
-c
test_helper.cc
classification_full.o
:
classification_full.cc
$(CC)
$(SYSROOT_COMPLILE)
$(CXX_DEFINES)
$(CXX_INCLUDES)
$(CXX_FLAGS)
-o
classification_full.o
-c
classification_full.cc
classification_light.o
:
classification_light.cc
$(CC)
$(SYSROOT_COMPLILE)
$(CXX_DEFINES)
$(CXX_INCLUDES)
$(CXX_FLAGS)
-o
classification_light.o
-c
classification_light.cc
classification_full_shared
:
fetch_opencv classification_full.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
classification_full.o test_helper.o
-o
classification_full_shared
$(CXX_LIBS)
$(LDFLAGS)
${LITE_FULL_SHAPRED_LIBS}
classification_full_static
:
fetch_opencv classification_full.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
classification_full.o test_helper.o
-o
classification_full_static
${LITE_FULL_STATIC_LIBS}
$(CXX_LIBS)
$(LDFLAGS)
classification_light_shared
:
fetch_opencv classification_light.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
classification_light.o test_helper.o
-o
classification_light_shared
$(CXX_LIBS)
$(LDFLAGS)
${LITE_LIGHT_SHAPRED_LIBS}
classification_light_static
:
fetch_opencv classification_light.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
classification_light.o test_helper.o
-o
classification_light_static
${LITE_LIGHT_STATIC_LIBS}
$(CXX_LIBS)
$(LDFLAGS)
######
yolov3_full.o
:
yolov3_full.cc
$(CC)
$(SYSROOT_COMPLILE)
$(CXX_DEFINES)
$(CXX_INCLUDES)
$(CXX_FLAGS)
-o
yolov3_full.o
-c
yolov3_full.cc
yolov3_light.o
:
yolov3_light.cc
$(CC)
$(SYSROOT_COMPLILE)
$(CXX_DEFINES)
$(CXX_INCLUDES)
$(CXX_FLAGS)
-o
yolov3_light.o
-c
yolov3_light.cc
yolov3_full_shared
:
fetch_opencv yolov3_full.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
yolov3_full.o test_helper.o
-o
yolov3_full_shared
$(CXX_LIBS)
$(LDFLAGS)
${LITE_FULL_SHAPRED_LIBS}
yolov3_full_static
:
fetch_opencv yolov3_full.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
yolov3_full.o test_helper.o
-o
yolov3_full_static
${LITE_FULL_STATIC_LIBS}
$(CXX_LIBS)
$(LDFLAGS)
yolov3_light_shared
:
fetch_opencv yolov3_light.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
yolov3_light.o test_helper.o
-o
yolov3_light_shared
$(CXX_LIBS)
$(LDFLAGS)
${LITE_LIGHT_SHAPRED_LIBS}
yolov3_light_static
:
fetch_opencv yolov3_full.o test_helper.o
$(CC)
$(SYSROOT_LINK)
$(CXXFLAGS_LINK)
yolov3_light.o test_helper.o
-o
yolov3_light_static
${LITE_LIGHT_STATIC_LIBS}
$(CXX_LIBS)
$(LDFLAGS)
#####
all
:
classification_full_shared classification_full_static classification_light_shared classification_light_static yolov3_full_shared yolov3_full_static yolov3_light_shared yolov3_light_static
clean
:
rm
-f
*
.o
rm
-f
classification_full_shared
rm
-r
classification_full_static
rm
-r
classification_light_shared
rm
-f
classification_light_static
rm
-f
yolov3_full_shared
rm
-f
yolov3_full_static
rm
-f
yolov3_light_shared
rm
-f
yolov3_light_static
lite/demo/cxx/test_libs/classification_full.cc
0 → 100644
浏览文件 @
b5c77670
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
DEFINE_string
(
model_dir
,
""
,
"the path of the model, the model and param files is under "
"model_dir."
);
DEFINE_string
(
model_filename
,
""
,
"the filename of model file. When the model is combined formate, "
"please set model_file."
);
DEFINE_string
(
param_filename
,
""
,
"the filename of param file, set param_file when the model is "
"combined formate."
);
DEFINE_string
(
img_path
,
""
,
"the path of input image"
);
DEFINE_string
(
img_txt_path
,
""
,
"the path of input image, the image is processed "
" and saved in txt file"
);
DEFINE_double
(
out_max_value
,
0.0
,
"The max value in output tensor"
);
DEFINE_double
(
threshold
,
1e-3
,
"If the max value diff is smaller than threshold, pass test"
);
DEFINE_int32
(
out_max_value_index
,
65
,
"The max value index in output tensor"
);
// Optimize model for ARM CPU.
// If the model is not combined, set model_filename and params_filename as empty
void
OptModel
(
const
std
::
string
&
load_model_dir
,
const
std
::
string
&
model_filename
,
const
std
::
string
&
params_filename
,
const
std
::
string
&
save_model_path
)
{
paddle
::
lite_api
::
CxxConfig
config
;
config
.
set_model_dir
(
load_model_dir
);
if
(
!
model_filename
.
empty
()
&&
!
params_filename
.
empty
())
{
config
.
set_model_file
(
load_model_dir
+
"/"
+
model_filename
);
config
.
set_param_file
(
load_model_dir
+
"/"
+
params_filename
);
}
std
::
vector
<
paddle
::
lite_api
::
Place
>
vaild_places
=
{
paddle
::
lite_api
::
Place
{
TARGET
(
kARM
),
PRECISION
(
kFloat
)},
paddle
::
lite_api
::
Place
{
TARGET
(
kARM
),
PRECISION
(
kInt32
)},
paddle
::
lite_api
::
Place
{
TARGET
(
kARM
),
PRECISION
(
kInt64
)},
};
config
.
set_valid_places
(
vaild_places
);
auto
predictor
=
paddle
::
lite_api
::
CreatePaddlePredictor
(
config
);
std
::
string
cmd_str
=
"rm -rf "
+
save_model_path
;
int
ret
=
system
(
cmd_str
.
c_str
());
if
(
ret
==
0
)
{
std
::
cout
<<
"Delete old optimized model "
<<
save_model_path
<<
std
::
endl
;
}
predictor
->
SaveOptimizedModel
(
save_model_path
,
paddle
::
lite_api
::
LiteModelType
::
kNaiveBuffer
);
std
::
cout
<<
"Load model from "
<<
load_model_dir
<<
std
::
endl
;
std
::
cout
<<
"Save optimized model to "
<<
save_model_path
<<
std
::
endl
;
}
void
Run
(
const
std
::
string
&
model_path
,
const
std
::
string
&
img_path
,
const
std
::
string
&
img_txt_path
,
const
float
out_max_value
,
const
int
out_max_value_index
,
const
float
threshold
,
const
int
height
,
const
int
width
)
{
// set config and create predictor
paddle
::
lite_api
::
MobileConfig
config
;
config
.
set_threads
(
3
);
config
.
set_model_from_file
(
model_path
);
auto
predictor
=
paddle
::
lite_api
::
CreatePaddlePredictor
(
config
);
// set input
auto
input_tensor
=
predictor
->
GetInput
(
0
);
input_tensor
->
Resize
({
1
,
3
,
height
,
width
});
auto
input_data
=
input_tensor
->
mutable_data
<
float
>
();
if
(
img_txt_path
.
size
()
>
0
)
{
std
::
fstream
fs
(
img_txt_path
);
if
(
!
fs
.
is_open
())
{
std
::
cerr
<<
"Fail to open img txt file:"
<<
img_txt_path
<<
std
::
endl
;
}
int
num
=
1
*
3
*
height
*
width
;
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
fs
>>
input_data
[
i
];
}
}
else
{
cv
::
Mat
img
=
imread
(
img_path
,
cv
::
IMREAD_COLOR
);
if
(
!
img
.
data
)
{
std
::
cerr
<<
"Fail to open img:"
<<
img_path
<<
std
::
endl
;
exit
(
1
);
}
float
means
[
3
]
=
{
0.485
f
,
0.456
f
,
0.406
f
};
float
scales
[
3
]
=
{
0.229
f
,
0.224
f
,
0.225
f
};
process_img
(
img
,
width
,
height
,
input_data
,
means
,
scales
);
}
predictor
->
Run
();
auto
out_tensor
=
predictor
->
GetOutput
(
0
);
auto
*
out_data
=
out_tensor
->
data
<
float
>
();
int64_t
output_num
=
ShapeProduction
(
out_tensor
->
shape
());
float
max_value
=
out_data
[
0
];
int
max_index
=
0
;
for
(
int
i
=
0
;
i
<
output_num
;
i
++
)
{
if
(
max_value
<
out_data
[
i
])
{
max_value
=
out_data
[
i
];
max_index
=
i
;
}
}
std
::
cout
<<
"max_value:"
<<
max_value
<<
std
::
endl
;
std
::
cout
<<
"max_index:"
<<
max_index
<<
std
::
endl
;
std
::
cout
<<
"max_value_ground_truth:"
<<
out_max_value
<<
std
::
endl
;
std
::
cout
<<
"max_index_ground_truth:"
<<
out_max_value_index
<<
std
::
endl
;
if
(
max_index
!=
out_max_value_index
||
fabs
(
max_value
-
out_max_value
)
>
threshold
)
{
std
::
cerr
<<
"----------Fail Test.----------
\n\n
"
;
}
else
{
std
::
cout
<<
"----------Pass Test.----------
\n\n
"
;
}
}
int
main
(
int
argc
,
char
**
argv
)
{
// Check inputs
google
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
if
(
FLAGS_model_dir
.
empty
()
||
(
FLAGS_img_path
.
empty
()
&&
FLAGS_img_txt_path
.
empty
()))
{
std
::
cerr
<<
"Input error."
<<
std
::
endl
;
std
::
cerr
<<
"Usage: "
<<
argv
[
0
]
<<
std
::
endl
<<
"--model_dir: the path of not optimized model
\n
"
"--model_filename: the model filename of not optimized model
\n
"
"--param_filename: the param filename of not optimized model
\n
"
"--img_txt_path: the path of input image, the image is processed
\n
"
" and saved in txt file
\n
"
"--img_path: the path of input image
\n
"
"--out_max_value: The max value in output tensor
\n
"
"--threshold: If the max value diff is smaller than threshold,
\n
"
" pass test. Default 1e-3.
\n
"
"--out_max_value_index: The max value index in output tensor
\n
"
;
exit
(
1
);
}
const
int
height
=
224
;
const
int
width
=
224
;
std
::
string
model_dir
=
FLAGS_model_dir
;
if
(
model_dir
.
back
()
==
'/'
)
{
model_dir
.
pop_back
();
}
std
::
string
optimized_model_path
=
model_dir
+
"_opt2"
;
OptModel
(
FLAGS_model_dir
,
FLAGS_model_filename
,
FLAGS_param_filename
,
optimized_model_path
);
std
::
string
run_model_path
=
optimized_model_path
+
".nb"
;
// Run test
Run
(
run_model_path
,
FLAGS_img_path
,
FLAGS_img_txt_path
,
FLAGS_out_max_value
,
FLAGS_out_max_value_index
,
FLAGS_threshold
,
height
,
width
);
return
0
;
}
lite/demo/cxx/test_libs/classification_light.cc
0 → 100644
浏览文件 @
b5c77670
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
// Command-line flags for the light-API classification test binary.
// Path of the already-optimized (naive-buffer, .nb) model.
DEFINE_string(optimized_model_path, "", "the path of optimized model");
// Raw input image; used only when --img_txt_path is empty.
DEFINE_string(img_path, "", "the path of input image");
// Preprocessed image dumped as whitespace-separated floats in a txt file;
// takes precedence over --img_path.
DEFINE_string(img_txt_path,
              "",
              "the path of input image, the image is processed "
              " and saved in txt file");
// Expected top-1 value of the output tensor (ground truth).
DEFINE_double(out_max_value, 0.0, "The max value in output tensor");
// Tolerance when comparing the actual top-1 value with --out_max_value.
DEFINE_double(threshold,
              1e-3,
              "If the max value diff is smaller than threshold, pass test");
// Expected index of the top-1 value (ground truth class id).
DEFINE_int32(out_max_value_index, -1, "The max value index in output tensor");
void
Run
(
const
std
::
string
&
model_path
,
const
std
::
string
&
img_path
,
const
std
::
string
&
img_txt_path
,
const
float
out_max_value
,
const
int
out_max_value_index
,
const
float
threshold
,
const
int
height
,
const
int
width
)
{
// set config and create predictor
paddle
::
lite_api
::
MobileConfig
config
;
config
.
set_threads
(
3
);
config
.
set_model_from_file
(
model_path
);
auto
predictor
=
paddle
::
lite_api
::
CreatePaddlePredictor
(
config
);
// set input
auto
input_tensor
=
predictor
->
GetInput
(
0
);
input_tensor
->
Resize
({
1
,
3
,
height
,
width
});
auto
input_data
=
input_tensor
->
mutable_data
<
float
>
();
if
(
img_txt_path
.
size
()
>
0
)
{
std
::
fstream
fs
(
img_txt_path
);
if
(
!
fs
.
is_open
())
{
std
::
cerr
<<
"Fail to open img txt file:"
<<
img_txt_path
<<
std
::
endl
;
}
int
num
=
1
*
3
*
height
*
width
;
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
fs
>>
input_data
[
i
];
}
}
else
{
cv
::
Mat
img
=
imread
(
img_path
,
cv
::
IMREAD_COLOR
);
if
(
!
img
.
data
)
{
std
::
cerr
<<
"Fail to open img:"
<<
img_path
<<
std
::
endl
;
exit
(
1
);
}
float
means
[
3
]
=
{
0.485
f
,
0.456
f
,
0.406
f
};
float
scales
[
3
]
=
{
0.229
f
,
0.224
f
,
0.225
f
};
process_img
(
img
,
width
,
height
,
input_data
,
means
,
scales
);
}
predictor
->
Run
();
auto
out_tensor
=
predictor
->
GetOutput
(
0
);
auto
*
out_data
=
out_tensor
->
data
<
float
>
();
int64_t
output_num
=
ShapeProduction
(
out_tensor
->
shape
());
float
max_value
=
out_data
[
0
];
int
max_index
=
0
;
for
(
int
i
=
0
;
i
<
output_num
;
i
++
)
{
if
(
max_value
<
out_data
[
i
])
{
max_value
=
out_data
[
i
];
max_index
=
i
;
}
}
std
::
cout
<<
"max_value:"
<<
max_value
<<
std
::
endl
;
std
::
cout
<<
"max_index:"
<<
max_index
<<
std
::
endl
;
std
::
cout
<<
"max_value_ground_truth:"
<<
out_max_value
<<
std
::
endl
;
std
::
cout
<<
"max_index_ground_truth:"
<<
out_max_value_index
<<
std
::
endl
;
if
(
max_index
!=
out_max_value_index
||
fabs
(
max_value
-
out_max_value
)
>
threshold
)
{
std
::
cerr
<<
"----------Fail Test----------
\n\n
"
;
}
else
{
std
::
cout
<<
"----------Pass Test----------
\n\n
"
;
}
}
// Entry point: validates flags, then runs the classification smoke test
// with the fixed 224x224 input the bundled models expect.
int main(int argc, char** argv) {
  // Check inputs
  google::ParseCommandLineFlags(&argc, &argv, true);
  const bool model_missing = FLAGS_optimized_model_path.empty();
  const bool input_missing =
      FLAGS_img_path.empty() && FLAGS_img_txt_path.empty();
  if (model_missing || input_missing) {
    std::cerr << "Input error." << std::endl;
    std::cerr
        << "Usage: " << argv[0] << std::endl
        << "--optimized_model_path: the path of optimized model\n"
           "--img_txt_path: the path of input image, the image is processed\n"
           " and saved in txt file\n"
           "--img_path: the path of input image\n"
           "--out_max_value: The max value in output tensor\n"
           "--threshold: If the max value diff is smaller than threshold,\n"
           " pass test. Default 1e-3.\n"
           "--out_max_value_index: The max value index in output tensor\n";
    exit(1);
  }

  // Classification models in this suite take a fixed 224x224 input.
  const int input_height = 224;
  const int input_width = 224;

  // Run test
  Run(FLAGS_optimized_model_path,
      FLAGS_img_path,
      FLAGS_img_txt_path,
      FLAGS_out_max_value,
      FLAGS_out_max_value_index,
      FLAGS_threshold,
      input_height,
      input_width);
  return 0;
}
lite/demo/cxx/test_libs/prepare.sh
0 → 100644
浏览文件 @
b5c77670
#!/bin/bash
# Build all test binaries and assemble a self-contained test package
# (binaries + run.sh + Paddle-Lite cxx libs + models/images) under
# test_lite_lib_files/ ready to push to a device.
set -e

gf=test_lite_lib_files

# Rebuild every test binary from scratch.
make clean
make all -j

# Recreate the package directory.
if [ -d "${gf}" ]; then
  rm -rf "${gf}"
fi
mkdir "${gf}"

# Collect the eight binaries (static/shared x light/full API).
for exe in classification_full_shared classification_full_static \
           classification_light_shared classification_light_static \
           yolov3_full_shared yolov3_full_static \
           yolov3_light_shared yolov3_light_static; do
  mv "${exe}" "${gf}"
done

cp run.sh "${gf}"
make clean

# Bundle the prebuilt Paddle-Lite cxx distribution as ${gf}/lite.
cp -r ../../../cxx/ "${gf}"
mv "${gf}/cxx" "${gf}/lite"

# Download the models/images archive only if it is not cached locally.
if [ ! -f "test_libs_models_imgs.tgz" ]; then
  wget https://paddle-inference-dist.cdn.bcebos.com/PaddleLite/test_libs_models_imgs.tgz
fi
tar zxvf test_libs_models_imgs.tgz
mv test_libs_models_imgs "${gf}/models_imgs"
lite/demo/cxx/test_libs/run.sh
0 → 100644
浏览文件 @
b5c77670
#!/bin/bash
# Smoke-test every demo binary (static/shared x light/full API) against the
# bundled models and preprocessed images. Each binary prints its own
# Pass/Fail verdict.
export LD_LIBRARY_PATH=$PWD/lite/lib/:${LD_LIBRARY_PATH}

# Run the four classification binaries for one model.
#   $1: model dir (full-API binaries optimize it first)
#   $2: optimized .nb model (light-API binaries)
#   $3: expected max value    $4: expected max value index
run_classification() {
    ./classification_light_shared \
        --optimized_model_path=$2 \
        --img_txt_path=models_imgs/images/classification.jpg.txt \
        --out_max_value=$3 \
        --out_max_value_index=$4
    ./classification_light_static \
        --optimized_model_path=$2 \
        --img_txt_path=models_imgs/images/classification.jpg.txt \
        --out_max_value=$3 \
        --out_max_value_index=$4
    ./classification_full_static \
        --model_dir=$1 \
        --img_txt_path=models_imgs/images/classification.jpg.txt \
        --out_max_value=$3 \
        --out_max_value_index=$4
    ./classification_full_shared \
        --model_dir=$1 \
        --img_txt_path=models_imgs/images/classification.jpg.txt \
        --out_max_value=$3 \
        --out_max_value_index=$4
}

# mobilenetv1
run_classification models_imgs/models/mobilenetv1 \
                   models_imgs/models/mobilenetv1.nb 0.936887 65

# mobilenetv2
run_classification models_imgs/models/mobilenetv2 \
                   models_imgs/models/mobilenetv2.nb 0.868888 65

# yolov3
yolo_gt=0,0.153605,174.494,199.729,562.075,604.014
./yolov3_light_shared \
    --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=${yolo_gt}
./yolov3_light_static \
    --optimized_model_path=models_imgs/models/yolov3_mobilenetv1.nb \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=${yolo_gt}
./yolov3_full_static \
    --model_dir=models_imgs/models/yolov3_mobilenetv1 \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=${yolo_gt}
./yolov3_full_shared \
    --model_dir=models_imgs/models/yolov3_mobilenetv1 \
    --img_txt_path=models_imgs/images/yolov3.jpg.txt \
    --out_values=${yolo_gt}
lite/demo/cxx/test_libs/test_helper.cc
0 → 100644
浏览文件 @
b5c77670
// Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "test_helper.h"  // NOLINT

#include <sys/time.h>
#include <time.h>

#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <fstream>
#include <iomanip>
#include <iostream>
#include <numeric>
#include <string>
#include <vector>

#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
// Returns the current wall-clock time in microseconds since the epoch.
double GetCurrentUS() {
  struct timeval now;
  gettimeofday(&now, NULL);
  return static_cast<double>(now.tv_sec) * 1e+6 + now.tv_usec;
}
// Returns the number of elements implied by a tensor shape — the product
// of all dimensions. An empty shape yields 1 (a scalar).
int64_t ShapeProduction(const std::vector<int64_t>& shape) {
  return std::accumulate(shape.begin(),
                         shape.end(),
                         static_cast<int64_t>(1),
                         [](int64_t acc, int64_t dim) { return acc * dim; });
}
// Parses a comma-separated list of integers, e.g. "1,2,3" -> {1, 2, 3}.
// An empty input yields an empty vector; an empty field parses as 0.
std::vector<int64_t> GetIntNumsFromStr(const std::string& str) {
  std::vector<int64_t> nums;
  std::string tmp_str = str;
  while (!tmp_str.empty()) {
    // Fix: atoll instead of atoi — the result is stored as int64_t, and
    // atoi silently truncated values that do not fit in 32 bits.
    int64_t num = atoll(tmp_str.data());
    nums.push_back(num);
    size_t next_offset = tmp_str.find(",");
    if (next_offset == std::string::npos) {
      break;
    } else {
      tmp_str = tmp_str.substr(next_offset + 1);
    }
  }
  return nums;
}
// Parses a comma-separated list of floating point numbers,
// e.g. "0,0.5,1.5" -> {0.0, 0.5, 1.5}. An empty input yields an empty
// vector; an empty field parses as 0.0 (atof semantics).
std::vector<double> GetDoubleNumsFromStr(const std::string& str) {
  std::vector<double> nums;
  std::string remaining = str;
  while (!remaining.empty()) {
    nums.push_back(atof(remaining.c_str()));
    const size_t comma = remaining.find(',');
    if (comma == std::string::npos) {
      break;
    }
    remaining = remaining.substr(comma + 1);
  }
  return nums;
}
// fill tensor with mean and scale and trans layout: nhwc -> nchw, neon speed up
// din:   interleaved (HWC) float input with 3 channels.
// dout:  planar (CHW) float output; must hold 3 * size floats.
// size:  number of pixels per channel (height * width).
// mean:  per-channel mean to subtract (3 values).
// scale: per-channel scale to divide by (3 values).
void neon_mean_scale(
    const float* din, float* dout, int size, float* mean, float* scale) {
  // Broadcast per-channel means and reciprocal scales into NEON registers.
  float32x4_t vmean0 = vdupq_n_f32(mean[0]);
  float32x4_t vmean1 = vdupq_n_f32(mean[1]);
  float32x4_t vmean2 = vdupq_n_f32(mean[2]);
  float32x4_t vscale0 = vdupq_n_f32(1.f / scale[0]);
  float32x4_t vscale1 = vdupq_n_f32(1.f / scale[1]);
  float32x4_t vscale2 = vdupq_n_f32(1.f / scale[2]);

  // One output plane per channel.
  float* dout_c0 = dout;
  float* dout_c1 = dout + size;
  float* dout_c2 = dout + size * 2;

  int i = 0;
  // Vectorized loop: de-interleave 4 pixels at a time and normalize
  // each channel.
  for (; i < size - 3; i += 4) {
    float32x4x3_t vin3 = vld3q_f32(din);
    float32x4_t vsub0 = vsubq_f32(vin3.val[0], vmean0);
    float32x4_t vsub1 = vsubq_f32(vin3.val[1], vmean1);
    float32x4_t vsub2 = vsubq_f32(vin3.val[2], vmean2);
    float32x4_t vs0 = vmulq_f32(vsub0, vscale0);
    float32x4_t vs1 = vmulq_f32(vsub1, vscale1);
    float32x4_t vs2 = vmulq_f32(vsub2, vscale2);
    vst1q_f32(dout_c0, vs0);
    vst1q_f32(dout_c1, vs1);
    vst1q_f32(dout_c2, vs2);
    din += 12;
    dout_c0 += 4;
    dout_c1 += 4;
    dout_c2 += 4;
  }
  // Scalar tail for the remaining (size % 4) pixels.
  // Fix: the original wrote all three channels through dout_c0, corrupting
  // channel 0's plane and leaving the tails of channels 1 and 2 unwritten.
  // This was latent because 224x224 and 608x608 inputs are multiples of 4.
  for (; i < size; i++) {
    *(dout_c0++) = (*(din++) - mean[0]) / scale[0];
    *(dout_c1++) = (*(din++) - mean[1]) / scale[1];
    *(dout_c2++) = (*(din++) - mean[2]) / scale[2];
  }
}
// Preprocess an OpenCV BGR image for inference: convert to RGB, resize to
// (width, height), rescale pixel values to [0, 1], then normalize with the
// given per-channel means/scales while converting HWC -> CHW into dest_data.
void process_img(const cv::Mat& img,
                 int width,
                 int height,
                 float* dest_data,
                 float* means,
                 float* scales) {
  // OpenCV decodes images as BGR; the models expect RGB.
  cv::Mat img_rgb;
  cv::cvtColor(img, img_rgb, cv::COLOR_BGR2RGB);
  cv::resize(img_rgb, img_rgb, cv::Size(width, height), 0.f, 0.f);
  // Convert 8-bit [0, 255] pixels to float [0, 1].
  cv::Mat img_float;
  img_rgb.convertTo(img_float, CV_32FC3, 1 / 255.f);
  const float* raw_pixels = reinterpret_cast<const float*>(img_float.data);
  neon_mean_scale(raw_pixels, dest_data, width * height, means, scales);
}
lite/demo/cxx/test_libs/test_helper.h
0 → 100644
浏览文件 @
b5c77670
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <string>
#include <vector>
#include "opencv2/core.hpp"
#include "opencv2/imgcodecs.hpp"
#include "opencv2/imgproc.hpp"
// Returns the current wall-clock time in microseconds.
double GetCurrentUS();

// Returns the number of elements implied by a tensor shape
// (product of all dimensions).
int64_t ShapeProduction(const std::vector<int64_t>& shape);

// Parses a comma-separated list of integers, e.g. "1,2,3" -> {1, 2, 3}.
std::vector<int64_t> GetIntNumsFromStr(const std::string& str);

// Parses a comma-separated list of floating point numbers.
std::vector<double> GetDoubleNumsFromStr(const std::string& str);

// Subtracts per-channel mean and divides by per-channel scale while
// converting the layout from interleaved (HWC) to planar (CHW);
// NEON accelerated. `size` is the pixel count per channel.
void neon_mean_scale(
    const float* din, float* dout, int size, float* mean, float* scale);

// Preprocesses an OpenCV image (BGR->RGB, resize to width x height,
// scale to [0,1], normalize) and writes the CHW result to dst_data.
void process_img(const cv::Mat& img,
                 int width,
                 int height,
                 float* dst_data,
                 float* means,
                 float* scales);
lite/demo/cxx/test_libs/yolov3_full.cc
0 → 100644
浏览文件 @
b5c77670
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
// Command-line flags for the full-API yolov3 test binary.
// Directory holding the non-optimized inference model.
DEFINE_string(model_dir,
              "",
              "the path of the model, the model and param files is under "
              "model_dir.");
// Model/param filenames — set both only for combined-format models.
DEFINE_string(model_filename,
              "",
              "the filename of model file. When the model is combined formate, "
              "please set model_file.");
DEFINE_string(param_filename,
              "",
              "the filename of param file, set param_file when the model is "
              "combined formate.");
// Raw input image; used only when --img_txt_path is empty.
DEFINE_string(img_path, "", "the path of input image");
// Preprocessed image dumped as floats in a txt file; takes precedence.
DEFINE_string(img_txt_path,
              "",
              "the path of input image, the image is processed "
              " and saved in txt file");
// Expected leading output values (comma-separated ground truth).
DEFINE_string(out_values, "", "The output values, separated by colon and comma");
// Per-value tolerance when comparing against --out_values.
DEFINE_double(threshold,
              1e-3,
              "If the output value diff is smaller than threshold, pass test");
// Loads a Paddle inference model from `load_model_dir` (optionally a
// combined model/param file pair), builds a full-API predictor that
// optimizes it for ARM, and saves the result as a naive-buffer model at
// `save_model_path` (the API appends the ".nb" suffix).
void OptModel(const std::string& load_model_dir,
              const std::string& model_filename,
              const std::string& params_filename,
              const std::string& save_model_path) {
  paddle::lite_api::CxxConfig config;
  config.set_model_dir(load_model_dir);
  // Combined-format models keep all weights in a single param file.
  if (!model_filename.empty() && !params_filename.empty()) {
    config.set_model_file(load_model_dir + "/" + model_filename);
    config.set_param_file(load_model_dir + "/" + params_filename);
  }
  const std::vector<paddle::lite_api::Place> valid_places{
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kFloat)},
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt32)},
      paddle::lite_api::Place{TARGET(kARM), PRECISION(kInt64)},
  };
  config.set_valid_places(valid_places);

  auto predictor = paddle::lite_api::CreatePaddlePredictor(config);

  // Remove any stale optimized model first.
  // NOTE(security): save_model_path is interpolated into a shell command;
  // only pass trusted paths to this helper.
  const std::string cmd_str = "rm -rf " + save_model_path;
  if (system(cmd_str.c_str()) == 0) {
    std::cout << "Delete old optimized model " << save_model_path << std::endl;
  }

  predictor->SaveOptimizedModel(
      save_model_path, paddle::lite_api::LiteModelType::kNaiveBuffer);
  std::cout << "Load model from " << load_model_dir << std::endl;
  std::cout << "Save optimized model to " << save_model_path << std::endl;
}
void
Run
(
const
std
::
string
&
model_path
,
const
std
::
string
&
img_path
,
const
std
::
string
&
img_txt_path
,
const
std
::
vector
<
double
>&
out_values
,
const
float
threshold
,
const
int
height
,
const
int
width
)
{
// set config and create predictor
paddle
::
lite_api
::
MobileConfig
config
;
config
.
set_threads
(
3
);
config
.
set_model_from_file
(
model_path
);
auto
predictor
=
paddle
::
lite_api
::
CreatePaddlePredictor
(
config
);
// set input
auto
input_tensor
=
predictor
->
GetInput
(
0
);
input_tensor
->
Resize
({
1
,
3
,
height
,
width
});
auto
input_data
=
input_tensor
->
mutable_data
<
float
>
();
if
(
img_txt_path
.
size
()
>
0
)
{
std
::
fstream
fs
(
img_txt_path
);
if
(
!
fs
.
is_open
())
{
std
::
cerr
<<
"Fail to open img txt file:"
<<
img_txt_path
<<
std
::
endl
;
}
int
num
=
1
*
3
*
height
*
width
;
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
fs
>>
input_data
[
i
];
}
}
else
{
cv
::
Mat
img
=
imread
(
img_path
,
cv
::
IMREAD_COLOR
);
if
(
!
img
.
data
)
{
std
::
cerr
<<
"Fail to open img:"
<<
img_path
<<
std
::
endl
;
exit
(
1
);
}
float
means
[
3
]
=
{
0.485
f
,
0.456
f
,
0.406
f
};
float
scales
[
3
]
=
{
0.229
f
,
0.224
f
,
0.225
f
};
process_img
(
img
,
width
,
height
,
input_data
,
means
,
scales
);
}
auto
shape_tensor
=
predictor
->
GetInput
(
1
);
shape_tensor
->
Resize
({
1
,
2
});
auto
*
shape_data
=
shape_tensor
->
mutable_data
<
int
>
();
shape_data
[
0
]
=
height
;
shape_data
[
1
]
=
width
;
predictor
->
Run
();
auto
out_tensor
=
predictor
->
GetOutput
(
0
);
auto
*
out_data
=
out_tensor
->
data
<
float
>
();
int64_t
output_num
=
ShapeProduction
(
out_tensor
->
shape
());
bool
is_pass
=
true
;
for
(
int
i
=
0
;
i
<
output_num
&&
i
<
out_values
.
size
();
i
++
)
{
std
::
cout
<<
"id:"
<<
i
<<
" out_data:"
<<
out_data
[
i
]
<<
" gt_data:"
<<
out_values
[
i
]
<<
std
::
endl
;
if
(
fabs
(
out_data
[
i
]
-
out_values
[
i
])
>
threshold
)
{
is_pass
=
false
;
}
}
if
(
is_pass
)
{
std
::
cout
<<
"----------Pass test----------
\n\n
"
;
}
else
{
std
::
cout
<<
"----------Fail test----------
\n\n
"
;
}
}
int
main
(
int
argc
,
char
**
argv
)
{
// Check inputs
google
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
if
(
FLAGS_model_dir
.
empty
()
||
(
FLAGS_img_path
.
empty
()
&&
FLAGS_img_txt_path
.
empty
()))
{
std
::
cerr
<<
"Input error."
<<
std
::
endl
;
std
::
cerr
<<
"Usage: "
<<
argv
[
0
]
<<
std
::
endl
<<
"--model_dir: the path of not optimized model
\n
"
"--model_filename: the model filename of not optimized model
\n
"
"--param_filename: the param filename of not optimized model
\n
"
"--img_txt_path: the path of input image, the image is processed
\n
"
" and saved in txt file
\n
"
"--img_path: the path of input image
\n
"
"--out_values: The output values, separated by colon and comma.
\n
"
"--threshold: If the out value diff is smaller than threshold,
\n
"
" pass test. Default 1e-3.
\n
"
;
exit
(
1
);
}
const
int
height
=
608
;
const
int
width
=
608
;
std
::
vector
<
double
>
out_values
=
GetDoubleNumsFromStr
(
FLAGS_out_values
);
std
::
string
model_dir
=
FLAGS_model_dir
;
if
(
model_dir
.
back
()
==
'/'
)
{
model_dir
.
pop_back
();
}
std
::
string
optimized_model_path
=
model_dir
+
"_opt2"
;
OptModel
(
FLAGS_model_dir
,
FLAGS_model_filename
,
FLAGS_param_filename
,
optimized_model_path
);
std
::
string
run_model_path
=
optimized_model_path
+
".nb"
;
// Run test
Run
(
run_model_path
,
FLAGS_img_path
,
FLAGS_img_txt_path
,
out_values
,
FLAGS_threshold
,
height
,
width
);
return
0
;
}
lite/demo/cxx/test_libs/yolov3_light.cc
0 → 100644
浏览文件 @
b5c77670
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <gflags/gflags.h>
#include <fstream>
#include <iostream>
#include "paddle_api.h" // NOLINT
#include "test_helper.h" // NOLINT
// Command-line flags for the light-API yolov3 test binary.
// Path of the already-optimized (naive-buffer, .nb) model.
DEFINE_string(optimized_model_path, "", "the path of the optimized model");
// Raw input image; used only when --img_txt_path is empty.
DEFINE_string(img_path, "", "the path of input image");
// Preprocessed image dumped as floats in a txt file; takes precedence.
DEFINE_string(img_txt_path,
              "",
              "the path of input image, the image is processed "
              " and saved in txt file");
// Expected leading output values (comma-separated ground truth).
DEFINE_string(out_values, "", "The output values, separated by colon and comma");
// Per-value tolerance when comparing against --out_values.
DEFINE_double(threshold,
              1e-3,
              "If the output value diff is smaller than threshold, pass test");
void
Run
(
const
std
::
string
&
model_path
,
const
std
::
string
&
img_path
,
const
std
::
string
&
img_txt_path
,
const
std
::
vector
<
double
>&
out_values
,
const
float
threshold
,
const
int
height
,
const
int
width
)
{
// set config and create predictor
paddle
::
lite_api
::
MobileConfig
config
;
config
.
set_threads
(
3
);
config
.
set_model_from_file
(
model_path
);
auto
predictor
=
paddle
::
lite_api
::
CreatePaddlePredictor
(
config
);
// set input
auto
input_tensor
=
predictor
->
GetInput
(
0
);
input_tensor
->
Resize
({
1
,
3
,
height
,
width
});
auto
input_data
=
input_tensor
->
mutable_data
<
float
>
();
if
(
img_txt_path
.
size
()
>
0
)
{
std
::
fstream
fs
(
img_txt_path
);
if
(
!
fs
.
is_open
())
{
std
::
cerr
<<
"Fail to open img txt file:"
<<
img_txt_path
<<
std
::
endl
;
}
int
num
=
1
*
3
*
height
*
width
;
for
(
int
i
=
0
;
i
<
num
;
i
++
)
{
fs
>>
input_data
[
i
];
}
}
else
{
cv
::
Mat
img
=
imread
(
img_path
,
cv
::
IMREAD_COLOR
);
if
(
!
img
.
data
)
{
std
::
cerr
<<
"Fail to open img:"
<<
img_path
<<
std
::
endl
;
exit
(
1
);
}
float
means
[
3
]
=
{
0.485
f
,
0.456
f
,
0.406
f
};
float
scales
[
3
]
=
{
0.229
f
,
0.224
f
,
0.225
f
};
process_img
(
img
,
width
,
height
,
input_data
,
means
,
scales
);
}
auto
shape_tensor
=
predictor
->
GetInput
(
1
);
shape_tensor
->
Resize
({
1
,
2
});
auto
*
shape_data
=
shape_tensor
->
mutable_data
<
int
>
();
shape_data
[
0
]
=
height
;
shape_data
[
1
]
=
width
;
predictor
->
Run
();
auto
out_tensor
=
predictor
->
GetOutput
(
0
);
auto
*
out_data
=
out_tensor
->
data
<
float
>
();
int64_t
output_num
=
ShapeProduction
(
out_tensor
->
shape
());
bool
is_pass
=
true
;
for
(
int
i
=
0
;
i
<
output_num
&&
i
<
out_values
.
size
();
i
++
)
{
std
::
cout
<<
"id:"
<<
i
<<
" out_data:"
<<
out_data
[
i
]
<<
" gt_data:"
<<
out_values
[
i
]
<<
std
::
endl
;
if
(
fabs
(
out_data
[
i
]
-
out_values
[
i
])
>
threshold
)
{
is_pass
=
false
;
}
}
if
(
is_pass
)
{
std
::
cout
<<
"----------Pass test----------
\n\n
"
;
}
else
{
std
::
cout
<<
"----------Fail test----------
\n\n
"
;
}
}
int
main
(
int
argc
,
char
**
argv
)
{
// Check inputs
google
::
ParseCommandLineFlags
(
&
argc
,
&
argv
,
true
);
if
(
FLAGS_optimized_model_path
.
empty
()
||
(
FLAGS_img_path
.
empty
()
&&
FLAGS_img_txt_path
.
empty
()))
{
std
::
cerr
<<
"Input error."
<<
std
::
endl
;
std
::
cerr
<<
"Usage: "
<<
argv
[
0
]
<<
std
::
endl
<<
"--optimized_model_path: the path of optimized model
\n
"
"--img_txt_path: the path of input image, the image is processed
\n
"
" and saved in txt file
\n
"
"--img_path: the path of input image
\n
"
"--out_values: The output values, separated by colon and comma.
\n
"
"--threshold: If the out value diff is smaller than threshold,
\n
"
" pass test. Default 1e-3.
\n
"
;
exit
(
1
);
}
const
int
height
=
608
;
const
int
width
=
608
;
std
::
vector
<
double
>
out_values
=
GetDoubleNumsFromStr
(
FLAGS_out_values
);
// Run test
Run
(
FLAGS_optimized_model_path
,
FLAGS_img_path
,
FLAGS_img_txt_path
,
out_values
,
FLAGS_threshold
,
height
,
width
);
return
0
;
}
lite/kernels/bm/bridges/CMakeLists.txt
浏览文件 @
b5c77670
...
...
@@ -36,6 +36,7 @@ lite_cc_library(subgraph_bridge_shape_op_bm SRCS shape_op.cc DEPS ${bm_subgraph_
lite_cc_library
(
subgraph_bridge_split_op_bm SRCS split_op.cc DEPS
${
bm_subgraph_bridge_deps
}
)
lite_cc_library
(
subgraph_bridge_matmul_op_bm SRCS matmul_op.cc DEPS
${
bm_subgraph_bridge_deps
}
)
set
(
bm_subgraph_bridges
subgraph_bridge_registry
subgraph_bridge_engine
...
...
lite/kernels/bm/bridges/act_op.cc
浏览文件 @
b5c77670
...
...
@@ -54,6 +54,8 @@ int ActConverter(void* ctx, OpLite* op, KernelBase* kernel) {
active_type_id
=
ACTIVE_SQRT
;
}
else
if
(
op_type
==
"square"
)
{
active_type_id
=
ACTIVE_SQUARE
;
}
else
if
(
op_type
==
"sigmoid"
)
{
active_type_id
=
ACTIVE_SIGMOID
;
}
else
{
LOG
(
FATAL
)
<<
"[BM] unsupport act type"
;
return
FAILED
;
...
...
@@ -102,3 +104,6 @@ REGISTER_SUBGRAPH_BRIDGE(leaky_relu,
paddle
::
lite
::
subgraph
::
bm
::
ActConverter
);
REGISTER_SUBGRAPH_BRIDGE
(
sqrt
,
kBM
,
paddle
::
lite
::
subgraph
::
bm
::
ActConverter
);
REGISTER_SUBGRAPH_BRIDGE
(
square
,
kBM
,
paddle
::
lite
::
subgraph
::
bm
::
ActConverter
);
REGISTER_SUBGRAPH_BRIDGE
(
sigmoid
,
kBM
,
paddle
::
lite
::
subgraph
::
bm
::
ActConverter
);
lite/kernels/bm/bridges/graph.cc
浏览文件 @
b5c77670
...
...
@@ -20,11 +20,14 @@ namespace lite {
namespace
subgraph
{
namespace
bm
{
pthread_mutex_t
Graph
::
mutex_compiler_
=
PTHREAD_MUTEX_INITIALIZER
;
void
Graph
::
AddNode
(
const
std
::
string
&
name
)
{
nodes_
.
insert
(
std
::
make_pair
(
name
,
name
));
}
void
Graph
::
CreateCompilerHandle
()
{
pthread_mutex_lock
(
&
mutex_compiler_
);
#ifdef BM1682
compiler_handle_
=
create_bmcompiler
(
"BM1682"
);
#else
...
...
@@ -33,6 +36,8 @@ void Graph::CreateCompilerHandle() {
CHECK
(
compiler_handle_
!=
nullptr
);
}
void
Graph
::
UnlockCompilerMutex
()
{
pthread_mutex_unlock
(
&
mutex_compiler_
);
}
}
// namespace bm
}
// namespace subgraph
}
// namespace lite
...
...
lite/kernels/bm/bridges/graph.h
浏览文件 @
b5c77670
...
...
@@ -14,6 +14,7 @@
#pragma once
#include <pthread.h>
#include <memory>
#include <string>
#include <unordered_map>
...
...
@@ -36,10 +37,12 @@ class Graph {
}
void
CreateCompilerHandle
();
void
*
GetCompilerHandle
()
{
return
compiler_handle_
;
}
void
UnlockCompilerMutex
();
private:
std
::
unordered_map
<
std
::
string
,
std
::
string
>
nodes_
;
void
*
compiler_handle_
;
static
pthread_mutex_t
mutex_compiler_
;
};
}
// namespace bm
...
...
lite/kernels/bm/bridges/paddle_use_bridges.h
浏览文件 @
b5c77670
...
...
@@ -58,3 +58,5 @@ USE_SUBGRAPH_BRIDGE(depthwise_conv2d_transpose, kBM);
USE_SUBGRAPH_BRIDGE
(
shape
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
split
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
matmul
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
max_pool2d_with_index
,
kBM
);
USE_SUBGRAPH_BRIDGE
(
sigmoid
,
kBM
);
lite/kernels/bm/bridges/pool_op.cc
浏览文件 @
b5c77670
...
...
@@ -11,7 +11,10 @@
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include <bmcompiler_defs.h>
#include <bmcompiler_if.h>
#include <bmcompiler_if_lite.h>
#include <user_bmcpu_common.h>
#include "lite/kernels/bm/bridges/graph.h"
#include "lite/kernels/bm/bridges/utility.h"
#include "lite/kernels/npu/bridges/registry.h"
...
...
@@ -54,46 +57,84 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
shape
[
0
]
=
&
i_output_shape_data
[
0
];
name
[
0
]
=
static_cast
<
const
char
*>
(
output_var_name
.
c_str
());
dim
[
0
]
=
output_dims
.
size
();
auto
pooling_type
=
op_info
->
GetAttr
<
std
::
string
>
(
"pooling_type"
);
std
::
string
pooling_type
;
if
(
op_info
->
HasAttr
(
"pooling_type"
))
{
pooling_type
=
op_info
->
GetAttr
<
std
::
string
>
(
"pooling_type"
);
}
else
if
(
op_type
==
"max_pool2d_with_index"
)
{
pooling_type
=
"max"
;
}
CHECK
(
pooling_type
==
"max"
||
pooling_type
==
"avg"
);
auto
ksize
=
op_info
->
GetAttr
<
std
::
vector
<
int
>>
(
"ksize"
);
auto
paddings
=
op_info
->
GetAttr
<
std
::
vector
<
int
>>
(
"paddings"
);
auto
strides
=
op_info
->
GetAttr
<
std
::
vector
<
int
>>
(
"strides"
);
auto
global_pooling
=
op_info
->
GetAttr
<
bool
>
(
"global_pooling"
);
auto
ceil_mode
=
op_info
->
GetAttr
<
bool
>
(
"ceil_mode"
);
bool
ceil_mode
=
false
;
if
(
op_info
->
HasAttr
(
"ceil_mode"
))
{
ceil_mode
=
op_info
->
GetAttr
<
bool
>
(
"ceil_mode"
);
}
bool
adaptive
=
false
;
if
(
op_info
->
HasAttr
(
"adaptive"
))
{
adaptive
=
op_info
->
GetAttr
<
bool
>
(
"adaptive"
);
}
bool
average_exclusive
=
false
;
if
(
pooling_type
==
"avg"
)
{
average_exclusive
=
op_info
->
GetAttr
<
bool
>
(
"exclusive"
);
}
if
(
output_dims
[
2
]
==
1
&&
output_dims
[
3
]
==
1
)
{
global_pooling
=
true
;
}
if
(
global_pooling
)
{
paddings
[
0
]
=
0
;
paddings
[
1
]
=
0
;
ksize
[
0
]
=
i_x_shape_data
[
2
];
ksize
[
1
]
=
i_x_shape_data
[
3
];
}
add_pooling_layer
(
graph
->
GetCompilerHandle
(),
const_cast
<
const
int
*>
(
&
i_x_shape_data
[
0
]),
x_dims
.
size
(),
static_cast
<
const
char
*>
(
x_var_name
.
c_str
()),
1
,
shape
,
dim
,
name
,
ksize
[
0
],
ksize
[
1
],
paddings
[
0
],
paddings
[
0
],
paddings
[
1
],
paddings
[
1
],
strides
[
0
],
strides
[
1
],
(
ksize
[
0
]
>
1
&&
ksize
[
1
]
>
1
)
&&
pooling_type
==
"max"
?
0
:
1
,
static_cast
<
int
>
(
average_exclusive
),
static_cast
<
int
>
(
global_pooling
),
static_cast
<
int
>
(
ceil_mode
),
static_cast
<
const
char
*>
(
unique_op_name
.
c_str
()),
nullptr
);
bool
is_max
=
(
pooling_type
==
"max"
);
if
(
adaptive
&&
!
global_pooling
)
{
user_cpu_param_t
bm_param
;
bm_param
.
op_type
=
USER_PADDLE_ADAPTIVE_POOL
;
bm_param
.
u
.
adaptive_pool_parm
.
is_avg
=
!
is_max
;
int32_t
*
in_shape
[
1
];
int32_t
in_dim
[
1
];
const
char
*
in_name
[
1
];
in_shape
[
0
]
=
&
i_x_shape_data
[
0
];
in_name
[
0
]
=
static_cast
<
const
char
*>
(
x_var_name
.
c_str
());
in_dim
[
0
]
=
x_dims
.
size
();
add_user_cpu_layer
(
graph
->
GetCompilerHandle
(),
1
,
in_shape
,
in_dim
,
in_name
,
1
,
shape
,
dim
,
name
,
&
bm_param
,
static_cast
<
int
>
(
sizeof
(
bm_param
)));
}
else
{
add_pooling_layer
(
graph
->
GetCompilerHandle
(),
const_cast
<
const
int
*>
(
&
i_x_shape_data
[
0
]),
x_dims
.
size
(),
static_cast
<
const
char
*>
(
x_var_name
.
c_str
()),
1
,
shape
,
dim
,
name
,
ksize
[
0
],
ksize
[
1
],
paddings
[
0
],
paddings
[
0
],
paddings
[
1
],
paddings
[
1
],
strides
[
0
],
strides
[
1
],
is_max
?
0
:
1
,
static_cast
<
int
>
(
average_exclusive
),
static_cast
<
int
>
(
global_pooling
),
static_cast
<
int
>
(
ceil_mode
),
static_cast
<
const
char
*>
(
unique_op_name
.
c_str
()),
nullptr
);
}
graph
->
AddNode
(
output_var_name
);
return
SUCCESS
;
}
...
...
@@ -105,3 +146,6 @@ int PoolConverter(void* ctx, OpLite* op, KernelBase* kernel) {
REGISTER_SUBGRAPH_BRIDGE
(
pool2d
,
kBM
,
paddle
::
lite
::
subgraph
::
bm
::
PoolConverter
);
REGISTER_SUBGRAPH_BRIDGE
(
max_pool2d_with_index
,
kBM
,
paddle
::
lite
::
subgraph
::
bm
::
PoolConverter
);
lite/kernels/bm/subgraph_compute.cc
浏览文件 @
b5c77670
...
...
@@ -40,6 +40,7 @@ int SubgraphEngine::BuildDeviceProgram() {
op
->
CheckShape
();
op
->
InferShape
();
std
::
string
op_type
=
op
->
op_info
()
->
Type
();
LOG
(
INFO
)
<<
op_type
;
if
(
!
bridges
.
Exists
(
op_type
,
TARGET
(
kBM
)))
{
return
subgraph
::
FAILED
;
}
...
...
@@ -59,6 +60,7 @@ int SubgraphEngine::BuildDeviceProgram() {
unsigned
int
data_size
=
0
;
bm_hd_
=
static_cast
<
bm_handle_t
>
(
ctx
.
GetHandle
());
finish_bmcompiler_data
(
graph
.
GetCompilerHandle
(),
&
bmodel_data
,
&
data_size
);
graph
.
UnlockCompilerMutex
();
bmrt_hd_
=
bmrt_create
(
bm_hd_
);
if
(
false
==
bmrt_load_bmodel_data
(
bmrt_hd_
,
bmodel_data
,
data_size
))
{
return
subgraph
::
FAILED
;
...
...
lite/kernels/x86/fc_compute.h
浏览文件 @
b5c77670
...
...
@@ -82,7 +82,7 @@ class FCFunctor {
memcpy
(
X1_data
+
i
*
KK
,
X
+
i
*
K
,
K
*
sizeof
(
T
));
}
};
parallel_memcpy_x
(
0
,
M
);
parallel_memcpy_x
(
0
,
M
);
blas
.
GEMM
(
false
,
false
,
M
,
...
...
@@ -103,14 +103,14 @@ class FCFunctor {
memcpy
(
Y
+
i
*
N
,
Y1_data
+
i
*
NN
,
N
*
sizeof
(
T
));
}
};
parallel_memcpy_y
(
0
,
M
);
parallel_memcpy_y
(
0
,
M
);
return
;
}
parallel_compute
(
0
,
M
);
parallel_compute
(
0
,
M
);
}
else
{
blas
.
MatMul
(
M
,
N
,
K
,
X
,
W
,
Y
);
if
(
!
B
)
{
if
(
!
B
)
{
return
;
}
parallel_compute
(
0
,
M
);
...
...
lite/kernels/x86/mul_compute.h
浏览文件 @
b5c77670
...
...
@@ -13,11 +13,11 @@
// limitations under the License.
#pragma once
#include <chrono>
#include "lite/backends/x86/math/blas.h"
#include "lite/core/kernel.h"
#include "lite/core/op_registry.h"
#include "lite/core/types.h"
#include <chrono>
#include "lite/fluid/eigen.h"
namespace
paddle
{
namespace
lite
{
...
...
@@ -68,11 +68,13 @@ class MulCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
y_matrix
=
*
y
;
}
Eigen
::
Map
<
Eigen
::
Matrix
<
T
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
>>
mat_test
(
x_matrix
.
mutable_data
<
T
>
(),
x_matrix
.
dims
()[
0
],
x_matrix
.
dims
()[
1
]);
Eigen
::
Map
<
Eigen
::
Matrix
<
T
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
>>
mat1_test
(
y_matrix
.
mutable_data
<
T
>
(),
y_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]);
Eigen
::
Map
<
Eigen
::
Matrix
<
T
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
>>
out_test
(
z
->
mutable_data
<
T
>
(),
z
->
dims
()[
0
],
z
->
dims
()[
1
]);
out_test
=
mat_test
*
mat1_test
;
Eigen
::
Map
<
Eigen
::
Matrix
<
T
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
>>
mat_test
(
x_matrix
.
mutable_data
<
T
>
(),
x_matrix
.
dims
()[
0
],
x_matrix
.
dims
()[
1
]);
Eigen
::
Map
<
Eigen
::
Matrix
<
T
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
>>
mat1_test
(
y_matrix
.
mutable_data
<
T
>
(),
y_matrix
.
dims
()[
0
],
y_matrix
.
dims
()[
1
]);
Eigen
::
Map
<
Eigen
::
Matrix
<
T
,
Eigen
::
Dynamic
,
Eigen
::
Dynamic
>>
out_test
(
z
->
mutable_data
<
T
>
(),
z
->
dims
()[
0
],
z
->
dims
()[
1
]);
out_test
=
mat_test
*
mat1_test
;
}
virtual
~
MulCompute
()
=
default
;
...
...
lite/kernels/x86/sequence_reshape_compute.h
浏览文件 @
b5c77670
...
...
@@ -56,8 +56,7 @@ class SequenceReshapeCompute
out_lod
[
0
][
i
+
1
]
=
out_lod
[
0
][
i
]
+
offset
;
}
}
out
->
Resize
(
std
::
vector
<
int64_t
>
{
in
->
numel
()
/
out_width
,
out_width
});
out
->
Resize
(
std
::
vector
<
int64_t
>
{
in
->
numel
()
/
out_width
,
out_width
});
auto
*
dst_ptr
=
out
->
template
mutable_data
<
T
>();
auto
size
=
in
->
numel
()
*
sizeof
(
T
);
std
::
memcpy
(
dst_ptr
,
in
->
template
data
<
T
>(),
size
);
...
...
@@ -76,8 +75,8 @@ class SequenceReshapeFloatCompute
auto
&
param
=
*
param_
.
get_mutable
<
operators
::
SequenceReshapeParam
>
();
auto
*
in
=
param
.
x
;
auto
*
out
=
param
.
output
;
auto
out_data
=
out
->
mutable_data
<
T
>
();
for
(
int
i
=
0
;
i
<
out
->
dims
().
production
();
i
++
)
{
auto
out_data
=
out
->
mutable_data
<
T
>
();
for
(
int
i
=
0
;
i
<
out
->
dims
().
production
();
i
++
)
{
out_data
[
i
]
=
0
;
}
int
out_width
=
param
.
new_dim
;
...
...
@@ -103,8 +102,7 @@ class SequenceReshapeFloatCompute
out_lod
[
0
][
i
+
1
]
=
out_lod
[
0
][
i
]
+
offset
;
}
}
out
->
Resize
(
std
::
vector
<
int64_t
>
{
in
->
numel
()
/
out_width
,
out_width
});
out
->
Resize
(
std
::
vector
<
int64_t
>
{
in
->
numel
()
/
out_width
,
out_width
});
auto
*
dst_ptr
=
out
->
mutable_data
<
T
>
();
auto
size
=
in
->
numel
()
*
sizeof
(
T
);
std
::
memcpy
(
dst_ptr
,
in
->
data
<
T
>
(),
size
);
...
...
lite/operators/CMakeLists.txt
浏览文件 @
b5c77670
...
...
@@ -108,6 +108,7 @@ add_operator(collect_fpn_proposals_op_lite extra SRCS collect_fpn_proposals_op.c
add_operator
(
distribute_fpn_proposals_op_lite extra SRCS distribute_fpn_proposals_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
crf_decoding_op_lite extra SRCS crf_decoding_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
ctc_align_op_lite extra SRCS ctc_align_op.cc DEPS
${
op_DEPS
}
)
add_operator
(
max_pool_with_index_op extra SRCS max_pool_with_index_op.cc DEPS
${
op_DEPS
}
)
# for OCR specific
add_operator
(
while_op extra SRCS while_op.cc DEPS
${
op_DEPS
}
)
...
...
lite/operators/max_pool_with_index_op.cc
0 → 100644
浏览文件 @
b5c77670
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#include "lite/operators/max_pool_with_index_op.h"
#include <algorithm>
#include "lite/core/op_registry.h"
namespace
paddle
{
namespace
lite
{
namespace
operators
{
bool
MaxPoolWithIndexOpLite
::
CheckShape
()
const
{
CHECK_OR_FALSE
(
param_
.
x
);
CHECK_OR_FALSE
(
param_
.
output
);
const
auto
&
x_dims
=
param_
.
x
->
dims
();
const
auto
&
strides
=
param_
.
strides
;
const
auto
&
ksize
=
param_
.
ksize
;
const
auto
&
paddings
=
*
param_
.
paddings
;
// "Pooling intput should be 4-D or 5-D tensor."
CHECK_OR_FALSE
(
x_dims
.
size
()
==
4
||
x_dims
.
size
()
==
5
);
// Input size and pooling size should be consistent.
CHECK_OR_FALSE
(
x_dims
.
size
()
-
ksize
.
size
()
==
2U
);
// Strides size and pooling size should be the same.
CHECK_OR_FALSE
(
ksize
.
size
()
==
strides
.
size
());
// Paddings size must be 4.
CHECK_OR_FALSE
(
paddings
.
size
()
==
4L
);
return
true
;
}
inline
int
MaxPoolOutputSize
(
int
input_size
,
int
filter_size
,
int
padding
,
int
stride
)
{
int
output_size
=
(
input_size
-
filter_size
+
2
*
padding
)
/
stride
+
1
;
return
output_size
;
}
bool
MaxPoolWithIndexOpLite
::
InferShapeImpl
()
const
{
const
auto
x_dims
=
param_
.
x
->
dims
();
const
auto
ksize
=
param_
.
ksize
;
std
::
vector
<
int64_t
>
output_shape
({
x_dims
[
0
],
x_dims
[
1
]});
const
auto
&
strides
=
param_
.
strides
;
const
auto
&
paddings
=
*
param_
.
paddings
;
const
auto
adaptive
=
param_
.
adaptive
;
if
(
adaptive
)
{
output_shape
.
insert
(
output_shape
.
end
(),
ksize
.
begin
(),
ksize
.
end
());
}
else
{
for
(
size_t
i
=
0
;
i
<
ksize
.
size
();
++
i
)
{
output_shape
.
push_back
(
MaxPoolOutputSize
(
x_dims
[
i
+
2
],
ksize
[
i
],
paddings
[
i
],
strides
[
i
]));
}
}
param_
.
output
->
Resize
(
lite
::
DDim
(
output_shape
));
return
true
;
}
}
// namespace operators
}
// namespace lite
}
// namespace paddle
REGISTER_LITE_OP
(
max_pool2d_with_index
,
paddle
::
lite
::
operators
::
MaxPoolWithIndexOpLite
);
lite/operators/max_pool_with_index_op.h
0 → 100644
浏览文件 @
b5c77670
// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
#pragma once
#include <algorithm>
#include <memory>
#include <string>
#include <vector>
#include "lite/core/kernel.h"
#include "lite/core/op_lite.h"
#include "lite/core/scope.h"
#include "lite/core/tensor.h"
#include "lite/operators/op_params.h"
#include "lite/utils/all.h"
namespace
paddle
{
namespace
lite
{
namespace
operators
{
class
MaxPoolWithIndexOpLite
:
public
OpLite
{
public:
MaxPoolWithIndexOpLite
()
{}
explicit
MaxPoolWithIndexOpLite
(
const
std
::
string
&
type
)
:
OpLite
(
type
)
{}
bool
CheckShape
()
const
override
;
bool
InferShapeImpl
()
const
override
;
// TODO(Superjomn) replace framework::OpDesc with a lite one.
bool
AttachImpl
(
const
cpp
::
OpDesc
&
op_desc
,
lite
::
Scope
*
scope
)
override
{
auto
x
=
op_desc
.
Input
(
"X"
).
front
();
auto
out
=
op_desc
.
Output
(
"Out"
).
front
();
auto
mask
=
op_desc
.
Output
(
"Mask"
).
front
();
CHECK
(
scope
->
FindVar
(
x
));
CHECK
(
scope
->
FindVar
(
out
));
CHECK
(
scope
->
FindVar
(
mask
));
param_
.
x
=
scope
->
FindVar
(
x
)
->
GetMutable
<
lite
::
Tensor
>
();
param_
.
output
=
scope
->
FindVar
(
out
)
->
GetMutable
<
lite
::
Tensor
>
();
param_
.
ksize
=
op_desc
.
GetAttr
<
std
::
vector
<
int
>>
(
"ksize"
);
param_
.
global_pooling
=
op_desc
.
GetAttr
<
bool
>
(
"global_pooling"
);
param_
.
strides
=
op_desc
.
GetAttr
<
std
::
vector
<
int
>>
(
"strides"
);
auto
paddings
=
op_desc
.
GetAttr
<
std
::
vector
<
int
>>
(
"paddings"
);
if
(
op_desc
.
HasAttr
(
"adaptive"
))
{
param_
.
adaptive
=
op_desc
.
GetAttr
<
bool
>
(
"adaptive"
);
}
// 2-pad to 4-pad
if
(
paddings
.
size
()
==
2L
)
{
for
(
size_t
i
=
0
;
i
<
2L
;
++
i
)
{
int
copy_pad
=
*
(
paddings
.
begin
()
+
2
*
i
);
paddings
.
insert
(
paddings
.
begin
()
+
2
*
i
+
1
,
copy_pad
);
}
}
else
{
if
(
paddings
.
size
()
!=
4L
)
{
LOG
(
FATAL
)
<<
"Paddings size should be the same or twice as the inputs size."
;
}
}
param_
.
paddings
=
std
::
make_shared
<
std
::
vector
<
int
>>
(
paddings
);
return
true
;
}
void
AttachKernel
(
KernelBase
*
kernel
)
override
{
kernel
->
SetParam
(
param_
);
}
std
::
string
DebugString
()
const
override
{
return
"max_pool2d_with_index"
;
}
private:
mutable
PoolParam
param_
;
};
}
// namespace operators
}
// namespace lite
}
// namespace paddle
build.bat
→
lite/tools/
build.bat
浏览文件 @
b5c77670
...
...
@@ -2,7 +2,7 @@
setlocal
setlocal
enabledelayedexpansion
set
source_path
=
%~dp0
set
source_path
=
%~dp0
\\..\\..\\
rem global variables
set
BUILD_EXTRA
=
OFF
set
BUILD_JAVA
=
ON
...
...
@@ -92,16 +92,16 @@ goto:eof
)
else
(
echo
"The directory of third_party exists, the third-party-05b862.tar.gz exists."
call
:rm
_rebuild_dir
"
%workspace%
\third-party"
!python_path!
%workspace%
\untar.py
%source_path%
\third
-party
-
05
b862
.tar.gz
%workspace%
!python_path!
%workspace%
\
lite\tools\
untar.py
%source_path%
\third
-party
-
05
b862
.tar.gz
%workspace%
)
)
else
(
if
NOT
EXIST
"
%workspace%
\third-party-05b862.tar.gz"
(
echo
"The directory of third_party not exists, the third-party-05b862.tar.gz not exists."
call
:download
_third_party
!python_path!
%workspace%
\untar.py
%source_path%
\third
-party
-
05
b862
.tar.gz
%workspace%
!python_path!
%workspace%
\
lite\tools\
untar.py
%source_path%
\third
-party
-
05
b862
.tar.gz
%workspace%
)
else
(
echo
"The directory of third_party not exists, the third-party-05b862.tar.gz exists."
!python_path!
%workspace%
\untar.py
%source_path%
\third
-party
-
05
b862
.tar.gz
%workspace%
!python_path!
%workspace%
\
lite\tools\
untar.py
%source_path%
\third
-party
-
05
b862
.tar.gz
%workspace%
)
)
...
...
@@ -131,4 +131,4 @@ if "%tmp_var:~-1%"==" " (
set
"tmp_var=
%tmp
_var:~0,-1
%
"
goto
remove_left_space
)
goto
:eof
\ No newline at end of file
goto
:eof
lite/tools/untar.py
0 → 100644
浏览文件 @
b5c77670
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import
tarfile
,
os
import
sys
def
untar
(
fname
,
dirs
):
"""
extract the tar.gz file
:param fname: the name of tar.gz file
:param dirs: the path of decompressed file
:return: bool
"""
try
:
t
=
tarfile
.
open
(
name
=
fname
,
mode
=
'r:gz'
)
t
.
extractall
(
path
=
dirs
)
return
True
except
Exception
as
e
:
print
(
e
)
return
False
untar
(
sys
.
argv
[
1
],
sys
.
argv
[
2
])
mobile/src/operators/op_param.h
浏览文件 @
b5c77670
...
...
@@ -494,6 +494,7 @@ class ConvParam : public OpParam {
EXEC_DEPTHWISE3x3_FLOAT
,
EXEC_SLIDINGWINDOW1x1_FLOAT
,
EXEC_SLIDINGWINDOW3x3_FLOAT
,
EXEC_SLIDINGWINDOW3x3_WITH_GROUP_FLOAT
,
EXEC_SLIDINGWINDOW5x5_FLOAT
,
EXEC_SLIDINGWINDOW7x7_FLOAT
,
EXEC_GEMM1x1s1_FLOAT
,
...
...
mobile/test/CMakeLists.txt
浏览文件 @
b5c77670
...
...
@@ -549,6 +549,9 @@ if (ENABLE_ALL_TEST)
ADD_EXECUTABLE
(
test-net-performance net/test_net_performance.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-net-performance paddle-mobile
)
ADD_EXECUTABLE
(
test-infer-imfix net/test_inference_imfix.cpp test_helper.h test_include.h executor_for_test.h
)
target_link_libraries
(
test-infer-imfix paddle-mobile
)
# ADD_EXECUTABLE(test-inference-ercy net/test_inference_ercy.cpp test_helper.h test_include.h executor_for_test.h)
# target_link_libraries(test-inference-api-v2 paddle-mobile)
...
...
mobile/test/net/test_inference_imfix.cpp
0 → 100644
浏览文件 @
b5c77670
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "io/paddle_inference_api.h"
using
namespace
paddle_mobile
;
// NOLINT
PaddleMobileConfig
GetConfig
()
{
PaddleMobileConfig
config
;
config
.
precision
=
PaddleMobileConfig
::
FP32
;
config
.
device
=
PaddleMobileConfig
::
kGPU_CL
;
config
.
pre_post_type
=
PaddleMobileConfig
::
NONE_PRE_POST
;
config
.
prog_file
=
"../models/imagefixmodel/model"
;
config
.
param_file
=
"../models/imagefixmodel/params"
;
config
.
lod_mode
=
false
;
config
.
load_when_predict
=
false
;
return
config
;
}
int
main
()
{
PaddleMobileConfig
config
=
GetConfig
();
auto
predictor
=
CreatePaddlePredictor
<
PaddleMobileConfig
,
PaddleEngineKind
::
kPaddleMobile
>
(
config
);
// factor
int
input_rgb_len
=
1
*
3
*
256
*
256
;
std
::
vector
<
float
>
input_rgb_v
(
input_rgb_len
,
1
);
// SetupData<float>(input_rgb_v.data(), input_rgb_len, 0.f, 1.f);
PaddleTensor
input_rgb
;
input_rgb
.
shape
=
std
::
vector
<
int
>
({
1
,
3
,
256
,
256
});
input_rgb
.
data
=
PaddleBuf
(
input_rgb_v
.
data
(),
input_rgb_len
*
sizeof
(
float
));
input_rgb
.
dtype
=
PaddleDType
::
FLOAT32
;
input_rgb
.
layout
=
LayoutType
::
LAYOUT_CHW
;
// remap
int
input_mask_len
=
1
*
3
*
256
*
256
;
std
::
vector
<
float
>
input_mask_v
(
input_mask_len
,
1
);
// SetupData<float>(input_mask_v.data(), input_mask_len, 0.f, 1.f);
PaddleTensor
input_mask
;
input_mask
.
shape
=
std
::
vector
<
int
>
({
1
,
3
,
256
,
256
});
input_mask
.
data
=
PaddleBuf
(
input_mask_v
.
data
(),
input_mask_len
*
sizeof
(
float
));
input_mask
.
dtype
=
PaddleDType
::
FLOAT32
;
input_mask
.
layout
=
LayoutType
::
LAYOUT_CHW
;
PaddleTensor
output0
;
output0
.
shape
=
std
::
vector
<
int
>
({});
output0
.
data
=
PaddleBuf
();
output0
.
dtype
=
PaddleDType
::
FLOAT32
;
output0
.
layout
=
LayoutType
::
LAYOUT_CHW
;
// PaddleTensor output1;
// output1.shape = std::vector<int>({});
// output1.data = PaddleBuf();
// output1.dtype = PaddleDType::FLOAT32;
// output1.layout = LayoutType::LAYOUT_CHW;
// PaddleTensor output2;
// output2.shape = std::vector<int>({});
// output2.data = PaddleBuf();
// output2.dtype = PaddleDType::FLOAT32;
// output2.layout = LayoutType::LAYOUT_CHW;
// PaddleTensor output3;
// output3.shape = std::vector<int>({});
// output3.data = PaddleBuf();
// output3.dtype = PaddleDType::FLOAT32;
// output3.layout = LayoutType::LAYOUT_CHW;
std
::
cout
<<
"feed : "
<<
std
::
endl
;
predictor
->
Feed
(
"input_rgb"
,
input_rgb
);
std
::
cout
<<
"feed : "
<<
std
::
endl
;
predictor
->
Feed
(
"input_mask"
,
input_mask
);
std
::
cout
<<
"run : "
<<
std
::
endl
;
predictor
->
Run
();
std
::
cout
<<
"fetch : "
<<
std
::
endl
;
predictor
->
Fetch
(
"save_infer_model/scale_0"
,
&
output0
);
float
*
out_ptr0
=
reinterpret_cast
<
float
*>
(
output0
.
data
.
data
());
std
::
cout
<<
" print output0 : "
<<
std
::
endl
;
int
numel
=
output0
.
data
.
length
()
/
sizeof
(
float
);
int
stride
=
numel
/
20
;
stride
=
stride
>
0
?
stride
:
1
;
for
(
size_t
j
=
0
;
j
<
numel
;
j
+=
stride
)
{
std
::
cout
<<
out_ptr0
[
j
]
<<
" "
;
}
std
::
cout
<<
std
::
endl
;
return
0
;
}
编辑
预览
Markdown
is supported
0%
请重试
或
添加新附件
.
添加附件
取消
You are about to add
0
people
to the discussion. Proceed with caution.
先完成此消息的编辑!
取消
想要评论请
注册
或
登录