Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into temp/debug1

9fa47bf7 · Shixiaowei02 · 758fd379 · 0731af04 · 9fa47bf7 · 9fa47bf7
16 changed file
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -147,6 +147,7 @@ endif()
 # for lite, both server and mobile framework.
 option(WITH_LITE "Enable lite framework" OFF)
+option(LITE_WITH_JAVA "Enable Java JNI lib in lite mode" OFF)
 option(LITE_WITH_CUDA "Enable CUDA in lite mode" OFF)
 option(LITE_WITH_X86  "Enable X86 in lite mode"  ON)
 option(LITE_WITH_ARM  "Enable ARM in lite mode"  OFF)

--- a/cmake/lite.cmake
+++ b/cmake/lite.cmake
+# Bundle a static library together with every static library it links against
+# (directly or transitively) into one archive, driven by an `ar -M` MRI script.
+#
+# Arguments:
+#   tgt_name          existing library target whose link dependencies are walked
+#   bundled_tgt_name  name of the IMPORTED STATIC target created for the bundle;
+#                     the archive is written to ${CMAKE_BINARY_DIR}
+#   fake_target       name of the custom target (part of ALL) that produces
+#                     the archive
+#
+# Example:
+#   bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
+function(bundle_static_library tgt_name bundled_tgt_name fake_target)
+  # Start from a fresh list so that a `static_libs` variable inherited from the
+  # calling scope cannot leak unrelated entries into the bundle.
+  set(static_libs ${tgt_name})
+
+  # Depth-first walk over the link dependencies of `input_target`. Every
+  # STATIC_LIBRARY target found is appended to `static_libs`, which is handed
+  # back to the caller through PARENT_SCOPE.
+  function(_recursively_collect_dependencies input_target)
+    set(_input_link_libraries LINK_LIBRARIES)
+    get_target_property(_input_type ${input_target} TYPE)
+    # INTERFACE libraries carry their dependencies in INTERFACE_LINK_LIBRARIES.
+    if ("${_input_type}" STREQUAL "INTERFACE_LIBRARY")
+      set(_input_link_libraries INTERFACE_LINK_LIBRARIES)
+    endif()
+    get_target_property(public_dependencies ${input_target} ${_input_link_libraries})
+    foreach(dependency IN LISTS public_dependencies)
+      if(TARGET ${dependency})
+        # Resolve ALIAS targets to the real target they refer to.
+        get_target_property(alias ${dependency} ALIASED_TARGET)
+        if (TARGET ${alias})
+          set(dependency ${alias})
+        endif()
+        get_target_property(_type ${dependency} TYPE)
+        if ("${_type}" STREQUAL "STATIC_LIBRARY")
+          list(APPEND static_libs ${dependency})
+        endif()
+        # A global property marks visited targets so that each dependency is
+        # expanded only once per bundle.
+        get_property(library_already_added
+          GLOBAL PROPERTY _${tgt_name}_static_bundle_${dependency})
+        if (NOT library_already_added)
+          set_property(GLOBAL PROPERTY _${tgt_name}_static_bundle_${dependency} ON)
+          _recursively_collect_dependencies(${dependency})
+        endif()
+      endif()
+    endforeach()
+    set(static_libs ${static_libs} PARENT_SCOPE)
+  endfunction()
+
+  _recursively_collect_dependencies(${tgt_name})
+  list(REMOVE_DUPLICATES static_libs)
+
+  set(bundled_tgt_full_name
+    ${CMAKE_BINARY_DIR}/${CMAKE_STATIC_LIBRARY_PREFIX}${bundled_tgt_name}${CMAKE_STATIC_LIBRARY_SUFFIX})
+  message(STATUS "+++++ bundled_tgt_full_name: ${bundled_tgt_full_name}")
+
+  # Write the MRI script template; file(GENERATE) below expands the
+  # $<TARGET_FILE:...> generator expressions into real archive paths.
+  file(WRITE ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
+    "CREATE ${bundled_tgt_full_name}\n" )
+  foreach(tgt IN LISTS static_libs)
+    file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in
+      "ADDLIB $<TARGET_FILE:${tgt}>\n")
+  endforeach()
+  file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in "SAVE\n")
+  file(APPEND ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in "END\n")
+  file(GENERATE
+    OUTPUT ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar
+    INPUT ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar.in)
+
+  set(ar_tool ${CMAKE_AR})
+  if (CMAKE_INTERPROCEDURAL_OPTIMIZATION)
+    set(ar_tool ${CMAKE_CXX_COMPILER_AR})
+  endif()
+
+  # DEPENDS makes the archive get rebuilt whenever one of the bundled
+  # libraries or the MRI script changes; without it an existing bundle would
+  # never be refreshed.
+  add_custom_command(
+    COMMAND ${ar_tool} -M < ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar
+    OUTPUT ${bundled_tgt_full_name}
+    DEPENDS ${static_libs} ${CMAKE_BINARY_DIR}/${bundled_tgt_name}.ar
+    COMMENT "Bundling ${bundled_tgt_name}"
+    VERBATIM)
+  add_custom_target(${fake_target} ALL DEPENDS ${bundled_tgt_full_name})
+  add_dependencies(${fake_target} ${tgt_name})
+
+  # Expose the bundle as an imported library that reuses the include
+  # directories of the original target.
+  add_library(${bundled_tgt_name} STATIC IMPORTED)
+  set_target_properties(${bundled_tgt_name}
+    PROPERTIES
+      IMPORTED_LOCATION ${bundled_tgt_full_name}
+      INTERFACE_INCLUDE_DIRECTORIES $<TARGET_PROPERTY:${tgt_name},INTERFACE_INCLUDE_DIRECTORIES>)
+  add_dependencies(${bundled_tgt_name} ${fake_target})
+endfunction()
--- a/paddle/fluid/lite/CMakeLists.txt
+++ b/paddle/fluid/lite/CMakeLists.txt
@@ -2,6 +2,8 @@ if (NOT WITH_LITE)
    return()
 endif()
+include(lite)
 message(WARNING "Lite enabled!")
 message(STATUS "LIGHT_FRAMEWORK:\t${LITE_WITH_LIGHT_WEIGHT_FRAMEWORK}")
 message(STATUS "LITE_WITH_CUDA:\t${LITE_WITH_CUDA}")
@@ -85,9 +87,9 @@ function (lite_deps TARGET)
  endif()
  set(${TARGET} ${deps} PARENT_SCOPE)
 endfunction()
 # A fake target to include all the libraries and tests the lite module depends.
 add_custom_target(lite_compile_deps COMMAND echo 1)
@@ -95,6 +97,10 @@ add_custom_target(lite_compile_deps COMMAND echo 1)
 # the whole fluid project to accelerate the compile speed.
 set(offline_lib_registry_file "${CMAKE_BINARY_DIR}/lite_libs.txt")
 file(WRITE ${offline_lib_registry_file} "") # clean
+# File that records every source file passed to lite_cc_library; it is
+# truncated on each configure run.
+set(__lite_cc_files "${CMAKE_BINARY_DIR}/lite_cc_files.txt")
+file(WRITE ${__lite_cc_files} "") # clean
 # cc_library with branch support.
 # The branches:
 #  X86_DEPS: works only when LITE_WITH_X86 is ON.
@@ -104,7 +110,7 @@ file(WRITE ${offline_lib_registry_file} "") # clean
 #  LIGHT_DEPS:    LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 #  HVY_DEPS:      NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 function(lite_cc_library TARGET)
-    set(options STATIC static SHARED shared)
+    set(options SHARED shared STATIC static MODULE module)
    set(oneValueArgs "")
    set(multiValueArgs SRCS DEPS X86_DEPS CUDA_DEPS ARM_DEPS PROFILE_DEPS LIGHT_DEPS
      HVY_DEPS ARGS)
@@ -120,14 +126,24 @@ function(lite_cc_library TARGET)
            LIGHT_DEPS ${args_LIGHT_DEPS}
            HVY_DEPS ${args_HVY_DEPS}
            )
-    if (${args_SHARED} OR ${args_shared})
+    # Pick the library kind. The parsed options use the `args_` prefix, and
+    # CMake variable names are case sensitive, so the lowercase keywords must
+    # be tested as args_shared / args_module.
+    if (args_SHARED OR args_shared)
        cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS} SHARED)
+    elseif (args_MODULE OR args_module)
+        # MODULE libraries are created directly; the dependencies are only
+        # used for build ordering here.
+        add_library(${TARGET} MODULE ${args_SRCS})
+        add_dependencies(${TARGET} ${deps} ${args_DEPS})
    else()
        cc_library(${TARGET} SRCS ${args_SRCS} DEPS ${deps} ${args_DEPS})
    endif()
+    foreach(cc_file ${args_SRCS})
+        file(APPEND ${__lite_cc_files} "${cc_file}\n")
+    endforeach()
    # collect targets need to compile for lite
+    if (args_SRCS)
        add_dependencies(lite_compile_deps ${TARGET})
+    endif()
    # register a library name.
    file(APPEND ${offline_lib_registry_file} "${TARGET}\n")
@@ -224,9 +240,9 @@ add_custom_target(publish_inference_cxx_lib ${TARGET}
        COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
        COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
        COMMAND cp "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
-        COMMAND cp "${CMAKE_BINARY_DIR}/paddle/fluid/lite/api/libpaddle_api_full.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
+        COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_full_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
        )
-add_dependencies(publish_inference_cxx_lib paddle_api_full)
+add_dependencies(publish_inference_cxx_lib bundle_full_api)
 add_dependencies(publish_inference_lite publish_inference_cxx_lib)
 if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
@@ -235,9 +251,9 @@ if (LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
    add_custom_target(publish_inference_mobile_lib ${TARGET}
            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/mobile/lib"
            COMMAND mkdir -p "${INFER_LITE_PUBLISH_ROOT}/mobile/include"
-            COMMAND cp "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/cxx/include"
+            COMMAND cp "${CMAKE_SOURCE_DIR}/paddle/fluid/lite/api/paddle_*.h" "${INFER_LITE_PUBLISH_ROOT}/mobile/include"
-            COMMAND cp "${CMAKE_BINARY_DIR}/paddle/fluid/lite/api/libpaddle_api_light.a" "${INFER_LITE_PUBLISH_ROOT}/cxx/lib"
+            COMMAND cp "${CMAKE_BINARY_DIR}/libpaddle_api_light_bundled.a" "${INFER_LITE_PUBLISH_ROOT}/mobile/lib"
            )
-    add_dependencies(publish_inference_mobile_lib paddle_api_light)
+    add_dependencies(publish_inference_mobile_lib paddle_api_light bundle_light_api)
    add_dependencies(publish_inference_lite publish_inference_mobile_lib)
 endif()
--- a/paddle/fluid/lite/api/CMakeLists.txt
+++ b/paddle/fluid/lite/api/CMakeLists.txt
@@ -102,18 +102,39 @@ lite_cc_test(test_apis_lite SRCS apis_test.cc
 lite_cc_library(paddle_api_lite SRCS paddle_api.cc DEPS op_params_lite)
-lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api_lite paddle_api_lite light_api_lite)
+#-----------------------------------------------------------------------------------------------------
+# The final inference library for both CxxConfig and MobileConfig.
+lite_cc_library(paddle_api_full SRCS cxx_api_impl.cc DEPS cxx_api_lite paddle_api_lite light_api_lite
+  ${ops_lite}
+  ARM_DEPS ${arm_kernels}
+  )
+# The final inference library for just MobileConfig.
 lite_cc_library(paddle_api_light SRCS light_api_impl.cc DEPS light_api_lite paddle_api_lite)
+bundle_static_library(paddle_api_full paddle_api_full_bundled bundle_full_api)
+bundle_static_library(paddle_api_light paddle_api_light_bundled bundle_light_api)
+#-----------------------------------------------------------------------------------------------------
 lite_cc_test(test_paddle_api_lite SRCS paddle_api_test.cc DEPS paddle_api_full paddle_api_light
  ${ops_lite}
  ARM_DEPS ${arm_kernels}
  X86_DEPS ${x86_kernels}
  ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model SERIAL)
+lite_cc_test(test_model_bin SRCS model_test.cc DEPS paddle_api_full paddle_api_light
+  ${ops_lite}
+  ARM_DEPS ${arm_kernels}
+  X86_DEPS ${x86_kernels})
 if (WITH_TESTING)
    add_dependencies(test_paddle_api_lite extern_lite_download_lite_naive_model_tar_gz)
 endif()
+if (LITE_WITH_JAVA AND LITE_WITH_ARM)
+    add_subdirectory(android)
+endif()
 #lite_cc_binary(cxx_api_lite_bin SRCS cxx_api_bin.cc
    #X86_DEPS operator
    #DEPS light_api_lite model_parser_lite target_wrapper_host mir_passes

--- a/paddle/fluid/lite/api/android/CMakeLists.txt
+++ b/paddle/fluid/lite/api/android/CMakeLists.txt
+# Only descend into the JNI sources when both the Java binding and the ARM
+# backend are enabled.
+if (NOT (LITE_WITH_JAVA AND LITE_WITH_ARM))
+  return()
+endif()
+add_subdirectory(jni)
--- a/paddle/fluid/lite/api/android/jni/.gitignore
+++ b/paddle/fluid/lite/api/android/jni/.gitignore
+/PaddleListTest.class
+/PaddleLite.class
+/bin/
--- a/paddle/fluid/lite/api/android/jni/CMakeLists.txt
+++ b/paddle/fluid/lite/api/android/jni/CMakeLists.txt
+# Builds the Java binding: PaddlePredictor.jar, the javah-generated JNI header
+# and the native paddle_lite_jni module.
+if ((NOT LITE_WITH_ARM) OR (NOT LITE_WITH_JAVA))
+  return()
+endif()
+
+# UseJava assumes that FindJava has already been loaded, so locate Java first.
+find_package(Java REQUIRED)
+include(UseJava)
+
+# We are only interested in finding jni.h: we do not care about extended JVM
+# functionality or the AWT library.
+set(JAVA_AWT_LIBRARY NotNeeded)
+set(JAVA_JVM_LIBRARY NotNeeded)
+set(JAVA_INCLUDE_PATH2 NotNeeded)
+set(JAVA_AWT_INCLUDE_PATH NotNeeded)
+find_package(JNI REQUIRED)
+
+# Generate PaddlePredictor.jar
+add_jar(PaddlePredictor
+    src/com/baidu/paddle/lite/PaddlePredictor.java
+    src/com/baidu/paddle/lite/Place.java)
+get_target_property(_jarFile PaddlePredictor JAR_FILE)
+get_target_property(_classDir PaddlePredictor CLASSDIR)
+set(_stubDir "${CMAKE_CURRENT_BINARY_DIR}")
+
+# Generate paddle_lite_jni.h from the compiled PaddlePredictor class.
+add_custom_target(
+    paddle_lite_jni_header ALL
+    COMMAND ${Java_JAVAH_EXECUTABLE} -verbose
+        -classpath ${_classDir}
+        -o paddle_lite_jni.h
+        -jni
+        com.baidu.paddle.lite.PaddlePredictor
+    DEPENDS PaddlePredictor
+)
+
+# Generate paddle_lite_jni.so
+include_directories(${JNI_INCLUDE_DIRS} ${_classDir} ${_stubDir})
+lite_cc_library(paddle_lite_jni MODULE SRCS paddle_lite_jni.cc
+    DEPS light_api_lite cxx_api_lite
+        paddle_api_full paddle_api_lite paddle_api_light op_registry_lite
+        ${ops_lite} ${lite_kernel_deps}
+    ARM_DEPS ${arm_kernels})
+if (APPLE)
+    # MacOS only accepts JNI lib ends with .jnilib or .dylib
+    set_target_properties(paddle_lite_jni PROPERTIES SUFFIX ".jnilib")
+elseif (WIN32)
+    # Windows only accepts JNI lib ends with .dll
+    set_target_properties(paddle_lite_jni PROPERTIES SUFFIX ".dll")
+endif()
+# The MODULE branch of lite_cc_library only orders the build, so the actual
+# link dependencies are declared here.
+target_link_libraries(paddle_lite_jni light_api_lite cxx_api_lite
+    paddle_api_full paddle_api_lite paddle_api_light op_registry_lite
+    ${ops_lite} ${arm_kernels} ${lite_kernel_deps})
--- a/paddle/fluid/lite/api/android/jni/paddle_lite_jni.cc
+++ b/paddle/fluid/lite/api/android/jni/paddle_lite_jni.cc
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#include "paddle/fluid/lite/api/android/jni/paddle_lite_jni.h"
+#include <memory>
+#include <string>
+#include <utility>
+#include <vector>
+#include "paddle/fluid/lite/api/light_api.h"
+#include "paddle/fluid/lite/api/paddle_api.h"
+#include "paddle/fluid/lite/api/paddle_lite_factory_helper.h"
+#include "paddle/fluid/lite/api/paddle_place.h"
+#include "paddle/fluid/lite/api/paddle_use_kernels.h"
+#include "paddle/fluid/lite/api/paddle_use_ops.h"
+#include "paddle/fluid/lite/api/paddle_use_passes.h"
+#include "paddle/fluid/lite/kernels/arm/activation_compute.h"
+#include "paddle/fluid/lite/kernels/arm/batch_norm_compute.h"
+#include "paddle/fluid/lite/kernels/arm/calib_compute.h"
+#include "paddle/fluid/lite/kernels/arm/concat_compute.h"
+#include "paddle/fluid/lite/kernels/arm/conv_compute.h"
+#include "paddle/fluid/lite/kernels/arm/dropout_compute.h"
+#include "paddle/fluid/lite/kernels/arm/elementwise_compute.h"
+#include "paddle/fluid/lite/kernels/arm/fc_compute.h"
+#include "paddle/fluid/lite/kernels/arm/mul_compute.h"
+#include "paddle/fluid/lite/kernels/arm/pool_compute.h"
+#include "paddle/fluid/lite/kernels/arm/scale_compute.h"
+#include "paddle/fluid/lite/kernels/arm/softmax_compute.h"
+#include "paddle/fluid/lite/kernels/arm/split_compute.h"
+#include "paddle/fluid/lite/kernels/arm/transpose_compute.h"
+// Declares a local std::unique_ptr named p<kernel_class_name__> that owns a
+// newly constructed paddle::lite::kernels::arm::<kernel_class_name__>.
+// The expansion already ends with ';', so a use site that adds its own ';'
+// produces an extra empty statement.
+#define ARM_KERNEL_POINTER(kernel_class_name__)                    \
+  std::unique_ptr<paddle::lite::kernels::arm::kernel_class_name__> \
+      p##kernel_class_name__(                                      \
+          new paddle::lite::kernels::arm::kernel_class_name__);
+#ifdef __cplusplus
+extern "C" {
+#endif
+using paddle::lite_api::CxxConfig;
+using paddle::lite_api::MobileConfig;
+using paddle::lite_api::PaddlePredictor;
+using paddle::lite_api::Place;
+using paddle::lite_api::Tensor;
+static std::shared_ptr<PaddlePredictor> predictor;
+/**
+ * Constructs one object of each ARM kernel class listed below; the objects
+ * are destroyed again when the function returns.
+ *
+ * According to the original author, kernel registration (KernelRegistor)
+ * fails with a null pointer error unless these objects are created first;
+ * the root cause is not known.
+ * NOTE(review): presumably this forces the kernel object files to be linked
+ * in -- confirm.
+ */
+static void use_arm_kernels() {
+  ARM_KERNEL_POINTER(BatchNormCompute);
+  ARM_KERNEL_POINTER(CalibComputeFp32ToInt8);
+  ARM_KERNEL_POINTER(CalibComputeInt8ToFp32);
+  ARM_KERNEL_POINTER(ConvCompute);
+  ARM_KERNEL_POINTER(ConcatCompute);
+  ARM_KERNEL_POINTER(ElementwiseAddCompute);
+  ARM_KERNEL_POINTER(DropoutCompute);
+  ARM_KERNEL_POINTER(FcCompute);
+  ARM_KERNEL_POINTER(MulCompute);
+  ARM_KERNEL_POINTER(PoolCompute);
+  ARM_KERNEL_POINTER(ReluCompute);
+  ARM_KERNEL_POINTER(ScaleCompute);
+  ARM_KERNEL_POINTER(SoftmaxCompute);
+  ARM_KERNEL_POINTER(SplitCompute);
+  ARM_KERNEL_POINTER(TransposeCompute);
+  ARM_KERNEL_POINTER(Transpose2Compute);
+}
+// Converts a Java string into a UTF-8 encoded std::string.
+//
+// Java strings are UTF-16 internally while std::string is a plain byte
+// sequence, so the bytes are obtained by calling String.getBytes("UTF-8") on
+// the Java side. Returns an empty string when `jstr` is null or when the
+// bytes cannot be obtained.
+inline std::string jstring_to_cpp_string(JNIEnv *env, jstring jstr) {
+  if (!jstr) {
+    return "";
+  }
+  const jclass stringClass = env->GetObjectClass(jstr);
+  const jmethodID getBytes =
+      env->GetMethodID(stringClass, "getBytes", "(Ljava/lang/String;)[B");
+  // Keep the charset name in a variable so its local reference can be freed.
+  const jstring charsetName = env->NewStringUTF("UTF-8");
+  const jbyteArray stringJbytes =
+      (jbyteArray)env->CallObjectMethod(jstr, getBytes, charsetName);
+  env->DeleteLocalRef(charsetName);
+  env->DeleteLocalRef(stringClass);
+  if (!stringJbytes) {
+    // getBytes failed; there is nothing to copy.
+    return "";
+  }
+  size_t length = (size_t)env->GetArrayLength(stringJbytes);
+  jbyte *pBytes = env->GetByteArrayElements(stringJbytes, NULL);
+  std::string ret;
+  if (pBytes) {
+    ret.assign(reinterpret_cast<char *>(pBytes), length);
+    // The bytes were only read, so no copy-back is needed.
+    env->ReleaseByteArrayElements(stringJbytes, pBytes, JNI_ABORT);
+  }
+  env->DeleteLocalRef(stringJbytes);
+  return ret;
+}
+// Copies `len` floats from `buf` into a newly allocated Java float[].
+// Returns nullptr when the Java array cannot be allocated.
+inline jfloatArray cpp_array_to_jfloatarray(JNIEnv *env, const float *buf,
+                                            int64_t len) {
+  const jsize size = static_cast<jsize>(len);
+  jfloatArray result = env->NewFloatArray(size);
+  // Do not write into a null array when the allocation failed.
+  if (result != nullptr) {
+    env->SetFloatArrayRegion(result, 0, size, buf);
+  }
+  return result;
+}
+// Copies `len` ints from `buf` into a newly allocated Java int[].
+// Returns nullptr when the Java array cannot be allocated.
+inline jintArray cpp_array_to_jintarray(JNIEnv *env, const int *buf,
+                                        int64_t len) {
+  const jsize size = static_cast<jsize>(len);
+  jintArray result = env->NewIntArray(size);
+  // Do not write into a null array when the allocation failed.
+  if (result != nullptr) {
+    env->SetIntArrayRegion(result, 0, size, buf);
+  }
+  return result;
+}
+// Copies `len` bytes from `buf` into a newly allocated Java byte[].
+// Returns nullptr when the Java array cannot be allocated.
+inline jbyteArray cpp_array_to_jbytearray(JNIEnv *env, const int8_t *buf,
+                                          int64_t len) {
+  const jsize size = static_cast<jsize>(len);
+  jbyteArray result = env->NewByteArray(size);
+  // Do not write into a null array when the allocation failed.
+  if (result != nullptr) {
+    env->SetByteArrayRegion(result, 0, size, buf);
+  }
+  return result;
+}
+// Copies the elements of a Java int[] into a std::vector<int64_t>.
+// Returns an empty vector when `dims` is null or its elements cannot be
+// accessed.
+inline std::vector<int64_t> jintarray_to_int64_vector(JNIEnv *env,
+                                                      jintArray dims) {
+  std::vector<int64_t> dim_vec;
+  if (dims == nullptr) {
+    return dim_vec;
+  }
+  int dim_size = env->GetArrayLength(dims);
+  jint *dim_nums = env->GetIntArrayElements(dims, nullptr);
+  if (dim_nums == nullptr) {
+    return dim_vec;
+  }
+  dim_vec.assign(dim_nums, dim_nums + dim_size);
+  // The elements were only read, so skip the copy-back on release.
+  env->ReleaseIntArrayElements(dims, dim_nums, JNI_ABORT);
+  return dim_vec;
+}
+/**
+ * Converts Java com.baidu.paddle.lite.Place to c++ paddle::lite_api::Place.
+ *
+ * The target, precision, data layout and device are read through the Java
+ * getters getTargetInt(), getPrecisionInt(), getDataLayoutInt() and
+ * getDevice(); the first three are cast to the matching C++ enum types.
+ */
+inline static Place jplace_to_cpp_place(JNIEnv *env, jobject java_place) {
+  jclass place_jclazz = env->GetObjectClass(java_place);
+  jmethodID target_method =
+      env->GetMethodID(place_jclazz, "getTargetInt", "()I");
+  jmethodID precision_method =
+      env->GetMethodID(place_jclazz, "getPrecisionInt", "()I");
+  jmethodID data_layout_method =
+      env->GetMethodID(place_jclazz, "getDataLayoutInt", "()I");
+  jmethodID device_method = env->GetMethodID(place_jclazz, "getDevice", "()I");
+  // The class reference is only needed for the method lookups above; free it
+  // so that repeated conversions do not pile up local references.
+  env->DeleteLocalRef(place_jclazz);
+  int target = env->CallIntMethod(java_place, target_method);
+  int precision = env->CallIntMethod(java_place, precision_method);
+  int data_layout = env->CallIntMethod(java_place, data_layout_method);
+  int device = env->CallIntMethod(java_place, device_method);
+  return Place(static_cast<paddle::lite_api::TargetType>(target),
+               static_cast<paddle::lite_api::PrecisionType>(precision),
+               static_cast<paddle::lite_api::DataLayoutType>(data_layout),
+               device);
+}
+// Returns the product of all entries of `vec`; an empty vector yields 0.
+inline static int64_t product(const std::vector<int64_t> &vec) {
+  int64_t result = vec.empty() ? 0 : 1;
+  for (size_t i = 0; i < vec.size(); ++i) {
+    result *= vec[i];
+  }
+  return result;
+}
+// Creates the global predictor from a CxxConfig built out of `model_path`,
+// `preferred_place` and `valid_places`.
+// Returns JNI_FALSE when a predictor is already loaded, when a place argument
+// (or an element of `valid_places`) is null, or when no predictor is created.
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_loadCxxModel(
+    JNIEnv *env, jclass thiz, jstring model_path, jobject preferred_place,
+    jobjectArray valid_places) {
+  if (predictor != nullptr) {
+    return JNI_FALSE;
+  }
+  if (preferred_place == nullptr || valid_places == nullptr) {
+    return JNI_FALSE;
+  }
+  use_arm_kernels();
+  int valid_place_count = env->GetArrayLength(valid_places);
+  std::vector<Place> cpp_valid_places;
+  for (int i = 0; i < valid_place_count; ++i) {
+    jobject jplace = env->GetObjectArrayElement(valid_places, i);
+    if (jplace == nullptr) {
+      return JNI_FALSE;
+    }
+    cpp_valid_places.push_back(jplace_to_cpp_place(env, jplace));
+    // Free the per-element reference so long arrays cannot exhaust the local
+    // reference table.
+    env->DeleteLocalRef(jplace);
+  }
+  CxxConfig config;
+  config.set_model_dir(jstring_to_cpp_string(env, model_path));
+  config.set_preferred_place(jplace_to_cpp_place(env, preferred_place));
+  config.set_valid_places(cpp_valid_places);
+  predictor = paddle::lite_api::CreatePaddlePredictor(config);
+  return predictor == nullptr ? JNI_FALSE : JNI_TRUE;
+}
+// Creates the global predictor from a MobileConfig pointing at `model_path`.
+// Returns JNI_FALSE when a predictor already exists or none is created.
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_loadMobileModel(JNIEnv *env,
+                                                           jclass thiz,
+                                                           jstring model_path) {
+  if (predictor) {
+    return JNI_FALSE;
+  }
+  use_arm_kernels();
+  const std::string model_dir = jstring_to_cpp_string(env, model_path);
+  MobileConfig config;
+  config.set_model_dir(model_dir);
+  predictor = paddle::lite_api::CreatePaddlePredictor(config);
+  if (predictor == nullptr) {
+    return JNI_FALSE;
+  }
+  return JNI_TRUE;
+}
+// Asks the loaded predictor to save its optimized model to `model_path`.
+// Returns JNI_FALSE when no predictor is loaded.
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_saveOptimizedModel(
+    JNIEnv *env, jclass thiz, jstring model_path) {
+  if (!predictor) {
+    return JNI_FALSE;
+  }
+  const std::string target_dir = jstring_to_cpp_string(env, model_path);
+  predictor->SaveOptimizedModel(target_dir);
+  return JNI_TRUE;
+}
+// Drops the global predictor. Returns JNI_TRUE when a predictor was loaded
+// and has been released, JNI_FALSE when there was nothing to clear.
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_clear(JNIEnv *env, jclass thiz) {
+  if (predictor != nullptr) {
+    predictor.reset();
+    return JNI_TRUE;
+  }
+  return JNI_FALSE;
+}
+// Copies the Java float[] `buf` into the offset-th input tensor, resized to
+// `dims`. Returns JNI_FALSE when no predictor is loaded, when `dims` or `buf`
+// is null, or when buf's length differs from the product of `dims`.
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_setInput__I_3I_3F(
+    JNIEnv *env, jclass thiz, jint offset, jintArray dims, jfloatArray buf) {
+  if (predictor == nullptr) {
+    return JNI_FALSE;
+  }
+  if (dims == nullptr || buf == nullptr) {
+    return JNI_FALSE;
+  }
+  std::vector<int64_t> ddim = jintarray_to_int64_vector(env, dims);
+  int len = env->GetArrayLength(buf);
+  if ((int64_t)len != product(ddim)) {
+    return JNI_FALSE;
+  }
+  std::unique_ptr<Tensor> tensor =
+      predictor->GetInput(static_cast<int>(offset));
+  tensor->Resize(ddim);
+  float *input = tensor->mutable_data<float>();
+  // Copy straight into the tensor; unlike GetFloatArrayElements this leaves
+  // nothing behind that would have to be released.
+  env->GetFloatArrayRegion(buf, 0, len, input);
+  return JNI_TRUE;
+}
+// Copies the Java byte[] `buf` into the offset-th input tensor (int8 data),
+// resized to `dims`. Returns JNI_FALSE when no predictor is loaded, when
+// `dims` or `buf` is null, or when buf's length differs from the product of
+// `dims`.
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_setInput__I_3I_3B(
+    JNIEnv *env, jclass thiz, jint offset, jintArray dims, jbyteArray buf) {
+  if (predictor == nullptr) {
+    return JNI_FALSE;
+  }
+  if (dims == nullptr || buf == nullptr) {
+    return JNI_FALSE;
+  }
+  std::vector<int64_t> ddim = jintarray_to_int64_vector(env, dims);
+  int len = env->GetArrayLength(buf);
+  if ((int64_t)len != product(ddim)) {
+    return JNI_FALSE;
+  }
+  std::unique_ptr<Tensor> tensor =
+      predictor->GetInput(static_cast<int>(offset));
+  tensor->Resize(ddim);
+  int8_t *input = tensor->mutable_data<int8_t>();
+  // Copy straight into the tensor; unlike GetByteArrayElements this leaves
+  // nothing behind that would have to be released.
+  env->GetByteArrayRegion(buf, 0, len, reinterpret_cast<jbyte *>(input));
+  return JNI_TRUE;
+}
+// Runs the loaded predictor once. Returns JNI_FALSE when no predictor is
+// loaded, JNI_TRUE after Run() has been called.
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_run(JNIEnv *, jclass) {
+  if (!predictor) {
+    return JNI_FALSE;
+  }
+  predictor->Run();
+  return JNI_TRUE;
+}
+// Returns a copy of the offset-th output tensor as a Java float[], or null
+// when no predictor is loaded or the tensor is unavailable.
+JNIEXPORT jfloatArray JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_getFloatOutput(JNIEnv *env,
+                                                          jclass thiz,
+                                                          jint offset) {
+  // Like the other entry points, never dereference a missing predictor.
+  if (predictor == nullptr) {
+    return nullptr;
+  }
+  std::unique_ptr<const Tensor> tensor =
+      predictor->GetOutput(static_cast<int>(offset));
+  if (tensor == nullptr) {
+    return nullptr;
+  }
+  int64_t len = product(tensor->shape());
+  return cpp_array_to_jfloatarray(env, tensor->data<float>(), len);
+}
+// Returns a copy of the offset-th output tensor (int8 data) as a Java byte[],
+// or null when no predictor is loaded or the tensor is unavailable.
+JNIEXPORT jbyteArray JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_getByteOutput(JNIEnv *env,
+                                                         jclass thiz,
+                                                         jint offset) {
+  // Like the other entry points, never dereference a missing predictor.
+  if (predictor == nullptr) {
+    return nullptr;
+  }
+  std::unique_ptr<const Tensor> tensor =
+      predictor->GetOutput(static_cast<int>(offset));
+  if (tensor == nullptr) {
+    return nullptr;
+  }
+  int64_t len = product(tensor->shape());
+  return cpp_array_to_jbytearray(env, tensor->data<int8_t>(), len);
+}
+// Returns a copy of the tensor called `name` as a Java float[], or null when
+// no predictor is loaded or the tensor is unavailable.
+JNIEXPORT jfloatArray JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_fetchFloat(JNIEnv *env, jclass thiz,
+                                                      jstring name) {
+  // Like the other entry points, never dereference a missing predictor.
+  if (predictor == nullptr) {
+    return nullptr;
+  }
+  std::string cpp_name = jstring_to_cpp_string(env, name);
+  std::unique_ptr<const Tensor> tensor = predictor->GetTensor(cpp_name);
+  if (tensor == nullptr) {
+    return nullptr;
+  }
+  int64_t len = product(tensor->shape());
+  return cpp_array_to_jfloatarray(env, tensor->data<float>(), len);
+}
+// Returns a copy of the tensor called `name` (int8 data) as a Java byte[], or
+// null when no predictor is loaded or the tensor is unavailable.
+JNIEXPORT jbyteArray JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_fetchByte(JNIEnv *env, jclass thiz,
+                                                     jstring name) {
+  // Like the other entry points, never dereference a missing predictor.
+  if (predictor == nullptr) {
+    return nullptr;
+  }
+  std::string cpp_name = jstring_to_cpp_string(env, name);
+  std::unique_ptr<const Tensor> tensor = predictor->GetTensor(cpp_name);
+  if (tensor == nullptr) {
+    return nullptr;
+  }
+  int64_t len = product(tensor->shape());
+  return cpp_array_to_jbytearray(env, tensor->data<int8_t>(), len);
+}
+#ifdef __cplusplus
+}
+#endif
--- a/paddle/fluid/lite/api/android/jni/paddle_lite_jni.h
+++ b/paddle/fluid/lite/api/android/jni/paddle_lite_jni.h
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+/* DO NOT EDIT THIS FILE - it is machine generated */
+#include <jni.h>
+/* Header for class com_baidu_paddle_lite_PaddlePredictor */
+#ifndef PADDLE_FLUID_LITE_API_ANDROID_JNI_PADDLE_LITE_JNI_H_
+#define PADDLE_FLUID_LITE_API_ANDROID_JNI_PADDLE_LITE_JNI_H_
+#ifdef __cplusplus
+extern "C" {
+#endif
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    loadCxxModel
+ * Signature:
+ * (Ljava/lang/String;Lcom/baidu/paddle/lite/Place;[Lcom/baidu/paddle/lite/Place;)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_loadCxxModel(JNIEnv *, jclass,
+                                                        jstring, jobject,
+                                                        jobjectArray);
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    loadMobileModel
+ * Signature: (Ljava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_loadMobileModel(JNIEnv *, jclass,
+                                                           jstring);
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    saveOptimizedModel
+ * Signature: (Ljava/lang/String;)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_saveOptimizedModel(JNIEnv *, jclass,
+                                                              jstring);
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    clear
+ * Signature: ()Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_clear(JNIEnv *, jclass);
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    setInput
+ * Signature: (I[I[F)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_setInput__I_3I_3F(JNIEnv *, jclass,
+                                                             jint, jintArray,
+                                                             jfloatArray);
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    setInput
+ * Signature: (I[I[B)Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_setInput__I_3I_3B(JNIEnv *, jclass,
+                                                             jint, jintArray,
+                                                             jbyteArray);
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    run
+ * Signature: ()Z
+ */
+JNIEXPORT jboolean JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_run(JNIEnv *, jclass);
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    getFloatOutput
+ * Signature: (I)[F
+ */
+JNIEXPORT jfloatArray JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_getFloatOutput(JNIEnv *, jclass,
+                                                          jint);
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    getByteOutput
+ * Signature: (I)[B
+ */
+JNIEXPORT jbyteArray JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_getByteOutput(JNIEnv *, jclass,
+                                                         jint);
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    fetchFloat
+ * Signature: (Ljava/lang/String;)[F
+ */
+JNIEXPORT jfloatArray JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_fetchFloat(JNIEnv *, jclass,
+                                                      jstring);
+/*
+ * Class:     com_baidu_paddle_lite_PaddlePredictor
+ * Method:    fetchByte
+ * Signature: (Ljava/lang/String;)[B
+ */
+JNIEXPORT jbyteArray JNICALL
+Java_com_baidu_paddle_lite_PaddlePredictor_fetchByte(JNIEnv *, jclass, jstring);
+#ifdef __cplusplus
+}
+#endif
+#endif  // PADDLE_FLUID_LITE_API_ANDROID_JNI_PADDLE_LITE_JNI_H_
--- a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/.gitignore
+/PaddleLite.class
+/PaddleLiteTest.class
--- a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/PaddlePredictor.java
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+package com.baidu.paddle.lite;
+/** Java Native Interface (JNI) class for Paddle Lite APIs */
+public class PaddlePredictor {
+    /** name of C++ JNI lib */
+    private final static String JNI_LIB_NAME = "paddle_lite_jni";
+    /* load the C++ JNI lib */
+    static {
+        System.loadLibrary(JNI_LIB_NAME);
+    }
+    /**
+     * Loads mobile cxx model, which is the model before optimizing passes. The cxx
+     * model allow users to manage hardware place resources. Caller uses a place at
+     * Java to control Target, DataLayout, Precision, and Device ID. More details
+     * about the four fields see our Paddle-Mobile document.
+     * 
+     * 
+     * @param modelPath      modelPath model file path
+     * @param preferredPlace preferred place to run Cxx Model
+     * @param validPlaces    n * 4 int array, valid places to run Cxx Model
+     * @return true if load successfully
+     */
+    public static native boolean loadCxxModel(String modelPath, Place preferredPlace, Place[] validPlaces);
+    /**
+     * Loads mobile lite model, which is the model after optimizing passes.
+     *
+     * @param modelPath model file path
+     * @return true if load successfully
+     */
+    public static native boolean loadMobileModel(String modelPath);
+    /**
+     * Saves optimized model, which is the model can be used by
+     * {@link loadMobileModel}
+     * 
+     * @param modelPath model file path
+     * @return true if save successfully
+     */
+    public static native boolean saveOptimizedModel(String modelPath);
+    /**
+     * Clears the current loaded model.
+     * 
+     * @return true if a loaded model has been cleared.
+     */
+    public static native boolean clear();
+    /**
+     * Set input data on offset-th column of feed data
+     *
+     * @param offset the offset-th column of feed data will be set
+     * @param buf    the input data
+     * @param dims   dimension format of the input image
+     * @return true if set successfully
+     */
+    public static native boolean setInput(int offset, int[] dims, float[] buf);
+    /**
+     * Set input data on offset-th column of feed data
+     *
+     * @param offset the offset-th column of feed data will be set
+     * @param buf    the input data
+     * @param dims   dimension format of the input image
+     * @return true if set successfully
+     */
+    public static native boolean setInput(int offset, int[] dims, byte[] buf);
+    /**
+     * Runs the prediction model.
+     *
+     * @return true if the run finishes successfully
+     */
+    public static native boolean run();
+    /**
+     * Gets the offset-th column of output data as float values.
+     *
+     * @param offset the offset-th column of output data will be returned
+     * @return model predict output
+     */
+    public static native float[] getFloatOutput(int offset);
+    /**
+     * Gets the offset-th column of output data as byte values (int8 on the
+     * C++ side).
+     *
+     * @param offset the offset-th column of output data will be returned
+     * @return model predict output
+     */
+    public static native byte[] getByteOutput(int offset);
+    /**
+     * Fetches a Tensor's value as float data.
+     *
+     * @param name the Tensor's name
+     * @return values of the Tensor
+     */
+    public static native float[] fetchFloat(String name);
+    /**
+     * Fetches a Tensor's value as byte data (int8 on the C++ side).
+     *
+     * @param name the Tensor's name
+     * @return values of the Tensor
+     */
+    public static native byte[] fetchByte(String name);
+    /**
+     * Entry point used for testing; it only prints a confirmation message.
+     *
+     * @param args command-line arguments (unused)
+     */
+    public static void main(String[] args) {
+        final String message = "Load native library successfully";
+        System.out.println(message);
+    }
+}
--- a/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java
+++ b/paddle/fluid/lite/api/android/jni/src/com/baidu/paddle/lite/Place.java
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+package com.baidu.paddle.lite;
+/**
+ * Place specifies the execution context of a Kernel or input/output for a
+ * kernel. It is used to make the analysis of the MIR more clear and accurate.
+ * <p>
+ * A Place is described by four fields: target, precision, data layout and
+ * device id.
+ */
+public class Place {
+    /** Target types; {@code value} is the integer code of each constant. */
+    public enum TargetType {
+        UNKNOWN(0), HOST(1), X86(2), CUDA(3), ARM(4), OPEN_CL(5), ANY(6);
+        public final int value;
+        private TargetType(int value) {
+            this.value = value;
+        }
+    }
+    /** Precision types; {@code value} is the integer code of each constant. */
+    public enum PrecisionType {
+        UNKNOWN(0), FLOAT(1), INT8(2), INT32(3), ANY(4);
+        public final int value;
+        private PrecisionType(int value) {
+            this.value = value;
+        }
+    }
+    /** Data layouts; {@code value} is the integer code of each constant. */
+    public enum DataLayoutType {
+        UNKNOWN(0), NCHW(1), ANY(2);
+        public final int value;
+        private DataLayoutType(int value) {
+            this.value = value;
+        }
+    }
+    public TargetType target;
+    public PrecisionType precision;
+    public DataLayoutType layout;
+    public int device;
+    /** Creates an invalid place: every enum field is UNKNOWN and device is 0. */
+    public Place() {
+        this(TargetType.UNKNOWN, PrecisionType.UNKNOWN, DataLayoutType.UNKNOWN, 0);
+    }
+    /** Creates a place with FLOAT precision, NCHW layout and device 0. */
+    public Place(TargetType target) {
+        this(target, PrecisionType.FLOAT);
+    }
+    /** Creates a place with NCHW layout and device 0. */
+    public Place(TargetType target, PrecisionType precision) {
+        this(target, precision, DataLayoutType.NCHW);
+    }
+    /** Creates a place on device 0. */
+    public Place(TargetType target, PrecisionType precision, DataLayoutType layout) {
+        this(target, precision, layout, 0);
+    }
+    /**
+     * Creates a place from all four fields.
+     *
+     * @param target    target type
+     * @param precision precision type
+     * @param layout    data layout type
+     * @param device    device id
+     */
+    public Place(TargetType target, PrecisionType precision, DataLayoutType layout, int device) {
+        this.target = target;
+        this.precision = precision;
+        this.layout = layout;
+        this.device = device;
+    }
+    /**
+     * Tells whether target, precision and layout are all set to a known value.
+     *
+     * @return false if any of the three fields is null or UNKNOWN
+     */
+    public boolean isValid() {
+        // The fields are public and the constructors accept null, so null has
+        // to be rejected explicitly: null != UNKNOWN would otherwise be true.
+        return target != null && target != TargetType.UNKNOWN
+                && precision != null && precision != PrecisionType.UNKNOWN
+                && layout != null && layout != DataLayoutType.UNKNOWN;
+    }
+    /** @return the integer code of {@link #target} */
+    public int getTargetInt() {
+        return target.value;
+    }
+    /** @return the integer code of {@link #precision} */
+    public int getPrecisionInt() {
+        return precision.value;
+    }
+    /** @return the integer code of {@link #layout} */
+    public int getDataLayoutInt() {
+        return layout.value;
+    }
+    /** @return the device id */
+    public int getDevice() {
+        return device;
+    }
+}
--- a/paddle/fluid/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java
+++ b/paddle/fluid/lite/api/android/jni/test/com/baidu/paddle/lite/PaddlePredictorTest.java
+/* Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+package com.baidu.paddle.lite;
+import org.junit.jupiter.api.Test;
+import static org.junit.Assert.assertEquals;
+class PaddlePredictorTest {
+    /**
+     * Loads the mobile model with an empty path, feeds a 100 x 100 float input
+     * whose i-th element is i, runs the predictor and checks the length and the
+     * first two values of output 0.
+     */
+    @Test
+    public void run_defaultModel() {
+        PaddlePredictor.loadMobileModel("");
+        float[] inputBuffer = new float[10000];
+        for (int i = 0; i < 10000; ++i) {
+            inputBuffer[i] = i;
+        }
+        int[] dims = { 100, 100 };
+        PaddlePredictor.setInput(0, dims, inputBuffer);
+        PaddlePredictor.run();
+        float[] output = PaddlePredictor.getFloatOutput(0);
+        // assertEquals takes (expected, actual[, delta]); keeping that order
+        // makes the failure message report the right value as "expected".
+        assertEquals(50000, output.length);
+        assertEquals(50.2132f, output[0], 1e-3f);
+        assertEquals(-28.8729f, output[1], 1e-3f);
+        PaddlePredictor.clear();
+    }
+}
--- a/paddle/fluid/lite/api/model_test.cc
+++ b/paddle/fluid/lite/api/model_test.cc
+// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
+//
+// Licensed under the Apache License, Version 2.0 (the "License");
+// you may not use this file except in compliance with the License.
+// You may obtain a copy of the License at
+//
+//     http://www.apache.org/licenses/LICENSE-2.0
+//
+// Unless required by applicable law or agreed to in writing, software
+// distributed under the License is distributed on an "AS IS" BASIS,
+// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+// See the License for the specific language governing permissions and
+// limitations under the License.
+#include <glog/logging.h>
+#include <string>
+#include <vector>
+#include "paddle/fluid/lite/api/paddle_api.h"
+#include "paddle/fluid/lite/api/paddle_use_kernels.h"
+#include "paddle/fluid/lite/api/paddle_use_ops.h"
+#include "paddle/fluid/lite/api/paddle_use_passes.h"
+#include "paddle/fluid/lite/api/test_helper.h"
+#include "paddle/fluid/lite/core/cpu_info.h"
+#include "paddle/fluid/lite/utils/string.h"
+namespace paddle {
+namespace lite_api {
+// Loads the model stored in `load_model_dir` with a CxxConfig predictor, runs
+// it once on an input of shape `input_shape` filled with 0, 1, 2, ..., and
+// saves the optimized model to `save_optimized_model_dir`. Anything already
+// present at `save_optimized_model_dir` is removed first.
+void OutputOptModel(const std::string& load_model_dir,
+                    const std::string& save_optimized_model_dir,
+                    const std::vector<int64_t>& input_shape) {
+  lite_api::CxxConfig config;
+  config.set_model_dir(load_model_dir);
+  config.set_preferred_place(Place{TARGET(kX86), PRECISION(kFloat)});
+  config.set_valid_places({
+      Place{TARGET(kX86), PRECISION(kFloat)},
+      Place{TARGET(kARM), PRECISION(kFloat)},
+  });
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+  auto input_tensor = predictor->GetInput(0);
+  input_tensor->Resize(input_shape);
+  auto* data = input_tensor->mutable_data<float>();
+  // The dimensions are int64_t, so the element count is accumulated in the
+  // same type instead of being truncated to int.
+  int64_t input_num = 1;
+  for (const int64_t dim : input_shape) {
+    input_num *= dim;
+  }
+  for (int64_t i = 0; i < input_num; ++i) {
+    data[i] = static_cast<float>(i);
+  }
+  predictor->Run();
+  // delete old optimized model
+  // The path is quoted so that a directory name containing spaces is passed
+  // to rm as a single argument. NOTE(review): characters that stay special
+  // inside double quotes (", $, `, \) are still not escaped.
+  int ret = system(paddle::lite::string_format(
+                       "rm -rf \"%s\"", save_optimized_model_dir.c_str())
+                       .c_str());
+  if (ret == 0) {
+    LOG(INFO) << "delete old optimized model " << save_optimized_model_dir;
+  }
+  predictor->SaveOptimizedModel(save_optimized_model_dir);
+  LOG(INFO) << "Load model from " << load_model_dir;
+  LOG(INFO) << "Save optimized model to " << save_optimized_model_dir;
+}
+#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+// Benchmarks the optimized model in `model_dir` with a MobileConfig predictor.
+// The input of shape `input_shape` is filled with 0, 1, 2, ...; the predictor
+// is run `warmup_times` times untimed and then `repeat` times timed, using
+// `thread_num` threads. The average latency, the first output values and the
+// number of output elements are logged.
+void Run(const std::vector<int64_t>& input_shape, const std::string& model_dir,
+         const int repeat, const int thread_num, const int warmup_times = 10) {
+  lite::DeviceInfo::Init();
+  lite::DeviceInfo::Global().SetRunMode(lite::LITE_POWER_HIGH, thread_num);
+  lite_api::MobileConfig config;
+  config.set_model_dir(model_dir);
+  auto predictor = lite_api::CreatePaddlePredictor(config);
+  auto input_tensor = predictor->GetInput(0);
+  input_tensor->Resize(input_shape);
+  float* input_data = input_tensor->mutable_data<float>();
+  // The dimensions are int64_t, so the element count is kept in that type.
+  int64_t input_num = 1;
+  for (const int64_t dim : input_shape) {
+    input_num *= dim;
+  }
+  for (int64_t i = 0; i < input_num; ++i) {
+    input_data[i] = static_cast<float>(i);
+  }
+  for (int i = 0; i < warmup_times; ++i) {
+    predictor->Run();
+  }
+  auto start = lite::GetCurrentUS();
+  for (int i = 0; i < repeat; ++i) {
+    predictor->Run();
+  }
+  auto end = lite::GetCurrentUS();
+  // Without timed runs there is nothing to average; report 0 instead of
+  // dividing by zero.
+  const double average_ms =
+      repeat > 0 ? (end - start) / 1000.0 / repeat : 0.0;
+  LOG(INFO) << "================== Speed Report ===================";
+  LOG(INFO) << "Model: " << model_dir << ", threads num " << thread_num
+            << ", warmup: " << warmup_times << ", repeats: " << repeat
+            << ", spend " << average_ms << " ms in average.";
+  auto output = predictor->GetOutput(0);
+  auto output_shape = output->shape();
+  int64_t output_num = 1;
+  for (const auto dim : output_shape) {
+    output_num *= dim;
+  }
+  const float* out = output->data<float>();
+  // Log at most the first two values, never reading past the output size.
+  for (int64_t i = 0; i < output_num && i < 2; ++i) {
+    LOG(INFO) << "out " << out[i];
+  }
+  LOG(INFO) << "output_num: " << output_num;
+}
+#endif
+}  // namespace lite_api
+}  // namespace paddle
+// Usage: <binary> <model_dir> <repeat> <thread_num>
+// Writes the optimized model to "<model_dir>opt2" and, when built with
+// LITE_WITH_LIGHT_WEIGHT_FRAMEWORK, benchmarks that optimized model.
+// Prints the usage and finishes with status 0 when arguments are missing.
+int main(int argc, char** argv) {
+  if (argc < 4) {
+    LOG(INFO) << "usage: " << argv[0] << " <model_dir> <repeat> <thread_num>";
+    return 0;
+  }
+  const std::string model_dir(argv[1]);
+  const std::string optimized_model_dir = model_dir + "opt2";
+#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+  const int repeat_count = std::stoi(argv[2]);
+  const int num_threads = std::stoi(argv[3]);
+#endif
+  const std::vector<int64_t> input_dims{1, 3, 224, 224};
+  // Output optimized model
+  paddle::lite_api::OutputOptModel(model_dir, optimized_model_dir,
+                                   input_dims);
+#ifdef LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
+  // Run inference using optimized model
+  paddle::lite_api::Run(input_dims, optimized_model_dir, repeat_count,
+                        num_threads);
+#endif
+  return 0;
+}
--- a/paddle/fluid/lite/api/test_helper.h
+++ b/paddle/fluid/lite/api/test_helper.h
@@ -15,6 +15,7 @@
 #pragma once
 #include <gflags/gflags.h>
+#include <sys/time.h>
 #include <time.h>
 // for eval

--- a/paddle/fluid/lite/kernels/arm/conv_compute_test.cc
+++ b/paddle/fluid/lite/kernels/arm/conv_compute_test.cc
@@ -469,6 +469,389 @@ TEST(conv_arm_int8, int8_fp32) {
  }
 }
+// Verifies lite::arm::math::DirectConvInt8 against conv_compute_ref for 3x3
+// kernels with group 1, stride 1 or 2, padding 1 and dilation 1, with and
+// without bias, for int32, int8 and fp32 outputs. fuse_relu is always false,
+// so the fused-ReLU path is not exercised here.
+TEST(conv_direct_int8, compute) {
+  DeviceInfo::Init();
+  // Depthwise convolution is not exercised by this test.
+  const int group = 1;
+  for (auto n : {1, 2}) {
+    for (auto ic : {1, 3, 8}) {
+      for (auto oc : {1, 3, 8}) {
+        for (auto ih : {5, 15, 28}) {
+          for (auto iw : {5, 15, 28}) {
+            for (auto flag_bias : {false, true}) {
+              for (auto dilation : {1}) {
+                for (auto stride : {1, 2}) {
+                  for (auto padding : {1}) {
+                    for (auto ks : {3}) {
+                      const int dks = dilation * (ks - 1) + 1;
+                      const int oh = (ih + 2 * padding - dks) / stride + 1;
+                      const int ow = (iw + 2 * padding - dks) / stride + 1;
+                      std::vector<int64_t> input_shape = {n, ic, ih, iw};
+                      std::vector<int64_t> filter_shape = {oc, ic / group, ks,
+                                                           ks};
+                      std::vector<int64_t> bias_shape({1, oc, 1, 1});
+                      std::vector<int64_t> output_shape({n, oc, oh, ow});
+                      Tensor input_fp32, input_int8;
+                      Tensor filter_fp32, filter_int8;
+                      Tensor bias_int32;
+                      Tensor output_int32_ref, output_int32;
+                      Tensor output_fp32_ref, output_fp32;
+                      Tensor output_int8_ref, output_int8;
+                      input_fp32.Resize(input_shape);
+                      input_int8.Resize(input_shape);
+                      filter_fp32.Resize(filter_shape);
+                      filter_int8.Resize(filter_shape);
+                      bias_int32.Resize(bias_shape);
+                      output_int32.Resize(output_shape);
+                      output_int32_ref.Resize(output_shape);
+                      output_fp32_ref.Resize(output_shape);
+                      output_fp32.Resize(output_shape);
+                      output_int8_ref.Resize(output_shape);
+                      output_int8.Resize(output_shape);
+                      float* input_fp32_data = input_fp32.mutable_data<float>();
+                      // The pointer is not needed here; only the call is kept.
+                      input_int8.mutable_data<int8_t>();
+                      float* filter_fp32_data =
+                          filter_fp32.mutable_data<float>();
+                      int8_t* filter_int8_data =
+                          filter_int8.mutable_data<int8_t>();
+                      int* bias_int32_data = bias_int32.mutable_data<int32_t>();
+                      // Deterministic test pattern with values in [-9, 9].
+                      for (int i = 0; i < input_fp32.dims().production(); i++) {
+                        input_fp32_data[i] = i % 10 * (i % 3 - 1);
+                      }
+                      for (int i = 0; i < filter_fp32.dims().production();
+                           i++) {
+                        filter_fp32_data[i] = i % 10 * (i % 3 - 1);
+                      }
+                      for (int i = 0; i < bias_int32.dims().production(); i++) {
+                        bias_int32_data[i] = i % 10 * (i % 3 - 1);
+                      }
+                      // Quantize the input with the scale in in_scale[0].
+                      std::vector<float> in_scale;
+                      lite::arm::math::get_tensor_scale<PRECISION(kFloat)>(
+                          input_fp32, &in_scale, -1, 127.f);
+                      lite::arm::math::trans_tensor_fp32_to_int8(
+                          &input_fp32, &input_int8, in_scale[0]);
+                      // Quantize the filter with the scales returned by
+                      // get_tensor_scale_n for axis_size groups of inner_size
+                      // values.
+                      const int axis_size = oc;
+                      const int inner_size = ic / group * ks * ks;
+                      std::vector<float> w_scale;
+                      w_scale = lite::arm::math::get_tensor_scale_n(
+                          filter_fp32_data, axis_size, inner_size, 127.f);
+                      lite::arm::math::fp32_to_int8(
+                          filter_fp32_data, filter_int8_data, w_scale.data(),
+                          axis_size, 1, inner_size);
+                      operators::ConvParam param;
+                      param.x = &input_int8;
+                      param.filter = &filter_int8;
+                      if (flag_bias) {
+                        param.bias = &bias_int32;
+                      }
+                      param.fuse_relu = false;
+                      param.paddings = std::vector<int>({padding, padding});
+                      param.strides = std::vector<int>({stride, stride});
+                      param.dilations = std::vector<int>({dilation, dilation});
+                      param.groups = group;
+                      // Reference result, computed with int32 output.
+                      param.output = &output_int32_ref;
+                      conv_compute_ref<int8_t, int>(param);
+                      int* output_int32_ref_data =
+                          output_int32_ref.mutable_data<int>();
+                      // ============ int8direct_int32 ============
+                      param.output = &output_int32;
+                      std::unique_ptr<KernelContext> ctx_int32(
+                          new KernelContext);
+                      lite::arm::math::DirectConvInt8<PRECISION(kInt32)>
+                          int8direct_int32;
+                      int8direct_int32.init(param,
+                                            &ctx_int32->As<ARMContext>());
+                      int8direct_int32.create(param,
+                                              &ctx_int32->As<ARMContext>());
+                      int8direct_int32.run(param);
+                      int* output_int32_data = output_int32.mutable_data<int>();
+                      // Integer outputs have to match the reference exactly.
+                      for (int i = 0; i < output_int32.dims().production();
+                           i++) {
+                        EXPECT_EQ(output_int32_data[i],
+                                  output_int32_ref_data[i]);
+                      }
+                      // ============ int8direct_int8 ============
+                      int8_t* output_int8_ref_data =
+                          output_int8_ref.mutable_data<int8_t>();
+                      lite::arm::math::trans_tensor_int32_to_int8(
+                          &output_int32_ref, &output_int8_ref, in_scale[0], 1,
+                          w_scale);
+                      param.output = &output_int8;
+                      param.input_scale = in_scale[0];
+                      param.output_scale = 1;
+                      param.weight_scale = w_scale;
+                      std::unique_ptr<KernelContext> ctx_int8(
+                          new KernelContext);
+                      lite::arm::math::DirectConvInt8<PRECISION(kInt8)>
+                          int8direct_int8;
+                      int8direct_int8.init(param, &ctx_int8->As<ARMContext>());
+                      int8direct_int8.create(param,
+                                             &ctx_int8->As<ARMContext>());
+                      int8direct_int8.run(param);
+                      int8_t* output_int8_data =
+                          output_int8.mutable_data<int8_t>();
+                      // Cast to int so that a mismatch is printed as a number.
+                      for (int i = 0; i < output_int8.dims().production();
+                           i++) {
+                        EXPECT_EQ(static_cast<int>(output_int8_data[i]),
+                                  static_cast<int>(output_int8_ref_data[i]));
+                      }
+                      // ============ int8direct_float32 ============
+                      float* output_fp32_ref_data =
+                          output_fp32_ref.mutable_data<float>();
+                      lite::arm::math::trans_tensor_int32_to_fp32(
+                          &output_int32_ref, &output_fp32_ref, in_scale[0],
+                          w_scale);
+                      param.output = &output_fp32;
+                      param.input_scale = in_scale[0];
+                      param.output_scale = 1;
+                      param.weight_scale = w_scale;
+                      std::unique_ptr<KernelContext> ctx_fp32(
+                          new KernelContext);
+                      lite::arm::math::DirectConvInt8<PRECISION(kFloat)>
+                          int8direct_fp32;
+                      int8direct_fp32.init(param, &ctx_fp32->As<ARMContext>());
+                      int8direct_fp32.create(param,
+                                             &ctx_fp32->As<ARMContext>());
+                      int8direct_fp32.run(param);
+                      float* output_fp32_data =
+                          output_fp32.mutable_data<float>();
+                      for (int i = 0; i < output_fp32.dims().production();
+                           i++) {
+                        EXPECT_NEAR(output_fp32_data[i],
+                                    output_fp32_ref_data[i], 1e-3);
+                      }
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
+// Verifies lite::arm::math::DepthwiseConvInt8 against conv_compute_ref for
+// depthwise convolutions (group == oc == ic), with and without bias, for
+// int32, int8 and fp32 outputs. Only 3x3 kernels are iterated, so the only
+// configurations that pass the flag_dw filter are padding 1 with stride 1 or
+// 2. fuse_relu is always false, so the fused-ReLU path is not exercised here.
+TEST(conv_depthwise_int8, compute) {
+  DeviceInfo::Init();
+  for (auto n : {1, 2}) {
+    for (auto ic : {1, 3, 8}) {
+      for (auto ih : {5, 15, 28}) {
+        for (auto iw : {5, 15, 28}) {
+          for (auto flag_bias : {false, true}) {
+            for (auto dilation : {1}) {
+              for (auto stride : {1, 2}) {
+                for (auto padding : {1, 2}) {
+                  for (auto ks : {3, /*5 */}) {
+                    const int group = ic;
+                    const int oc = ic;
+                    const bool flag_dw_3x3 = (ks == 3) && (padding == 1) &&
+                                             (stride == 1 || stride == 2);
+                    const bool flag_dw_5x5 =
+                        (ks == 5 && stride == 1 && padding == 2);
+                    // Skip configurations outside the two shapes above.
+                    if (!(flag_dw_3x3 || flag_dw_5x5)) continue;
+                    const int dks = dilation * (ks - 1) + 1;
+                    const int oh = (ih + 2 * padding - dks) / stride + 1;
+                    const int ow = (iw + 2 * padding - dks) / stride + 1;
+                    std::vector<int64_t> input_shape = {n, ic, ih, iw};
+                    std::vector<int64_t> filter_shape = {oc, ic / group, ks,
+                                                         ks};
+                    std::vector<int64_t> bias_shape({1, oc, 1, 1});
+                    std::vector<int64_t> output_shape({n, oc, oh, ow});
+                    Tensor input_fp32, input_int8;
+                    Tensor filter_fp32, filter_int8;
+                    Tensor bias_int32;
+                    Tensor output_int32_ref, output_int32;
+                    Tensor output_fp32_ref, output_fp32;
+                    Tensor output_int8_ref, output_int8;
+                    input_fp32.Resize(input_shape);
+                    input_int8.Resize(input_shape);
+                    filter_fp32.Resize(filter_shape);
+                    filter_int8.Resize(filter_shape);
+                    bias_int32.Resize(bias_shape);
+                    output_int32.Resize(output_shape);
+                    output_int32_ref.Resize(output_shape);
+                    output_fp32_ref.Resize(output_shape);
+                    output_fp32.Resize(output_shape);
+                    output_int8_ref.Resize(output_shape);
+                    output_int8.Resize(output_shape);
+                    float* input_fp32_data = input_fp32.mutable_data<float>();
+                    // The pointer is not needed here; only the call is kept.
+                    input_int8.mutable_data<int8_t>();
+                    float* filter_fp32_data = filter_fp32.mutable_data<float>();
+                    int8_t* filter_int8_data =
+                        filter_int8.mutable_data<int8_t>();
+                    int* bias_int32_data = bias_int32.mutable_data<int32_t>();
+                    // Deterministic test pattern with values in [-9, 9].
+                    for (int i = 0; i < input_fp32.dims().production(); i++) {
+                      input_fp32_data[i] = i % 10 * (i % 3 - 1);
+                    }
+                    for (int i = 0; i < filter_fp32.dims().production(); i++) {
+                      filter_fp32_data[i] = i % 10 * (i % 3 - 1);
+                    }
+                    for (int i = 0; i < bias_int32.dims().production(); i++) {
+                      bias_int32_data[i] = i % 10 * (i % 3 - 1);
+                    }
+                    // Quantize the input with the scale in in_scale[0].
+                    std::vector<float> in_scale;
+                    lite::arm::math::get_tensor_scale<PRECISION(kFloat)>(
+                        input_fp32, &in_scale, -1, 127.f);
+                    lite::arm::math::trans_tensor_fp32_to_int8(
+                        &input_fp32, &input_int8, in_scale[0]);
+                    // Quantize the filter with the scales returned by
+                    // get_tensor_scale_n for axis_size groups of inner_size
+                    // values.
+                    const int axis_size = oc;
+                    const int inner_size = ic / group * ks * ks;
+                    std::vector<float> w_scale;
+                    w_scale = lite::arm::math::get_tensor_scale_n(
+                        filter_fp32_data, axis_size, inner_size, 127.f);
+                    lite::arm::math::fp32_to_int8(
+                        filter_fp32_data, filter_int8_data, w_scale.data(),
+                        axis_size, 1, inner_size);
+                    operators::ConvParam param;
+                    param.x = &input_int8;
+                    param.filter = &filter_int8;
+                    if (flag_bias) {
+                      param.bias = &bias_int32;
+                    }
+                    param.fuse_relu = false;
+                    param.paddings = std::vector<int>({padding, padding});
+                    param.strides = std::vector<int>({stride, stride});
+                    param.dilations = std::vector<int>({dilation, dilation});
+                    param.groups = group;
+                    // Reference result, computed with int32 output.
+                    param.output = &output_int32_ref;
+                    conv_compute_ref<int8_t, int>(param);
+                    int* output_int32_ref_data =
+                        output_int32_ref.mutable_data<int>();
+                    // ============ int8depthwise_int32 ============
+                    param.output = &output_int32;
+                    std::unique_ptr<KernelContext> ctx_int32(new KernelContext);
+                    lite::arm::math::DepthwiseConvInt8<PRECISION(kInt32)>
+                        int8depthwise_int32;
+                    int8depthwise_int32.init(param,
+                                             &ctx_int32->As<ARMContext>());
+                    int8depthwise_int32.create(param,
+                                               &ctx_int32->As<ARMContext>());
+                    int8depthwise_int32.run(param);
+                    int* output_int32_data = output_int32.mutable_data<int>();
+                    // Integer outputs have to match the reference exactly.
+                    for (int i = 0; i < output_int32.dims().production(); i++) {
+                      EXPECT_EQ(output_int32_data[i], output_int32_ref_data[i]);
+                    }
+                    // ============ int8depthwise_int8 ============
+                    int8_t* output_int8_ref_data =
+                        output_int8_ref.mutable_data<int8_t>();
+                    lite::arm::math::trans_tensor_int32_to_int8(
+                        &output_int32_ref, &output_int8_ref, in_scale[0], 1,
+                        w_scale);
+                    param.output = &output_int8;
+                    param.input_scale = in_scale[0];
+                    param.output_scale = 1;
+                    param.weight_scale = w_scale;
+                    std::unique_ptr<KernelContext> ctx_int8(new KernelContext);
+                    lite::arm::math::DepthwiseConvInt8<PRECISION(kInt8)>
+                        int8depthwise_int8;
+                    int8depthwise_int8.init(param,
+                                            &ctx_int8->As<ARMContext>());
+                    int8depthwise_int8.create(param,
+                                              &ctx_int8->As<ARMContext>());
+                    int8depthwise_int8.run(param);
+                    int8_t* output_int8_data =
+                        output_int8.mutable_data<int8_t>();
+                    // Cast to int so that a mismatch is printed as a number.
+                    for (int i = 0; i < output_int8.dims().production(); i++) {
+                      EXPECT_EQ(static_cast<int>(output_int8_data[i]),
+                                static_cast<int>(output_int8_ref_data[i]));
+                    }
+                    // ============ int8depthwise_float32 ============
+                    float* output_fp32_ref_data =
+                        output_fp32_ref.mutable_data<float>();
+                    lite::arm::math::trans_tensor_int32_to_fp32(
+                        &output_int32_ref, &output_fp32_ref, in_scale[0],
+                        w_scale);
+                    param.output = &output_fp32;
+                    param.input_scale = in_scale[0];
+                    param.output_scale = 1;
+                    param.weight_scale = w_scale;
+                    std::unique_ptr<KernelContext> ctx_fp32(new KernelContext);
+                    lite::arm::math::DepthwiseConvInt8<PRECISION(kFloat)>
+                        int8depthwise_fp32;
+                    int8depthwise_fp32.init(param,
+                                            &ctx_fp32->As<ARMContext>());
+                    int8depthwise_fp32.create(param,
+                                              &ctx_fp32->As<ARMContext>());
+                    int8depthwise_fp32.run(param);
+                    float* output_fp32_data = output_fp32.mutable_data<float>();
+                    for (int i = 0; i < output_fp32.dims().production(); i++) {
+                      EXPECT_NEAR(output_fp32_data[i], output_fp32_ref_data[i],
+                                  1e-3);
+                    }
+                  }
+                }
+              }
+            }
+          }
+        }
+      }
+    }
+  }
+}
 TEST(conv_arm, compute) {
  DeviceInfo::Init();
 #if 1