机器未来 / Paddle (forked from PaddlePaddle / Paddle)
Commit 8202e25d
Authored Jun 21, 2019 by nhzlx

    Merge branch 'incubate/lite' of http://10.87.145.36/inference/paddlelite into xzl/incubate/lite

Parents: f4cc504f, dfbc4b50

Showing 33 changed files with 932 additions and 685 deletions (+932 −685)
Changed files:

  .gitlab-ci.yml  +81 −3
  CMakeLists.txt  +10 −9
  paddle/fluid/lite/CMakeLists.txt  +4 −1
  paddle/fluid/lite/api/CMakeLists.txt  +28 −15
  paddle/fluid/lite/api/cxx_api_test.cc  +0 −35
  paddle/fluid/lite/api/inceptionv4_test.cc  +65 −0
  paddle/fluid/lite/api/mobilenetv1_test.cc  +64 −0
  paddle/fluid/lite/api/mobilenetv2_test.cc  +63 −0
  paddle/fluid/lite/api/resnet50_test.cc  +64 −0
  paddle/fluid/lite/arm/math/CMakeLists.txt  +1 −1
  paddle/fluid/lite/arm/math/type_trans.cpp  +0 −579
  paddle/fluid/lite/core/mir/CMakeLists.txt  +5 −2
  paddle/fluid/lite/kernels/arm/CMakeLists.txt  +2 −0
  paddle/fluid/lite/kernels/arm/calib_compute.cc  +57 −0
  paddle/fluid/lite/kernels/arm/calib_compute.h  +38 −0
  paddle/fluid/lite/kernels/arm/calib_compute_test.cc  +149 −0
  paddle/fluid/lite/kernels/arm/dropout_compute.cc  +1 −0
  paddle/fluid/lite/kernels/use_kernels.h  +2 −0
  paddle/fluid/lite/kernels/x86/relu_compute.h  +4 −4
  paddle/fluid/lite/kernels/x86/relu_compute_test.cc  +3 −3
  paddle/fluid/lite/operators/CMakeLists.txt  +3 −0
  paddle/fluid/lite/operators/calib_op.cc  +56 −0
  paddle/fluid/lite/operators/calib_op.h  +59 −0
  paddle/fluid/lite/operators/calib_op_test.cc  +64 −0
  paddle/fluid/lite/operators/dropout_op.cc  +7 −4
  paddle/fluid/lite/operators/elementwise_ops.h  +1 −0
  paddle/fluid/lite/operators/fusion_elementwise_activation_ops.cc  +44 −2
  paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h  +16 −5
  paddle/fluid/lite/operators/op_params.h  +8 −5
  paddle/fluid/lite/operators/relu_op.cc  +8 −8
  paddle/fluid/lite/operators/relu_op.h  +1 −1
  paddle/fluid/lite/operators/use_ops.h  +1 −0
  paddle/fluid/lite/tools/build.sh  +23 −8
.gitlab-ci.yml

@@ -114,6 +114,32 @@ build:mobile_armlinux:
     - $MOBILE_LITE_CACHE1
     - ~/.ccache
 
+build:mobile_model_mobilenetv1:
+  tags:
+    - lite
+  stage: build_mobile
+  image: $MOBILE_LITE_DOCKER_IMAGE
+  cache:
+    key: mobile_thirdparty
+    paths:
+      - $MOBILE_LITE_CACHE0
+      - $MOBILE_LITE_CACHE1
+      - ~/.ccache
+  script:
+    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv1
+    - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv1
+  dependencies:
+    - build:server
+  cache:
+    key: mobile_thirdparty
+    paths:
+      - $MOBILE_LITE_CACHE0
+      - $MOBILE_LITE_CACHE1
+      - ~/.ccache
+      - $CI_PROJECT_DIR/build_mobile_model_mobilenetv1
+
 build:mobile_model_mobilenetv2:
   tags:
     - lite

@@ -126,8 +152,34 @@ build:mobile_model_mobilenetv2:
     - $MOBILE_LITE_CACHE1
     - ~/.ccache
   script:
-    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model1
-    - ./paddle/fluid/lite/tools/build.sh build_test_arm_model1
+    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_mobilenetv2
+    - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_mobilenetv2
+  dependencies:
+    - build:server
+  cache:
+    key: mobile_thirdparty
+    paths:
+      - $MOBILE_LITE_CACHE0
+      - $MOBILE_LITE_CACHE1
+      - ~/.ccache
+      - $CI_PROJECT_DIR/build_mobile_model_mobilenetv2
+
+build:mobile_model_resnet50:
+  tags:
+    - lite
+  stage: build_mobile
+  image: $MOBILE_LITE_DOCKER_IMAGE
+  cache:
+    key: mobile_thirdparty
+    paths:
+      - $MOBILE_LITE_CACHE0
+      - $MOBILE_LITE_CACHE1
+      - ~/.ccache
+  script:
+    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_resnet50
+    - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_resnet50
   dependencies:
     - build:server

@@ -138,4 +190,30 @@ build:mobile_model_mobilenetv2:
       - $MOBILE_LITE_CACHE0
       - $MOBILE_LITE_CACHE1
       - ~/.ccache
-      - $CI_PROJECT_DIR/build_mobile_model1
+      - $CI_PROJECT_DIR/build_mobile_model_resnet50
+
+#build:mobile_model_inceptionv4:
+#  tags:
+#    - lite
+#  stage: build_mobile
+#  image: $MOBILE_LITE_DOCKER_IMAGE
+#  cache:
+#    key: mobile_thirdparty
+#    paths:
+#      - $MOBILE_LITE_CACHE0
+#      - $MOBILE_LITE_CACHE1
+#      - ~/.ccache
+#  script:
+#    - export CCACHE_DIR=$CI_PROJECT_DIR/build_mobile_model_inceptionv4
+#    - ./paddle/fluid/lite/tools/build.sh build_test_arm_model_inceptionv4
+#
+#  dependencies:
+#    - build:server
+#
+#  cache:
+#    key: mobile_thirdparty
+#    paths:
+#      - $MOBILE_LITE_CACHE0
+#      - $MOBILE_LITE_CACHE1
+#      - ~/.ccache
+#      - $CI_PROJECT_DIR/build_mobile_model_inceptionv4
CMakeLists.txt

@@ -56,6 +56,16 @@ if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
     include(cross_compiling/host)
     include(cross_compiling/armlinux)
     include(cross_compiling/android)
+    if(NOT CMAKE_BUILD_TYPE)
+        set(CMAKE_BUILD_TYPE "Release" CACHE STRING
+            "Default use Release in android" FORCE)
+    endif()
+    if(NOT THIRD_PARTY_BUILD_TYPE)
+        set(THIRD_PARTY_BUILD_TYPE "MinSizeRel" CACHE STRING
+            "Default use MinSizeRel in android" FORCE)
+    endif()
 endif()
 
 project(paddle CXX C)

@@ -133,15 +143,6 @@ if(ANDROID OR IOS OR ARMLINUX)
         "Disable RDMA when cross-compiling for Android and iOS" FORCE)
     set(WITH_MKL OFF CACHE STRING
         "Disable MKL when cross-compiling for Android and iOS" FORCE)
-    if(NOT CMAKE_BUILD_TYPE)
-        set(CMAKE_BUILD_TYPE "Release" CACHE STRING
-            "Default use Release in android" FORCE)
-    endif()
-    if(NOT THIRD_PARTY_BUILD_TYPE)
-        set(THIRD_PARTY_BUILD_TYPE "MinSizeRel" CACHE STRING
-            "Default use MinSizeRel in android" FORCE)
-    endif()
 endif()
 
 # for lite, both server and mobile framework.
paddle/fluid/lite/CMakeLists.txt

@@ -190,6 +190,9 @@ add_subdirectory(gen_code)
 if(WITH_TESTING)
     lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "lite_naive_model.tar.gz")
     if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-        lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2_relu.tar.gz")
+        lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v1.tar.gz")
+        lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "mobilenet_v2.tar.gz")
+        lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "resnet50.tar.gz")
+        lite_download_and_uncompress(${LITE_MODEL_DIR} ${LITE_URL} "inception_v4.tar.gz")
     endif()
 endif()
paddle/fluid/lite/api/CMakeLists.txt

@@ -33,24 +33,37 @@ include(ExternalProject)
 set(LITE_DEMO_INSTALL_DIR "${THIRD_PARTY_PATH}/inference_demo" CACHE STRING
     "A path setting inference demo download directories.")
 
-if(WITH_TESTING)
-    set(eval_model_dir "")
-    set(test_cxx_api_deps cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${x86_kernels})
-    if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-        set(eval_model_dir ${LITE_MODEL_DIR}/mobilenet_v2_relu)
-        set(test_cxx_api_deps ${test_cxx_api_deps} ${arm_kernels})
-    endif()
+if(NOT LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
     lite_cc_test(test_cxx_api_lite SRCS cxx_api_test.cc
-       DEPS ${test_cxx_api_deps}
+       DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${x86_kernels}
        ARGS --model_dir=${LITE_MODEL_DIR}/lite_naive_model
-       --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt
-       --eval_model_dir=eval_model_dir SERIAL)
+       --optimized_model=${LITE_MODEL_DIR}/lite_naive_model_opt SERIAL)
     add_dependencies(test_cxx_api_lite extern_lite_download_lite_naive_model_tar_gz)
-    if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK)
-        add_dependencies(test_cxx_api_lite extern_lite_download_mobilenet_v2_relu_tar_gz)
-    endif()
 endif()
+
+if(LITE_WITH_LIGHT_WEIGHT_FRAMEWORK AND WITH_TESTING)
+    set(lite_model_test_DEPS cxx_api_lite mir_passes ${ops_lite} ${host_kernels} ${arm_kernels})
+
+    lite_cc_test(test_mobilenetv1_lite SRCS mobilenetv1_test.cc
+       DEPS ${lite_model_test_DEPS}
+       ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v1 SERIAL)
+    add_dependencies(test_mobilenetv1_lite extern_lite_download_mobilenet_v1_tar_gz)
+
+    lite_cc_test(test_mobilenetv2_lite SRCS mobilenetv2_test.cc
+       DEPS ${lite_model_test_DEPS}
+       ARGS --model_dir=${LITE_MODEL_DIR}/mobilenet_v2 SERIAL)
+    add_dependencies(test_mobilenetv2_lite extern_lite_download_mobilenet_v2_tar_gz)
+
+    lite_cc_test(test_resnet50_lite SRCS resnet50_test.cc
+       DEPS ${lite_model_test_DEPS}
+       ARGS --model_dir=${LITE_MODEL_DIR}/resnet50 SERIAL)
+    add_dependencies(test_resnet50_lite extern_lite_download_resnet50_tar_gz)
+
+    lite_cc_test(test_inceptionv4_lite SRCS inceptionv4_test.cc
+       DEPS ${lite_model_test_DEPS}
+       ARGS --model_dir=${LITE_MODEL_DIR}/inception_v4 SERIAL)
+    add_dependencies(test_inceptionv4_lite extern_lite_download_inception_v4_tar_gz)
+endif()
 
 # These tests need CLI arguments, and are not supported in ARM CI.
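The ARGS wired up above reach the new test binaries through gflags: each model test declares its own DEFINE_string(model_dir, "", "") and reads FLAGS_model_dir (see the new sources below). As a self-contained sketch of that mechanism only (this main() is hypothetical; the real binaries get theirs from gtest):

    // Hedged sketch: how --model_dir=... lands in FLAGS_model_dir.
    // The flag definition matches the new test files; main() is illustrative.
    #include <gflags/gflags.h>
    #include <iostream>

    DEFINE_string(model_dir, "", "");

    int main(int argc, char** argv) {
      gflags::ParseCommandLineFlags(&argc, &argv, true);  // consumes --model_dir=...
      std::cout << "model_dir = " << FLAGS_model_dir << std::endl;
      return 0;
    }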
paddle/fluid/lite/api/cxx_api_test.cc

The generic eval test is dropped here; equivalent per-model tests are added under api/ (see the new *_test.cc files below).

@@ -27,9 +27,6 @@
 DEFINE_string(startup_program_path, "", "");
 DEFINE_string(main_program_path, "", "");
-// for eval
-DEFINE_string(eval_model_dir, "", "");
 
 namespace paddle {
 namespace lite {

@@ -88,37 +85,5 @@ TEST(CXXApi, save_model) {
 }*/
 #endif  // LITE_WITH_LIGHT_WEIGHT_FRAMEWORK
 
-#ifdef LITE_WITH_ARM
-TEST(CXXApi, eval) {
-  DeviceInfo::Init();
-  lite::ExecutorLite predictor;
-  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
-                                   Place{TARGET(kARM), PRECISION(kFloat)}});
-
-  predictor.Build(FLAGS_eval_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
-                  valid_places);
-
-  auto* input_tensor = predictor.GetInput(0);
-  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
-  auto* data = input_tensor->mutable_data<float>();
-  for (int i = 0; i < input_tensor->dims().production(); i++) {
-    data[i] = 1;
-  }
-
-  predictor.Run();
-
-  auto* out = predictor.GetOutput(0);
-  std::vector<float> results({0.00097802, 0.00099822, 0.00103093, 0.00100121,
-                              0.00098268, 0.00104065, 0.00099962, 0.00095181,
-                              0.00099694, 0.00099406});
-  for (int i = 0; i < results.size(); ++i) {
-    EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
-  }
-  ASSERT_EQ(out->dims().size(), 2);
-  ASSERT_EQ(out->dims()[0], 1);
-  ASSERT_EQ(out->dims()[1], 1000);
-}
-#endif
-
 }  // namespace lite
 }  // namespace paddle
paddle/fluid/lite/api/inceptionv4_test.cc (new file, 0 → 100644)

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"

// for eval
DEFINE_string(model_dir, "", "");

namespace paddle {
namespace lite {

#ifdef LITE_WITH_ARM
TEST(InceptionV4, test) {
  DeviceInfo::Init();
  lite::ExecutorLite predictor;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                   Place{TARGET(kARM), PRECISION(kFloat)}});

  predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
                  valid_places);

  auto* input_tensor = predictor.GetInput(0);
  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
  auto* data = input_tensor->mutable_data<float>();
  for (int i = 0; i < input_tensor->dims().production(); i++) {
    data[i] = 1;
  }

  predictor.Run();

  auto* out = predictor.GetOutput(0);
  std::vector<float> results({0.00078033, 0.00083865, 0.00060029, 0.00057083,
                              0.00070094, 0.00080584, 0.00044525, 0.00074907,
                              0.00059774, 0.00063654});
  for (int i = 0; i < results.size(); ++i) {
    // TODO(sangoly): fix assert
    // EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
    LOG(INFO) << "out -> " << out->data<float>()[i];
  }
  ASSERT_EQ(out->dims().size(), 2);
  ASSERT_EQ(out->dims()[0], 1);
  ASSERT_EQ(out->dims()[1], 1000);
}
#endif

}  // namespace lite
}  // namespace paddle
paddle/fluid/lite/api/mobilenetv1_test.cc (new file, 0 → 100644)

// [standard PaddlePaddle Apache-2.0 license header, as in inceptionv4_test.cc above]

#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"

// for eval
DEFINE_string(model_dir, "", "");

namespace paddle {
namespace lite {

#ifdef LITE_WITH_ARM
TEST(MobileNetV1, test) {
  DeviceInfo::Init();
  lite::ExecutorLite predictor;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                   Place{TARGET(kARM), PRECISION(kFloat)}});

  predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
                  valid_places);

  auto* input_tensor = predictor.GetInput(0);
  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
  auto* data = input_tensor->mutable_data<float>();
  for (int i = 0; i < input_tensor->dims().production(); i++) {
    data[i] = 1;
  }

  predictor.Run();

  auto* out = predictor.GetOutput(0);
  std::vector<float> results({1.91308980e-04, 5.92055148e-04, 1.12303176e-04,
                              6.27335685e-05, 1.27507330e-04, 1.32147351e-03,
                              3.13812525e-05, 6.52209565e-05, 4.78087313e-05,
                              2.58822285e-04});
  for (int i = 0; i < results.size(); ++i) {
    EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
  }
  ASSERT_EQ(out->dims().size(), 2);
  ASSERT_EQ(out->dims()[0], 1);
  ASSERT_EQ(out->dims()[1], 1000);
}
#endif

}  // namespace lite
}  // namespace paddle
paddle/fluid/lite/api/mobilenetv2_test.cc (new file, 0 → 100644)

// [standard PaddlePaddle Apache-2.0 license header, as in inceptionv4_test.cc above]

#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"

// for eval
DEFINE_string(model_dir, "", "");

namespace paddle {
namespace lite {

#ifdef LITE_WITH_ARM
TEST(MobileNetV2, test) {
  DeviceInfo::Init();
  lite::ExecutorLite predictor;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                   Place{TARGET(kARM), PRECISION(kFloat)}});

  predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
                  valid_places);

  auto* input_tensor = predictor.GetInput(0);
  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
  auto* data = input_tensor->mutable_data<float>();
  for (int i = 0; i < input_tensor->dims().production(); i++) {
    data[i] = 1;
  }

  predictor.Run();

  auto* out = predictor.GetOutput(0);
  std::vector<float> results({0.00097802, 0.00099822, 0.00103093, 0.00100121,
                              0.00098268, 0.00104065, 0.00099962, 0.00095181,
                              0.00099694, 0.00099406});
  for (int i = 0; i < results.size(); ++i) {
    EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
  }
  ASSERT_EQ(out->dims().size(), 2);
  ASSERT_EQ(out->dims()[0], 1);
  ASSERT_EQ(out->dims()[1], 1000);
}
#endif

}  // namespace lite
}  // namespace paddle
paddle/fluid/lite/api/resnet50_test.cc (new file, 0 → 100644)

// [standard PaddlePaddle Apache-2.0 license header, as in inceptionv4_test.cc above]

#include <gflags/gflags.h>
#include <gtest/gtest.h>
#include <vector>
#include "paddle/fluid/lite/api/cxx_api.h"
#include "paddle/fluid/lite/core/mir/use_passes.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/kernels/use_kernels.h"
#include "paddle/fluid/lite/operators/use_ops.h"

// for eval
DEFINE_string(model_dir, "", "");

namespace paddle {
namespace lite {

#ifdef LITE_WITH_ARM
TEST(ResNet50, test) {
  DeviceInfo::Init();
  lite::ExecutorLite predictor;
  std::vector<Place> valid_places({Place{TARGET(kHost), PRECISION(kFloat)},
                                   Place{TARGET(kARM), PRECISION(kFloat)}});

  predictor.Build(FLAGS_model_dir, Place{TARGET(kARM), PRECISION(kFloat)},
                  valid_places);

  auto* input_tensor = predictor.GetInput(0);
  input_tensor->Resize(DDim(std::vector<DDim::value_type>({1, 3, 224, 224})));
  auto* data = input_tensor->mutable_data<float>();
  for (int i = 0; i < input_tensor->dims().production(); i++) {
    data[i] = 1;
  }

  predictor.Run();

  auto* out = predictor.GetOutput(0);
  std::vector<float> results({2.41399175e-04, 4.13724629e-04, 2.64324830e-04,
                              9.68795503e-05, 2.01968738e-04, 8.14945495e-04,
                              7.45922662e-05, 1.76479152e-04, 7.47223166e-05,
                              6.06825110e-04});
  for (int i = 0; i < results.size(); ++i) {
    EXPECT_NEAR(out->data<float>()[i], results[i], 1e-5);
  }
  ASSERT_EQ(out->dims().size(), 2);
  ASSERT_EQ(out->dims()[0], 1);
  ASSERT_EQ(out->dims()[1], 1000);
}
#endif

}  // namespace lite
}  // namespace paddle
paddle/fluid/lite/arm/math/CMakeLists.txt

@@ -16,7 +16,7 @@ cc_library(math_arm SRCS
     elementwise.cc
     concat.cc
     sgemv.cc
-    type_trans.cpp
+    type_trans.cc
     conv_impl.cc
     conv_direct_3x3s1.cc
     conv_direct_3x3s2.cc
paddle/fluid/lite/arm/math/type_trans.cpp (deleted, 100644 → 0; the build now references type_trans.cc, per the CMake change above)

// [standard PaddlePaddle Apache-2.0 license header, as in inceptionv4_test.cc above]

#include "paddle/fluid/lite/arm/math/type_trans.h"
#include <arm_neon.h>
#include <string.h>
#include "paddle/fluid/lite/arm/math/saturate.h"

namespace paddle {
namespace lite {
namespace arm {
namespace math {

template <typename dtype>
void int32_to_dtype(const int* din, dtype* dout, const float* scale,
                    int axis_size, int64_t outer_size, int64_t inner_size);

void fp32_to_int8(const float* din, signed char* dout, const float* scale,
                  int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 16;
  int remain = inner_size & 15;
  int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
  for (int j = 0; j < loop_size; ++j) {
    float inv_scale = 1.f / scale[j % axis_size];
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vscale = vdupq_n_f32(inv_scale);
    float32x4_t vpoff = vdupq_n_f32(0.5f);
    float32x4_t vnoff = vdupq_n_f32(-0.5f);
    const float* din_c = din + j * inner_size;
    signed char* dout_c = dout + j * inner_size;
    if (cnt > 0) {
      int cnt_loop = cnt;
      const float* din_ptr = din_c;
      signed char* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "ldp q0, q1, [%[in]], #32 \n"
          "ldp q2, q3, [%[in]], #32 \n"
          "0: \n" /* main loop */
          "fmul v4.4s, v0.4s, %[scale].4s \n"
          "fmul v5.4s, v1.4s, %[scale].4s \n"
          "fmul v6.4s, v2.4s, %[scale].4s \n"
          "fmul v7.4s, v3.4s, %[scale].4s \n"
          "ldp q0, q1, [%[in]], #32 \n"
          "subs %[cnt], %[cnt], #1 \n"
          "FCVTAS v8.4s, v4.4s \n"
          "FCVTAS v9.4s, v5.4s \n"
          "FCVTAS v10.4s, v6.4s \n"
          "FCVTAS v11.4s, v7.4s \n"
          "ldp q2, q3, [%[in]], #32 \n"
          "sqxtn v4.4h, v8.4s \n"
          "sqxtn2 v4.8h, v9.4s \n"
          "sqxtn v5.4h, v10.4s \n"
          "sqxtn2 v5.8h, v11.4s \n"
          "sqxtn v8.8b, v4.8h \n"
          "sqxtn2 v8.16b, v5.8h \n"
          "str q8, [%[out]], #16 \n"
          "bne 0b \n"
          : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
          : [scale] "w"(vscale)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
            "v10", "v11");
#else
      asm volatile(
          "vld1.32 {d0-d3}, [%[din]]!  @ load in0~in7 \n"
          "vld1.32 {d4-d7}, [%[din]]!  @ load in8~in16 \n"
          "0:                          @ main loop \n"
          "vand.i32 q4, %q[vpoff], %q[vpoff]  @ set offset, 0.5 \n"
          "vand.i32 q5, q4, q4         @ set offset, 0.5 \n"
          "vand.i32 q6, q4, q4         @ set offset, 0.5 \n"
          "vand.i32 q7, q4, q4         @ set offset, 0.5 \n"
          "vcgt.f32 q8, q0, %q[vzero]  @ get mask > 0, in0 \n"
          "vcgt.f32 q9, q1, %q[vzero]  @ get mask > 0, in1 \n"
          "vcgt.f32 q10, q2, %q[vzero] @ get mask > 0, in2 \n"
          "vcgt.f32 q11, q3, %q[vzero] @ get mask > 0, in3 \n"
          "vbif.f32 q4, %q[vnoff], q8  @ get right offset \n"
          "vbif.f32 q5, %q[vnoff], q9  @ get right offset \n"
          "vbif.f32 q6, %q[vnoff], q10 @ get right offset \n"
          "vbif.f32 q7, %q[vnoff], q11 @ get right offset \n"
          "vmla.f32 q4, q0, %q[vscale] @ mul scale \n"
          "vmla.f32 q5, q1, %q[vscale] @ mul scale \n"
          "vmla.f32 q6, q2, %q[vscale] @ mul scale \n"
          "vmla.f32 q7, q3, %q[vscale] @ mul scale \n"
          "vcvt.s32.f32 q0, q4         @ cvt to int32 \n"
          "vcvt.s32.f32 q1, q5         @ cvt to int32 \n"
          "vcvt.s32.f32 q2, q6         @ cvt to int32 \n"
          "vcvt.s32.f32 q3, q7         @ cvt to int32 \n"
          "vqmovn.s32 d8, q0           @ cnt to int16 \n"
          "vqmovn.s32 d9, q1           @ cnt to int16 \n"
          "vqmovn.s32 d10, q2          @ cnt to int16 \n"
          "vqmovn.s32 d11, q3          @ cnt to int16 \n"
          "vld1.32 {d0-d3}, [%[din]]!  @ load in0~in7 \n"
          "vqmovn.s16 d12, q4          @ cnt to int8 \n"
          "vqmovn.s16 d13, q5          @ cnt to int8 \n"
          "vld1.32 {d4-d7}, [%[din]]!  @ load in8~in16 \n"
          "vst1.32 {d12-d13}, [%[dout]]! @ write to output \n"
          "subs %[cnt], #1             @ loop count -1 \n"
          "bne 0b                      @ to main loop \n"
          : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
          : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
            [vzero] "w"(vzero)
          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
            "q10", "q11");
#endif
    }
    const float* din_r = din_c + 16 * cnt;
    signed char* dout_r = dout_c + 16 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = saturate_cast<int8_t>(roundf(inv_scale * din_r[i]));
    }
  }
}

void fp32_to_int16(const float* din, int16_t* dout, const float* scale,
                   int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 8;
  int remain = inner_size & 7;
  int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
  for (int j = 0; j < loop_size; ++j) {
    float inv_scale = 1.f / scale[j % axis_size];
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vscale = vdupq_n_f32(inv_scale);
    float32x4_t vpoff = vdupq_n_f32(0.5f);
    float32x4_t vnoff = vdupq_n_f32(-0.5f);
    const float* din_c = din + j * inner_size;
    int16_t* dout_c = dout + j * inner_size;
    if (cnt > 0) {
      int cnt_loop = cnt;
      const float* din_ptr = din_c;
      int16_t* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "ldp q0, q1, [%[in]], #32 \n"
          "0: \n" /* main loop */
          "fmul v4.4s, v0.4s, %[scale].4s \n"
          "fmul v5.4s, v1.4s, %[scale].4s \n"
          "ldp q0, q1, [%[in]], #32 \n"
          "subs %[cnt], %[cnt], #1 \n"
          "FCVTAS v8.4s, v4.4s \n"
          "FCVTAS v9.4s, v5.4s \n"
          "sqxtn v4.4h, v8.4s \n"
          "sqxtn2 v4.8h, v9.4s \n"
          "str q4, [%[out]], #16 \n"
          "bne 0b \n"
          : [in] "+r"(din_ptr), [out] "+r"(dout_ptr), [cnt] "+r"(cnt_loop)
          : [scale] "w"(vscale)
          : "v0", "v1", "v4", "v5", "v8", "v9");
#else
      asm volatile(
          "vld1.32 {d0-d3}, [%[din]]!  @ load in0~in7 \n"
          "0:                          @ main loop \n"
          "vand.i32 q4, %q[vpoff], %q[vpoff]  @ set offset, 0.5 \n"
          "vand.i32 q5, q4, q4         @ set offset, 0.5 \n"
          "vand.i32 q6, q4, q4         @ set offset, 0.5 \n"
          "vand.i32 q7, q4, q4         @ set offset, 0.5 \n"
          "vcgt.f32 q8, q0, %q[vzero]  @ get mask > 0, in0 \n"
          "vcgt.f32 q9, q1, %q[vzero]  @ get mask > 0, in1 \n"
          "vbif.f32 q4, %q[vnoff], q8  @ get right offset \n"
          "vbif.f32 q5, %q[vnoff], q9  @ get right offset \n"
          "vmla.f32 q4, q0, %q[vscale] @ mul scale \n"
          "vmla.f32 q5, q1, %q[vscale] @ mul scale \n"
          "vcvt.s32.f32 q0, q4         @ cvt to int32 \n"
          "vcvt.s32.f32 q1, q5         @ cvt to int32 \n"
          "vqmovn.s32 d8, q0           @ cnt to int16 \n"
          "vqmovn.s32 d9, q1           @ cnt to int16 \n"
          "vld1.32 {d0-d3}, [%[din]]!  @ load in0~in7 \n"
          "vst1.32 {d8-d9}, [%[dout]]! @ write to output \n"
          "subs %[cnt], #1             @ loop count -1 \n"
          "bne 0b                      @ to main loop \n"
          : [dout] "+r"(dout_ptr), [din] "+r"(din_ptr), [cnt] "+r"(cnt_loop)
          : [vscale] "w"(vscale), [vpoff] "w"(vpoff), [vnoff] "w"(vnoff),
            [vzero] "w"(vzero)
          : "q0", "q1", "q4", "q5", "q6", "q7", "q8", "q9");
#endif
    }
    const float* din_r = din_c + 8 * cnt;
    int16_t* dout_r = dout_c + 8 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = saturate_cast<int16_t>(roundf(inv_scale * din_r[i]));
    }
  }
}

void int8_to_fp32(const signed char* in, float* out, const float* scale,
                  int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 16;
  int remain = inner_size & 15;
  int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
  for (int64_t n = 0; n < loop_size; ++n) {
    float in_scale = scale[n % axis_size];
    const signed char* din_c = in + n * inner_size;
    float* dout_c = out + n * inner_size;
    float32x4_t vscale = vdupq_n_f32(in_scale);
    if (cnt > 0) {
      int loop = cnt;
      const signed char* din_ptr = din_c;
      float* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8 */
          "0: \n"                       /* main loop */
          "sshll v2.8h, v0.8b, #0 \n"   /* trans to int16 */
          "sshll v3.8h, v1.8b, #0 \n"   /* trans to int16 */
          "sshll v4.4s, v2.4h, #0 \n"   /* trans to int32 */
          "sshll2 v5.4s, v2.8h, #0 \n"  /* trans to int32 */
          "sshll v6.4s, v3.4h, #0 \n"   /* trans to int32 */
          "sshll2 v7.4s, v3.8h, #0 \n"  /* trans to int32 */
          "ldp d0, d1, [%[in]], #16 \n" /* load 16 int8 */
          "scvtf v8.4s, v4.4s \n"       /* trans to fp32 */
          "scvtf v9.4s, v5.4s \n"       /* trans to fp32 */
          "scvtf v10.4s, v6.4s \n"      /* trans to fp32 */
          "scvtf v11.4s, v7.4s \n"      /* trans to fp32 */
          "subs %[loop], %[loop], #1 \n"
          "fmul v4.4s, v8.4s, %[scale].4s \n"  /* mul with scale */
          "fmul v5.4s, v9.4s, %[scale].4s \n"  /* mul with scale */
          "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale */
          "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale */
          "stp q4, q5, [%[out]], #32 \n"       /* write to memory */
          "stp q6, q7, [%[out]], #32 \n"       /* write to memory */
          "bne 0b \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
            "v10", "v11");
#else
      asm volatile(
          "vld1.32 {d0-d1}, [%[in]]!  @ load 16 int8 \n"
          "0:                         @ main loop \n"
          "vmovl.s8 q2, d0            @ trans to int16 \n"
          "vmovl.s8 q3, d1            @ trans to int16 \n"
          "vmovl.s16 q4, d4           @ trans to int32 \n"
          "vmovl.s16 q5, d5           @ trans to int32 \n"
          "vmovl.s16 q6, d6           @ trans to int32 \n"
          "vmovl.s16 q7, d7           @ trans to int32 \n"
          "vcvt.f32.s32 q0, q4        @ trans to fp32 \n"
          "vcvt.f32.s32 q1, q5        @ trans to fp32 \n"
          "vcvt.f32.s32 q2, q6        @ trans to fp32 \n"
          "vcvt.f32.s32 q3, q7        @ trans to fp32 \n"
          "vmul.f32 q4, q0, %q[scale] @ mul with scale \n"
          "vmul.f32 q5, q1, %q[scale] @ mul with scale \n"
          "vmul.f32 q6, q2, %q[scale] @ mul with scale \n"
          "vmul.f32 q7, q3, %q[scale] @ mul with scale \n"
          "vld1.32 {d0-d1}, [%[in]]!  @ load 16 int8 \n"
          "subs %[loop], #1 \n"
          "vst1.f32 {d8-d11}, [%[out]]!  @ write to memory \n"
          "vst1.f32 {d12-d15}, [%[out]]! @ write to memory \n"
          "bne 0b \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif  // __aarch64__
    }
    const signed char* din_r = din_c + 16 * cnt;
    float* dout_r = dout_c + 16 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = in_scale * din_r[i];
    }
  }
}

void int16_to_fp32(const int16_t* in, float* out, const float* scale,
                   int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 16;
  int remain = inner_size & 15;
  int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
  for (int64_t n = 0; n < loop_size; ++n) {
    float in_scale = scale[n % axis_size];
    const int16_t* din_c = in + n * inner_size;
    float* dout_c = out + n * inner_size;
    float32x4_t vscale = vdupq_n_f32(in_scale);
    if (cnt > 0) {
      int loop = cnt;
      const int16_t* din_ptr = din_c;
      float* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16 */
          "0: \n"                       /* main loop */
          "sshll v4.4s, v0.4h, #0 \n"   /* trans to int32 */
          "sshll2 v5.4s, v0.8h, #0 \n"  /* trans to int32 */
          "sshll v6.4s, v1.4h, #0 \n"   /* trans to int32 */
          "sshll2 v7.4s, v1.8h, #0 \n"  /* trans to int32 */
          "ldp q0, q1, [%[in]], #32 \n" /* load 16 int16 */
          "scvtf v8.4s, v4.4s \n"       /* trans to fp32 */
          "scvtf v9.4s, v5.4s \n"       /* trans to fp32 */
          "scvtf v10.4s, v6.4s \n"      /* trans to fp32 */
          "scvtf v11.4s, v7.4s \n"      /* trans to fp32 */
          "subs %[loop], %[loop], #1 \n"
          "fmul v4.4s, v8.4s, %[scale].4s \n"  /* mul with scale */
          "fmul v5.4s, v9.4s, %[scale].4s \n"  /* mul with scale */
          "fmul v6.4s, v10.4s, %[scale].4s \n" /* mul with scale */
          "fmul v7.4s, v11.4s, %[scale].4s \n" /* mul with scale */
          "stp q4, q5, [%[out]], #32 \n"       /* write to memory */
          "stp q6, q7, [%[out]], #32 \n"       /* write to memory */
          "bne 0b \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "v0", "v1", "v4", "v5", "v6", "v7", "v8", "v9", "v10", "v11");
#else
      asm volatile(
          "vld1.32 {d0-d3}, [%[in]]!  @ load 16 int16 \n"
          "0:                         @ main loop \n"
          "vmovl.s16 q4, d0           @ trans to int32 \n"
          "vmovl.s16 q5, d1           @ trans to int32 \n"
          "vmovl.s16 q6, d2           @ trans to int32 \n"
          "vmovl.s16 q7, d3           @ trans to int32 \n"
          "vcvt.f32.s32 q0, q4        @ trans to fp32 \n"
          "vcvt.f32.s32 q1, q5        @ trans to fp32 \n"
          "vcvt.f32.s32 q2, q6        @ trans to fp32 \n"
          "vcvt.f32.s32 q3, q7        @ trans to fp32 \n"
          "vmul.f32 q4, q0, %q[scale] @ mul with scale \n"
          "vmul.f32 q5, q1, %q[scale] @ mul with scale \n"
          "vmul.f32 q6, q2, %q[scale] @ mul with scale \n"
          "vmul.f32 q7, q3, %q[scale] @ mul with scale \n"
          "vld1.32 {d0-d3}, [%[in]]!  @ load 16 int8 \n"
          "subs %[loop], #1 \n"
          "vst1.f32 {d8-d11}, [%[out]]!  @ write to memory \n"
          "vst1.f32 {d12-d15}, [%[out]]! @ write to memory \n"
          "bne 0b \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7");
#endif  // __aarch64__
    }
    const int16_t* din_r = din_c + 16 * cnt;
    float* dout_r = dout_c + 16 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = in_scale * din_r[i];
    }
  }
}

void int32_to_fp32(const int* din, float* dout, const float* scale,
                   int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 16;
  int remain = inner_size & 15;
  int64_t loop_size = axis_size * outer_size;
#pragma omp parallel for
  for (int64_t n = 0; n < loop_size; ++n) {
    float in_scale = scale[n % axis_size];
    const int* din_c = din + n * inner_size;
    float* dout_c = dout + n * inner_size;
    float32x4_t vscale = vdupq_n_f32(in_scale);
    if (cnt > 0) {
      int loop = cnt;
      const int* din_ptr = din_c;
      float* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "ldp q0, q1, [%[in]], #32 \n"
          "ldp q2, q3, [%[in]], #32 \n"
          "0: \n"
          "scvtf v4.4s, v0.4s \n"
          "scvtf v5.4s, v1.4s \n"
          "scvtf v6.4s, v2.4s \n"
          "scvtf v7.4s, v3.4s \n"
          "ldp q0, q1, [%[in]], #32 \n"
          "fmul v8.4s, v4.4s, %[scale].4s \n"
          "fmul v9.4s, v5.4s, %[scale].4s \n"
          "fmul v10.4s, v6.4s, %[scale].4s \n"
          "fmul v11.4s, v7.4s, %[scale].4s \n"
          "ldp q2, q3, [%[in]], #32 \n"
          "stp q8, q9, [%[out]], #32 \n"
          "stp q10, q11, [%[out]], #32 \n"
          "subs %[loop], %[loop], #1 \n"
          "bne 0b \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7", "v8", "v9",
            "v10", "v11");
#else
      asm volatile(
          "vld1.s32 {d0-d3}, [%[in]]! \n"
          "vld1.s32 {d4-d7}, [%[in]]! \n"
          "0: \n"
          "vcvt.f32.s32 q4, q0 \n"
          "vcvt.f32.s32 q5, q1 \n"
          "vcvt.f32.s32 q6, q2 \n"
          "vcvt.f32.s32 q7, q3 \n"
          "vld1.s32 {d0-d3}, [%[in]]! \n"
          "vmul.f32 q8, q4, %q[scale] \n"
          "vmul.f32 q9, q5, %q[scale] \n"
          "vmul.f32 q10, q6, %q[scale] \n"
          "vmul.f32 q11, q7, %q[scale] \n"
          "vld1.s32 {d4-d7}, [%[in]]! \n"
          "subs %[loop], #1 \n"
          "vst1.f32 {d16-d19}, [%[out]]! \n"
          "vst1.f32 {d20-d23}, [%[out]]! \n"
          "bne 0b \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
            "q10", "q11");
#endif  // __aarch64__
    }
    const int* din_r = din_c + 16 * cnt;
    float* dout_r = dout_c + 16 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = in_scale * din_r[i];
    }
  }
}

void int32_to_int8(const int* din, signed char* dout, const float* scale,
                   int axis_size, int64_t outer_size, int64_t inner_size) {
  int cnt = inner_size / 16;
  int remain = inner_size & 15;
  int64_t loop_size = outer_size * axis_size;
#pragma omp parallel for
  for (int64_t n = 0; n < loop_size; ++n) {
    float in_scale = scale[n % axis_size];
    const int* din_c = din + n * inner_size;
    signed char* dout_c = dout + n * inner_size;
    float32x4_t vscale = vdupq_n_f32(in_scale);
    float32x4_t vzero = vdupq_n_f32(0.f);
    float32x4_t vpoff = vdupq_n_f32(0.5f);
    float32x4_t vnoff = vdupq_n_f32(-0.5f);
    if (cnt > 0) {
      int loop = cnt;
      const int* din_ptr = din_c;
      signed char* dout_ptr = dout_c;
#ifdef __aarch64__
      asm volatile(
          "0: \n"
          "ld1 {v0.4s, v1.4s}, [%[in]], #32 \n"
          "ld1 {v2.4s, v3.4s}, [%[in]], #32 \n"
          "scvtf v4.4s, v0.4s \n"
          "scvtf v5.4s, v1.4s \n"
          "scvtf v6.4s, v2.4s \n"
          "scvtf v7.4s, v3.4s \n"
          "fmul v0.4s, v4.4s, %[scale].4s \n"
          "fmul v1.4s, v5.4s, %[scale].4s \n"
          "fmul v2.4s, v6.4s, %[scale].4s \n"
          "fmul v3.4s, v7.4s, %[scale].4s \n"
          "fcvtas v4.4s, v0.4s \n"
          "fcvtas v5.4s, v1.4s \n"
          "fcvtas v6.4s, v2.4s \n"
          "fcvtas v7.4s, v3.4s \n"
          "sqxtn v0.4h, v4.4s \n"
          "sqxtn2 v0.8h, v5.4s \n"
          "sqxtn v1.4h, v6.4s \n"
          "sqxtn2 v1.8h, v7.4s \n"
          "sqxtn v2.8b, v0.8h \n"
          "sqxtn2 v2.16b, v1.8h \n"
          "st1 {v2.16b}, [%[out]], #16 \n"
          "subs %[loop], %[loop], #1 \n"
          "bne 0b \n"
          : [loop] "+r"(loop), [in] "+r"(din_ptr), [out] "+r"(dout_ptr)
          : [scale] "w"(vscale)
          : "v0", "v1", "v2", "v3", "v4", "v5", "v6", "v7");
#else
      asm volatile(
          "vld1.32 {d0-d3}, [%[din]]!  @ load in0~in7 \n"
          "vld1.32 {d4-d7}, [%[din]]!  @ load in8~in16 \n"
          "0:                          @ main loop \n"
          "vcvt.f32.s32 q4, q0         @ cvt to float \n"
          "vcvt.f32.s32 q5, q1         @ cvt to float \n"
          "vcvt.f32.s32 q6, q2         @ cvt to float \n"
          "vcvt.f32.s32 q7, q3         @ cvt to float \n"
          "vand.i32 q0, %q[vpoff], %q[vpoff]  @ set offset, 0.5 \n"
          "vand.i32 q1, q0, q0         @ set offset, 0.5 \n"
          "vand.i32 q2, q0, q0         @ set offset, 0.5 \n"
          "vand.i32 q3, q0, q0         @ set offset, 0.5 \n"
          "vcgt.f32 q8, q4, %q[vzero]  @ get mask > 0, in0 \n"
          "vcgt.f32 q9, q5, %q[vzero]  @ get mask > 0, in1 \n"
          "vcgt.f32 q10, q6, %q[vzero] @ get mask > 0, in2 \n"
          "vcgt.f32 q11, q7, %q[vzero] @ get mask > 0, in3 \n"
          "vbif.f32 q0, %q[vnoff], q8  @ get right offset \n"
          "vbif.f32 q1, %q[vnoff], q9  @ get right offset \n"
          "vbif.f32 q2, %q[vnoff], q10 @ get right offset \n"
          "vbif.f32 q3, %q[vnoff], q11 @ get right offset \n"
          "vmla.f32 q0, q4, %q[vscale] @ mul scale \n"
          "vmla.f32 q1, q5, %q[vscale] @ mul scale \n"
          "vmla.f32 q2, q6, %q[vscale] @ mul scale \n"
          "vmla.f32 q3, q7, %q[vscale] @ mul scale \n"
          "vcvt.s32.f32 q4, q0         @ cvt to int32 \n"
          "vcvt.s32.f32 q5, q1         @ cvt to int32 \n"
          "vcvt.s32.f32 q6, q2         @ cvt to int32 \n"
          "vcvt.s32.f32 q7, q3         @ cvt to int32 \n"
          "vqmovn.s32 d16, q4          @ cnt to int16 \n"
          "vqmovn.s32 d17, q5          @ cnt to int16 \n"
          "vqmovn.s32 d18, q6          @ cnt to int16 \n"
          "vqmovn.s32 d19, q7          @ cnt to int16 \n"
          "vld1.32 {d0-d3}, [%[din]]!  @ load in0~in7 \n"
          "vqmovn.s16 d8, q8           @ cnt to int8 \n"
          "vqmovn.s16 d9, q9           @ cnt to int8 \n"
          "vld1.32 {d4-d7}, [%[din]]!  @ load in8~in16 \n"
          "vst1.32 {d8-d9}, [%[dout]]! @ write to output \n"
          "subs %[loop], #1            @ loop count -1 \n"
          "bne 0b                      @ to main loop \n"
          : [loop] "+r"(loop), [din] "+r"(din_ptr), [dout] "+r"(dout_ptr)
          : [vscale] "w"(vscale), [vzero] "w"(vzero), [vnoff] "w"(vnoff),
            [vpoff] "w"(vpoff)
          : "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9",
            "q10", "q11");
#endif  // __aarch64__
    }
    const int* din_r = din_c + 16 * cnt;
    int8_t* dout_r = dout_c + 16 * cnt;
    for (int i = 0; i < remain; ++i) {
      dout_r[i] = saturate_cast<int8_t>(roundf(in_scale * din_r[i]));
    }
  }
}

void int32_to_int32(const int* din, int* dout, const float* scale,
                    int axis_size, int64_t outer_size, int64_t inner_size) {
  int size_all = outer_size * axis_size * inner_size;
  memmove(dout, din, size_all * sizeof(int));
}

template <>
void int32_to_dtype(const int* din, float* dout, const float* scale,
                    int axis_size, int64_t outer_size, int64_t inner_size) {
  return int32_to_fp32(din, dout, scale, axis_size, outer_size, inner_size);
}

template <>
void int32_to_dtype(const int* din, signed char* dout, const float* scale,
                    int axis_size, int64_t outer_size, int64_t inner_size) {
  return int32_to_int8(din, dout, scale, axis_size, outer_size, inner_size);
}

template <>
void int32_to_dtype(const int* din, int* dout, const float* scale,
                    int axis_size, int64_t outer_size, int64_t inner_size) {
  return int32_to_int32(din, dout, scale, axis_size, outer_size, inner_size);
}

}  // namespace math
}  // namespace arm
}  // namespace lite
}  // namespace paddle
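Every converter in the deleted file shares one shape: an outer loop over outer_size * axis_size slices, one scale per slice looked up by index % axis_size, a NEON main body over whole blocks, and a scalar tail for the leftover elements. A minimal scalar sketch of that structure for the fp32-to-int8 case (fp32_to_int8_sketch and saturate_to_int8 are illustrative names; block width 16, the inv_scale multiply, and the saturating round are taken from the file above, the NEON body is replaced by plain C++):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // Stand-in for the saturate_cast<int8_t> used in the real file.
    static int8_t saturate_to_int8(float v) {
      return static_cast<int8_t>(std::max(-128.f, std::min(127.f, v)));
    }

    void fp32_to_int8_sketch(const float* din, int8_t* dout, const float* scale,
                             int axis_size, int64_t outer_size,
                             int64_t inner_size) {
      const int64_t cnt = inner_size / 16;     // whole 16-element blocks (asm body)
      const int64_t remain = inner_size & 15;  // leftover elements (scalar tail)
      const int64_t loop_size = outer_size * axis_size;
      for (int64_t j = 0; j < loop_size; ++j) {
        const float inv_scale = 1.f / scale[j % axis_size];  // per-slice scale
        const float* din_c = din + j * inner_size;
        int8_t* dout_c = dout + j * inner_size;
        // Main body: stands in for the hand-written NEON loop.
        for (int64_t i = 0; i < 16 * cnt; ++i) {
          dout_c[i] = saturate_to_int8(std::round(inv_scale * din_c[i]));
        }
        // Scalar tail, identical to the `remain` loop in the deleted file.
        for (int64_t i = 16 * cnt; i < 16 * cnt + remain; ++i) {
          dout_c[i] = saturate_to_int8(std::round(inv_scale * din_c[i]));
        }
      }
    }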
paddle/fluid/lite/core/mir/CMakeLists.txt

@@ -52,8 +52,11 @@ cc_library(mir_passes
 #       X86_DEPS mul_compute_x86
 #      )
 
+set(pattern_deps mir_node mir_ssa_graph op_lite)
+if (WITH_TESTING)
+    list(APPEND pattern_deps gtest)
+endif()
-lite_cc_library(pattern_matcher_lite SRCS pattern_matcher.cc DEPS mir_node mir_ssa_graph op_lite)
+lite_cc_library(pattern_matcher_lite SRCS pattern_matcher.cc DEPS ${pattern_deps})
 
 lite_cc_test(test_pattern_matcher_lite SRCS pattern_matcher_test.cc DEPS pattern_matcher_lite)
 
 lite_cc_library(pattern_matcher_high_api SRCS pattern_matcher_high_api.cc DEPS pattern_matcher_lite)
paddle/fluid/lite/kernels/arm/CMakeLists.txt

@@ -16,6 +16,7 @@ cc_library(pool_compute_arm SRCS pool_compute.cc DEPS ${lite_kernel_deps} math_a
 cc_library(split_compute_arm SRCS split_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(concat_compute_arm SRCS concat_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(dropout_compute_arm SRCS dropout_compute.cc DEPS ${lite_kernel_deps} math_arm)
+cc_library(calib_compute_arm SRCS calib_compute.cc DEPS ${lite_kernel_deps} math_arm)
 cc_library(transpose_compute_arm SRCS transpose_compute.cc DEPS ${lite_kernel_deps} math_arm)
 
 lite_cc_test(test_fc_compute_arm SRCS fc_compute_test.cc DEPS fc_compute_arm math_arm)

@@ -30,6 +31,7 @@ lite_cc_test(test_mul_compute_arm SRCS mul_compute_test.cc DEPS mul_compute_arm)
 lite_cc_test(test_split_compute_arm SRCS split_compute_test.cc DEPS split_compute_arm)
 lite_cc_test(test_concat_compute_arm SRCS concat_compute_test.cc DEPS concat_compute_arm)
 lite_cc_test(test_dropout_compute_arm SRCS dropout_compute_test.cc DEPS dropout_compute_arm)
+lite_cc_test(test_calib_compute_arm SRCS calib_compute_test.cc DEPS calib_compute_arm)
 lite_cc_test(test_transpose_compute_arm SRCS transpose_compute_test.cc DEPS transpose_compute_arm)
 
 set(arm_kernels
paddle/fluid/lite/kernels/arm/calib_compute.cc (new file, 0 → 100644)

// [standard PaddlePaddle Apache-2.0 license header, as in inceptionv4_test.cc above]

#include "paddle/fluid/lite/kernels/arm/calib_compute.h"
#include <vector>
#include "paddle/fluid/lite/arm/math/type_trans.h"
#include "paddle/fluid/lite/core/op_registry.h"
#include "paddle/fluid/lite/core/type_system.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

void CalibCompute::Run() {
  auto& param = this->Param<operators::CalibParam>();
  std::vector<float> scale = {param.in_scale};
  if (param.in_dtype == PRECISION(kFloat) &&
      param.out_dtype == PRECISION(kInt8)) {
    const auto* din = param.input->data<float>();
    auto* dout = param.output->mutable_data<signed char>();
    lite::arm::math::fp32_to_int8(din, dout, scale.data(), 1, 1,
                                  param.input->numel());
    return;
  }
  if (param.in_dtype == PRECISION(kInt8) &&
      param.out_dtype == PRECISION(kFloat)) {
    const auto* din = param.input->data<signed char>();
    auto* dout = param.output->mutable_data<float>();
    lite::arm::math::int8_to_fp32(din, dout, scale.data(), 1, 1,
                                  param.input->numel());
    return;
  }
  LOG(FATAL) << "Unsupport Dtype.";
}

}  // namespace arm
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_KERNEL(calib, kARM, kInt8, kNCHW,
                     paddle::lite::kernels::arm::CalibCompute, def)
    .BindInput("Input", {LiteType::GetTensorTy(TARGET(kARM))})
    .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
    .Finalize();
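Both branches of CalibCompute::Run are symmetric per-tensor quantization with the single scale s = param.in_scale; spelled out, with the int8 saturation bounds coming from the saturate_cast<int8_t> in the math routines above:

    q = \operatorname{clamp}\!\left(\operatorname{round}(x / s),\; -128,\; 127\right), \qquad \hat{x} = s \cdot q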
paddle/fluid/lite/kernels/arm/calib_compute.h (new file, 0 → 100644)

// [standard PaddlePaddle Apache-2.0 license header, as in inceptionv4_test.cc above]

#pragma once
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/operators/calib_op.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

class CalibCompute : public KernelLite<TARGET(kARM), PRECISION(kInt8)> {
 public:
  using param_t = operators::CalibParam;

  void Run() override;

  ~CalibCompute() override{};

 private:
};

}  // namespace arm
}  // namespace kernels
}  // namespace lite
}  // namespace paddle
paddle/fluid/lite/kernels/arm/calib_compute_test.cc (new file, 0 → 100644)

// [standard PaddlePaddle Apache-2.0 license header, as in inceptionv4_test.cc above]

#include "paddle/fluid/lite/kernels/arm/calib_compute.h"
#include <gtest/gtest.h>
#include <algorithm>
#include <iostream>
#include <memory>
#include <random>
#include <utility>
#include <vector>
#include "paddle/fluid/lite/arm/math/funcs.h"
#include "paddle/fluid/lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace kernels {
namespace arm {

static int get_rand(int start, int end) {
  int i = rand();  // NOLINT
  i = (i % (end - start)) + start;
  return i;
}

static void int8_to_fp32_basic(const int8_t* din, float* dout,
                               const float* scale, int axis_size,
                               int64_t outer_size, int64_t inner_size) {
  int loop_size = axis_size * outer_size;
  for (int i = 0; i < loop_size; ++i) {
    float scale_in = scale[i % axis_size];
    for (int j = 0; j < inner_size; ++j) {
      dout[j] = din[j] * scale_in;
    }
    dout += inner_size;
    din += inner_size;
  }
}

static void fp32_to_int8_basic(const float* din, int8_t* dout,
                               const float* scale, int axis_size,
                               int64_t outer_size, int64_t inner_size) {
  int loop_size = axis_size * outer_size;
  for (int i = 0; i < loop_size; ++i) {
    float inv_scale = 1.f / scale[i % axis_size];
    for (int j = 0; j < inner_size; ++j) {
      dout[j] = static_cast<int8_t>(roundf(din[j] * inv_scale));
    }
    dout += inner_size;
    din += inner_size;
  }
}

void calib_ref(const operators::CalibParam& param) {
  std::vector<float> scale = {param.in_scale};
  if (param.in_dtype == PRECISION(kFloat) &&
      param.out_dtype == PRECISION(kInt8)) {
    const auto* din = param.input->data<float>();
    auto* dout = param.output->mutable_data<signed char>();
    fp32_to_int8_basic(din, dout, scale.data(), 1, 1, param.input->numel());
    return;
  }
  if (param.in_dtype == PRECISION(kInt8) &&
      param.out_dtype == PRECISION(kFloat)) {
    const auto* din = param.input->data<signed char>();
    auto* dout = param.output->mutable_data<float>();
    int8_to_fp32_basic(din, dout, scale.data(), 1, 1, param.input->numel());
    return;
  }
  LOG(FATAL) << "Unsupport Dtype.";
}

TEST(calib_arm, retrive_op) {
  auto calib =
      KernelRegistry::Global()
          .Create<TARGET(kARM), PRECISION(kInt8), DATALAYOUT(kNCHW)>("calib");
  ASSERT_FALSE(calib.empty());
  ASSERT_TRUE(calib.front());
}

TEST(calib_arm, init) {
  CalibCompute calib;
  ASSERT_EQ(calib.precision(), PRECISION(kInt8));
  ASSERT_EQ(calib.target(), TARGET(kARM));
}

TEST(calib_arm, int8_to_fp32) {
  DeviceInfo::Init();
  for (auto n : {1, 2}) {
    for (auto c : {6, 32 /*, 128*/}) {
      for (auto h : {9, 18 /*, 56 , 112, 224, 512*/}) {
        for (auto w : {9, 18 /*, 56, 112, 224, 512*/}) {
          Tensor x;
          Tensor output;
          Tensor output_ref;
          // set the dims of input, output, ref output tensors
          x.Resize({n, c, h, w});
          output.Resize({n, c, h, w});
          output_ref.Resize({n, c, h, w});
          // initialize the data of input tensors
          auto* x_data = x.mutable_data<char>();
          auto* output_data = output.mutable_data<float>();
          for (int i = 0; i < x.dims().production(); i++) {
            float sign = i % 3 == 0 ? -1.0f : 1.0f;
            x_data[i] = sign * static_cast<float>(i % 128) * 0.013f;
          }
          // prepare kernel params and run
          CalibCompute calib;
          std::unique_ptr<KernelContext> ctx(new KernelContext);
          ctx->As<ARMContext>();
          calib.SetContext(std::move(ctx));
          operators::CalibParam param;
          param.in_scale = get_rand(0, 100) * 0.1f;
          param.in_dtype = PRECISION(kInt8);
          param.out_dtype = PRECISION(kFloat);
          param.input = &x;
          param.output = &output;
          calib.SetParam(param);
          calib.Launch();
          // invoking ref implementation and compare results
          param.output = &output_ref;
          calib_ref(param);
          auto* output_ref_data = output_ref.mutable_data<float>();
          for (int i = 0; i < output.dims().production(); i++) {
            EXPECT_NEAR(output_data[i], output_ref_data[i], 1e-5);
          }
        }
      }
    }
  }
}

}  // namespace arm
}  // namespace kernels
}  // namespace lite
}  // namespace paddle

USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, def);
paddle/fluid/lite/kernels/arm/dropout_compute.cc

@@ -44,4 +44,5 @@ REGISTER_LITE_KERNEL(dropout, kARM, kFloat, kNCHW,
     .BindInput("dropout_prob", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindInput("dropout_implementation", {LiteType::GetTensorTy(TARGET(kARM))})
     .BindOutput("Out", {LiteType::GetTensorTy(TARGET(kARM))})
+    .BindOutput("Mask", {LiteType::GetTensorTy(TARGET(kARM))})
     .Finalize();
paddle/fluid/lite/kernels/use_kernels.h

@@ -47,6 +47,8 @@ USE_LITE_KERNEL(depthwise_conv2d, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(pool2d, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(elementwise_add, kARM, kFloat, kNCHW, def);
 USE_LITE_KERNEL(softmax, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(concat, kARM, kFloat, kNCHW, def);
+USE_LITE_KERNEL(dropout, kARM, kFloat, kNCHW, def);
 #endif
 
 #ifdef LITE_WITH_CUDA
paddle/fluid/lite/kernels/x86/relu_compute.h

@@ -31,13 +31,13 @@ namespace x86 {
 template <typename T>
 class ReluCompute : public KernelLite<TARGET(kX86), PRECISION(kFloat)> {
  public:
-  using param_t = operators::ReluParam;
+  using param_t = operators::ActivationParam;
 
   void Run() override {
     auto& param = *param_.get_mutable<param_t>();
-    auto n = param.input->dims().production();
-    const float* input = param.input->data<float>();
-    float* output = param.output->mutable_data<float>();
+    auto n = param.X->dims().production();
+    const float* input = param.X->data<float>();
+    float* output = param.Out->mutable_data<float>();
     for (int i = 0; i < n; i++) {
       output[i] = std::max(0.f, input[i]);
     }
paddle/fluid/lite/kernels/x86/relu_compute_test.cc
@@ -53,10 +53,10 @@ TEST(relu_x86, run_test) {
   }
   // ReluCompute relu;
   ReluCompute<float> relu;
-  operators::ReluParam param;
-  param.input = &x;
-  param.output = &out;
+  operators::ActivationParam param;
+  param.X = &x;
+  param.Out = &out;
   relu.SetParam(param);
   relu.Run();
paddle/fluid/lite/operators/CMakeLists.txt
@@ -21,6 +21,7 @@ cc_library(fill_constant_op_lite SRCS fill_constant_op.cc DEPS ${op_DEPS})
 cc_library(op_params_lite SRCS op_params.cc DEPS ${tensor_lite} any_lite framework_proto_lite)
 cc_library(dropout_op_lite SRCS dropout_op.cc DEPS ${op_DEPS})
 cc_library(concat_op_lite SRCS concat_op.cc DEPS ${op_DEPS})
+cc_library(calib_op_lite SRCS calib_op.cc DEPS ${op_DEPS})
 cc_library(split_op_lite SRCS split_op.cc DEPS ${op_DEPS})
 cc_library(transpose_op_lite SRCS transpose_op.cc DEPS ${op_DEPS})
 cc_library(fake_quant SRCS fake_quantize_moving_avg_max_abs.cc DEPS ${op_DEPS})
@@ -46,6 +47,7 @@ set(ops_lite
     activation_ops_lite
     dropout_op_lite
     concat_op_lite
+    calib_op_lite
     split_op_lite
     transpose_op_lite
     fake_quant
@@ -64,6 +66,7 @@ lite_cc_test(test_softmax_op_lite SRCS softmax_op_test.cc DEPS softmax_op_lite m
 lite_cc_test(test_reshape_op_lite SRCS reshape_op_test.cc DEPS reshape_op_lite memory_lite)
 lite_cc_test(test_batch_norm_op_lite SRCS batch_norm_op_test.cc DEPS batch_norm_op_lite memory_lite)
 lite_cc_test(test_concat_op_lite SRCS concat_op_test.cc DEPS concat_op_lite memory_lite)
+lite_cc_test(test_calib_op_lite SRCS calib_op_test.cc DEPS calib_op_lite memory_lite ARM_DEPS calib_compute_arm)
 lite_cc_test(test_fusion_elementwise_activation_ops_lite
              SRCS fusion_elementwise_activation_ops_test.cc
              DEPS fusion_elementwise_activation_ops_lite memory_lite)
paddle/fluid/lite/operators/calib_op.cc
0 → 100644

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/lite/operators/calib_op.h"
#include "paddle/fluid/lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace operators {

bool CalibOpLite::CheckShape() const {
  CHECK_OR_FALSE(param_.input);
  CHECK_OR_FALSE(param_.output);
  return true;
}

bool CalibOpLite::InferShape() const {
  param_.output->Resize(param_.input->dims());
  return true;
}

bool CalibOpLite::AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope) {
  auto x_var = scope->FindVar(opdesc.Input("Input").front());
  auto output_var = scope->FindVar(opdesc.Output("Out").front());
  CHECK(x_var);
  CHECK(output_var);
  param_.input = const_cast<lite::Tensor *>(&(x_var->Get<lite::Tensor>()));
  param_.output = output_var->GetMutable<lite::Tensor>();
  std::vector<std::string> input_arg_names = opdesc.InputArgumentNames();
  param_.in_dtype =
      static_cast<lite::PrecisionType>(opdesc.GetAttr<int>("in_dtype"));
  param_.out_dtype =
      static_cast<lite::PrecisionType>(opdesc.GetAttr<int>("out_dtype"));
  if (opdesc.HasAttr("in_scale")) {
    param_.in_scale = opdesc.GetAttr<float>("in_scale");
  }
  CHECK(param_.input) << "Input(X) of CalibOp should not be null.";
  CHECK(param_.output) << "Output(Out) of CalibOp should not be null.";
  return true;
}

}  // namespace operators
}  // namespace lite
}  // namespace paddle

REGISTER_LITE_OP(calib, paddle::lite::operators::CalibOpLite);
paddle/fluid/lite/operators/calib_op.h
0 → 100644

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#pragma once
#include <string>
#include <vector>
#include "paddle/fluid/lite/core/compatible_tensor.h"
#include "paddle/fluid/lite/core/kernel.h"
#include "paddle/fluid/lite/core/op_lite.h"
#include "paddle/fluid/lite/core/scope.h"
#include "paddle/fluid/lite/operators/op_params.h"
#include "paddle/fluid/lite/utils/all.h"

namespace paddle {
namespace lite {
namespace operators {

/*
 * The data types used by the two adjacent layers in the model should
 * be the same. When the two operators accept different data types,
 * we may need to implicitly add a data type conversion operator.
 * Currently, this operator only supports mutual conversion of int8
 * and float32 types.
 */
class CalibOpLite : public OpLite {
 public:
  CalibOpLite() {}

  explicit CalibOpLite(const std::string &type) : OpLite(type) {}

  bool CheckShape() const override;

  bool InferShape() const override;

  bool AttachImpl(const cpp::OpDesc &opdesc, lite::Scope *scope);

  void AttachKernel(KernelBase *kernel) override { kernel->SetParam(param_); }

  std::string DebugString() const override { return "calib"; }

 private:
  mutable CalibParam param_;
};

}  // namespace operators
}  // namespace lite
}  // namespace paddle
paddle/fluid/lite/operators/calib_op_test.cc
0 → 100644

// Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

#include "paddle/fluid/lite/operators/calib_op.h"
#include <gtest/gtest.h>
#include "paddle/fluid/lite/core/op_registry.h"

namespace paddle {
namespace lite {
namespace operators {

#ifdef LITE_WITH_ARM
TEST(calib_op_lite, TestARM) {
  // prepare variables
  Scope scope;
  auto* x = scope.Var("Input")->GetMutable<Tensor>();
  auto* output = scope.Var("output")->GetMutable<Tensor>();
  x->Resize(DDim(std::vector<int64_t>({1, 10, 20})));
  output->Resize(DDim(std::vector<int64_t>{1, 10, 20}));

  // set data
  for (int i = 0; i < 10 * 20; i++) {
    x->mutable_data<float>()[i] = i;
  }
  for (int i = 0; i < 10 * 20; i++) {
    output->mutable_data<float>()[i] = 0.;
  }

  // prepare op desc
  cpp::OpDesc desc;
  desc.SetType("calib");
  desc.SetInput("Input", {"Input"});
  desc.SetOutput("Out", {"output"});
  desc.SetAttr("in_dtype", static_cast<int>(PRECISION(kInt8)));
  desc.SetAttr("out_dtype", static_cast<int>(PRECISION(kFloat)));
  desc.SetAttr("in_scale", 10.0f);

  CalibOpLite calib("calib");

  calib.SetValidPlaces({Place{TARGET(kARM), PRECISION(kInt8)}});
  calib.Attach(desc, &scope);
  auto kernels = calib.CreateKernels({Place{TARGET(kARM), PRECISION(kInt8)}});
  ASSERT_FALSE(kernels.empty());
}
#endif

}  // namespace operators
}  // namespace lite
}  // namespace paddle

#ifdef LITE_WITH_ARM
USE_LITE_KERNEL(calib, kARM, kInt8, kNCHW, def);
#endif
paddle/fluid/lite/operators/dropout_op.cc
@@ -52,13 +52,16 @@ class DropoutOpLite : public OpLite {
     param_.mask = GetMutableVar<lite::Tensor>(scope, Mask);
     param_.dropout_prob = op_desc.GetAttr<float>("dropout_prob");
-    param_.is_test = true;
-    // TODO(sangoly): `is_test` has different attr type in x86 and arm, set
-    // `true` now.
-    // if (op_desc.HasAttr("is_test")) {
-    //   param_.is_test = op_desc.GetAttr<bool>("is_test");
-    // }
+    if (op_desc.HasAttr("is_test")) {
+      param_.is_test = op_desc.GetAttr<bool>("is_test");
+    }
     param_.fix_seed = op_desc.GetAttr<bool>("fix_seed");
     param_.seed = op_desc.GetAttr<int>("seed");
     param_.dropout_implementation =
-        op_desc.GetAttr<int>("dropout_implementation");
+        op_desc.GetAttr<std::string>("dropout_implementation");
     return true;
   }
paddle/fluid/lite/operators/elementwise_ops.h
@@ -32,6 +32,7 @@ class ElementwiseOp : public OpLite {
   bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;

   void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }

   std::string DebugString() const override { return "elementwise_op"; }

  private:
paddle/fluid/lite/operators/fusion_elementwise_activation_ops.cc
@@ -20,9 +20,29 @@ namespace paddle {
 namespace lite {
 namespace operators {

+bool FusionElementwiseActivationOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Y);
+  CHECK_OR_FALSE(param_.Out);
+  return true;
+}
+
+bool FusionElementwiseActivationOp::InferShape() const {
+  CHECK_OR_FALSE(param_.X->dims().size() >= param_.Y->dims().size());
+  param_.Out->Resize(param_.X->dims());
+  return true;
+}
+
 bool FusionElementwiseActivationOp::AttachImpl(const cpp::OpDesc& opdesc,
                                                lite::Scope* scope) {
-  ElementwiseOp::AttachImpl(opdesc, scope);
+  auto X_name = opdesc.Input("X").front();
+  auto Y_name = opdesc.Input("Y").front();
+  auto Out_name = opdesc.Output("Out").front();
+
+  param_.X = GetVar<lite::Tensor>(scope, X_name);
+  param_.Y = GetVar<lite::Tensor>(scope, Y_name);
+  param_.Out = GetMutableVar<lite::Tensor>(scope, Out_name);
+  param_.axis = opdesc.GetAttr<int>("axis");
   param_.act_type = opdesc.GetAttr<std::string>("act_type");
   // TODO(sangoly): support more activation types.
   CHECK(param_.act_type == "relu") << "Only relu activation be supported now";
@@ -31,9 +51,31 @@ bool FusionElementwiseActivationOp::AttachImpl(const cpp::OpDesc& opdesc,
 }

 #ifdef LITE_WITH_X86
+bool FusionElementwiseActivationGradExplicitOp::CheckShape() const {
+  CHECK_OR_FALSE(param_.Y);
+  CHECK_OR_FALSE(param_.X_grad);
+  CHECK_OR_FALSE(param_.Y_grad);
+  CHECK_OR_FALSE(param_.Out_grad);
+  return true;
+}
+
+bool FusionElementwiseActivationGradExplicitOp::InferShape() const {
+  param_.X_grad->Resize(param_.Out_grad->dims());
+  param_.Y_grad->Resize(param_.Y->dims());
+  return true;
+}
+
 bool FusionElementwiseActivationGradExplicitOp::AttachImpl(
     const cpp::OpDesc& opdesc, lite::Scope* scope) {
-  ElementwiseGradExplicitOp::AttachImpl(opdesc, scope);
+  CHECK_EQ(opdesc.InputArgumentNames().size(), 1UL);
+  auto Out_name = opdesc.Input(framework::GradVarName("Out")).front();
+  auto X_name = opdesc.Output(framework::GradVarName("X")).front();
+  auto Y_name = opdesc.Output(framework::GradVarName("Y")).front();
+
+  param_.Out_grad = GetVar<lite::Tensor>(scope, Out_name);
+  param_.X_grad = GetMutableVar<lite::Tensor>(scope, X_name);
+  param_.Y_grad = GetMutableVar<Tensor>(scope, Y_name);
+  param_.axis = opdesc.GetAttr<int>("axis");
   param_.act_type = opdesc.GetAttr<std::string>("act_type");
   // TODO(sangoly): support more activation types.
   CHECK(param_.act_type == "relu") << "Only relu activation be supported now";
paddle/fluid/lite/operators/fusion_elementwise_activation_ops.h
@@ -22,13 +22,19 @@ namespace paddle {
 namespace lite {
 namespace operators {

-class FusionElementwiseActivationOp : public ElementwiseOp {
+class FusionElementwiseActivationOp : public OpLite {
  public:
   explicit FusionElementwiseActivationOp(const std::string& type)
-      : ElementwiseOp(type) {}
+      : OpLite(type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShape() const override;

   bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;

+  void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
+
   std::string DebugString() const override {
     return "fusion_elementwise_activation_op";
   }
@@ -38,14 +44,19 @@ class FusionElementwiseActivationOp : public ElementwiseOp {
 };

 #ifdef LITE_WITH_X86
-class FusionElementwiseActivationGradExplicitOp
-    : public ElementwiseGradExplicitOp {
+class FusionElementwiseActivationGradExplicitOp : public OpLite {
  public:
   explicit FusionElementwiseActivationGradExplicitOp(const std::string& type)
-      : ElementwiseGradExplicitOp(type) {}
+      : OpLite(type) {}
+
+  bool CheckShape() const override;
+
+  bool InferShape() const override;

   bool AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) override;

+  void AttachKernel(KernelBase* kernel) override { kernel->SetParam(param_); }
+
   std::string DebugString() const override {
     return "fusion_elementwise_activation_grad_explicit_op";
   }
paddle/fluid/lite/operators/op_params.h
@@ -48,6 +48,14 @@ struct IoCopyParam {
   lite::Tensor* y{};
 };

+struct CalibParam {
+  const lite::Tensor* input{};
+  lite::Tensor* output{};
+  float in_scale;
+  PrecisionType in_dtype;
+  PrecisionType out_dtype;
+};
+
 /// -------------------------- NN operators ------------------------------------
 struct FcParam {
@@ -60,11 +68,6 @@ struct FcParam {
   bool weight_transposed{false};
 };

-struct ReluParam {
-  lite::Tensor* input{};
-  lite::Tensor* output{};
-};
-
 // For Mul Op
 struct MulParam {
   const lite::Tensor* x{};
paddle/fluid/lite/operators/relu_op.cc
@@ -21,22 +21,22 @@ namespace operators {
 bool ReluOp::CheckShape() const { return true; }

 bool ReluOp::InferShape() const {
-  CHECK_OR_FALSE(param_.input);
-  CHECK_OR_FALSE(param_.output);
+  CHECK_OR_FALSE(param_.X);
+  CHECK_OR_FALSE(param_.Out);
   // TODO(Superjomn) Enable data sharing.
-  param_.output->Resize(param_.input->dims());
+  param_.Out->Resize(param_.X->dims());
   // share lod
-  // param_.output->set_lod(param_.input->lod());
+  // param_.output->set_lod(param_.X->lod());
   return true;
 }

 bool ReluOp::AttachImpl(const cpp::OpDesc& opdesc, lite::Scope* scope) {
-  param_.input = const_cast<lite::Tensor*>(
+  param_.X = const_cast<lite::Tensor*>(
       &scope->FindVar(opdesc.Input("X").front())->Get<lite::Tensor>());
-  param_.output =
+  param_.Out =
       scope->FindVar(opdesc.Output("Out").front())->GetMutable<lite::Tensor>();
-  CHECK(param_.input);
-  CHECK(param_.output);
+  CHECK(param_.X);
+  CHECK(param_.Out);
   return true;
 }
paddle/fluid/lite/operators/relu_op.h
@@ -38,7 +38,7 @@ class ReluOp : public OpLite {
   std::string DebugString() const override { return "relu"; }

  private:
-  mutable ReluParam param_;
+  mutable ActivationParam param_;
 };

 }  // namespace operators
paddle/fluid/lite/operators/use_ops.h
@@ -34,3 +34,4 @@ USE_LITE_OP(conv2d)
 USE_LITE_OP(depthwise_conv2d)
 USE_LITE_OP(pool2d)
 USE_LITE_OP(batch_norm)
+USE_LITE_OP(fusion_elementwise_sub_activation)
paddle/fluid/lite/tools/build.sh
@@ -99,7 +99,7 @@ function test_arm_android {
    echo "test name: ${test_name}"
    adb_work_dir="/data/local/tmp"

-    skip_list=("test_model_parser_lite" "test_cxx_api_lite")
+    skip_list=("test_model_parser_lite" "test_mobilenetv1_lite" "test_mobilenetv2_lite" "test_resnet50_lite" "test_inceptionv4_lite")
    for skip_name in ${skip_list[@]} ; do
        [[ $skip_name =~ (^|[[:space:]])$test_name($|[[:space:]]) ]] && echo "skip $test_name" && return
    done
@@ -136,7 +136,7 @@ function test_arm_model {
    adb -s emulator-${port} push ${testpath} ${adb_work_dir}
    adb -s emulator-${port} shell chmod +x "${adb_work_dir}/${test_name}"
    local adb_model_path="${adb_work_dir}/`basename ${model_dir}`"
-    adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --eval_model_dir=$adb_model_path"
+    adb -s emulator-${port} shell "${adb_work_dir}/${test_name} --model_dir=$adb_model_path"
 }
@@ -305,8 +305,8 @@ function build_test_arm_subtask_armlinux {
    echo "Done"
 }

-# sub-task 3
-function build_test_arm_subtask3_mobilenet_v2 {
+# sub-task-model
+function build_test_arm_subtask_model {
    local port_armv8=5554
    local port_armv7=5556
    # We just test following single one environment to limit the CI time.
@@ -314,17 +314,20 @@ function build_test_arm_subtask3_mobilenet_v2 {
    local abi=armv8
    local lang=gcc

+    local test_name=$1
+    local model_name=$2
+
    cur_dir=$(pwd)
    build_dir=$cur_dir/build.lite.${os}.${abi}.${lang}
    mkdir -p $build_dir
    cd $build_dir

    cmake_arm $os $abi $lang
-    make test_cxx_api_lite -j$NUM_CORES_FOR_COMPILE
+    make $test_name -j$NUM_CORES_FOR_COMPILE

    prepare_emulator $port_armv8 $port_armv7

    # just test the model on armv8
-    test_arm_model "test_cxx_api_lite" $port_armv8 "./third_party/install/mobilenet_v2_relu"
+    test_arm_model $test_name $port_armv8 "./third_party/install/$model_name"

    adb devices | grep emulator | cut -f1 | while read line; do adb -s $line emu kill; done
    echo "Done"
@@ -441,8 +444,20 @@ function main {
            build_test_arm_subtask_armlinux
            shift
            ;;
-        build_test_arm_model1)
-            build_test_arm_subtask3_mobilenet_v2
+        build_test_arm_model_mobilenetv1)
+            build_test_arm_subtask_model test_mobilenetv1_lite mobilenet_v1
+            shift
+            ;;
+        build_test_arm_model_mobilenetv2)
+            build_test_arm_subtask_model test_mobilenetv2_lite mobilenet_v2
+            shift
+            ;;
+        build_test_arm_model_resnet50)
+            build_test_arm_subtask_model test_resnet50_lite resnet50
+            shift
+            ;;
+        build_test_arm_model_inceptionv4)
+            build_test_arm_subtask_model test_inceptionv4_lite inception_v4
            shift
            ;;
        check_style)