未验证 提交 5d05c474 编写于 作者: R Ray Liu 提交者: GitHub

Merge pull request #1048 from hjchen2/dev-latest

Add quantize and dequantize op and fix some code style
cmake_minimum_required(VERSION 3.0) cmake_minimum_required(VERSION 3.0)
option(USE_OPENMP "openmp support" OFF)
project(paddle-mobile) project(paddle-mobile)
# select the platform to build
option(CPU "armv7 with neon support" ON)
option(MALI_GPU "mali gpu support" OFF)
option(FPGA "fpga support" OFF)
option(USE_OPENMP "openmp support" OFF)
option(DEBUGING "enable debug mode" ON) option(DEBUGING "enable debug mode" ON)
option(USE_EXCEPTION "use std exception" OFF) option(USE_EXCEPTION "use std exception" OFF)
option(LOG_PROFILE "log profile" OFF) option(LOG_PROFILE "log profile" OFF)
# select the platform to build
option(CPU "armv7 with neon" ON)
option(MALI_GPU "mali gpu" OFF)
option(FPGA "fpga" OFF)
file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm) file(GLOB_RECURSE PADDLE_MOBILE_CC src/*.cc src/*.cpp src/*.c src/*.mm)
file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h) file(GLOB_RECURSE PADDLE_MOBILE_H src/*.h)
include_directories(src/) include_directories(src/)
set(CMAKE_BUILD_TYPE Release)
add_definitions(-O3 -s -DNDEBUG)
if(IS_IOS) if(IS_IOS)
set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc -std=gnu++11 -stdlib=libc++ -O3 -s -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "-mfpu=neon -marm -fobjc-abi-version=2 -fobjc-arc \
-std=gnu++11 -stdlib=libc++ -isysroot ${CMAKE_OSX_SYSROOT} ${CMAKE_CXX_FLAGS}")
else() else()
set(CMAKE_CXX_FLAGS "-std=c++14 -O3 -s ${CMAKE_CXX_FLAGS}") set(CMAKE_CXX_FLAGS "-std=c++11 ${CMAKE_CXX_FLAGS}")
endif() endif()
if (DEBUGING) if(DEBUGING)
message(STATUS "debug") message(STATUS "debugging mode")
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-DPADDLE_MOBILE_DEBUG) add_definitions(-DPADDLE_MOBILE_DEBUG)
else () else()
set(CMAKE_BUILD_TYPE Release)
set(CMAKE_CXX_FLAGS_RELEASE "-DNDEBUG")
add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden) add_definitions(-fvisibility=hidden -fvisibility-inlines-hidden)
endif () endif()
if (USE_EXCEPTION) if(USE_EXCEPTION)
message(STATUS "use exception") message(STATUS "use exception")
add_definitions(-DENABLE_EXCEPTION) add_definitions(-DENABLE_EXCEPTION -fexceptions)
add_definitions(-fexceptions)
else() else()
add_definitions(-fno-exceptions) add_definitions(-fno-exceptions)
endif () endif()
if (LOG_PROFILE) if(LOG_PROFILE)
add_definitions(-DPADDLE_MOBILE_PROFILE) add_definitions(-DPADDLE_MOBILE_PROFILE)
endif() endif()
...@@ -50,12 +47,12 @@ if(USE_OPENMP) ...@@ -50,12 +47,12 @@ if(USE_OPENMP)
endif() endif()
# platform control # platform control
if (ARM_LINUX) if(ARM_LINUX)
include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake") include("${CMAKE_CURRENT_LIST_DIR}/tools/arm-platform.cmake")
endif () endif()
if (CPU) if(CPU)
add_definitions(-DPADDLE_MOBILE_CPU) add_definitions(-DPADDLE_MOBILE_CPU)
else() else()
file(GLOB_RECURSE _tmp_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc) file(GLOB_RECURSE _tmp_list src/operators/kernel/arm/*.cpp src/operators/kernel/arm/*.cc)
foreach(f ${_tmp_list}) foreach(f ${_tmp_list})
...@@ -68,7 +65,7 @@ else() ...@@ -68,7 +65,7 @@ else()
endforeach() endforeach()
endif() endif()
if (MALI_GPU) if(MALI_GPU)
add_definitions(-DPADDLE_MOBILE_MALI_GPU) add_definitions(-DPADDLE_MOBILE_MALI_GPU)
add_definitions(-DUSE_ACL=1) add_definitions(-DUSE_ACL=1)
add_definitions(-DUSE_OPENCL) add_definitions(-DUSE_OPENCL)
...@@ -120,20 +117,20 @@ else() ...@@ -120,20 +117,20 @@ else()
endforeach() endforeach()
endif() endif()
if (ANDROID_NDK_TOOLCHAIN_INCLUDED) if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog") set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -llog")
else() else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h) list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp) list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/jni/paddle_mobile_jni.cpp)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h) list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/operators/math/math_func_neon.h)
endif () endif()
if (IS_IOS) if(IS_IOS)
else() else()
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h) list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.h)
list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm) list(REMOVE_ITEM PADDLE_MOBILE_CC ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/PaddleMobileCPU.mm)
list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h) list(REMOVE_ITEM PADDLE_MOBILE_H ${CMAKE_CURRENT_SOURCE_DIR}/src/ios_io/op_symbols.h)
endif () endif()
set(CMAKE_VERBOSE_MAKEFILE ON) set(CMAKE_VERBOSE_MAKEFILE ON)
set(CMAKE_EXPORT_COMPILE_COMMANDS ON) set(CMAKE_EXPORT_COMPILE_COMMANDS ON)
...@@ -142,7 +139,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) ...@@ -142,7 +139,7 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build)
set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build)
# NET default # NET default
if (FPGA) if(FPGA)
set(NET "FPGAnets" CACHE STRING "select net type") set(NET "FPGAnets" CACHE STRING "select net type")
else() else()
set(NET "default" CACHE STRING "select net type") set(NET "default" CACHE STRING "select net type")
...@@ -153,7 +150,7 @@ include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake") ...@@ -153,7 +150,7 @@ include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake")
# build library # build library
if (ANDROID_NDK_TOOLCHAIN_INCLUDED) if(ANDROID_NDK_TOOLCHAIN_INCLUDED)
list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS) list(REMOVE_DUPLICATES CMAKE_CXX_FLAGS)
add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
elseif(IS_IOS) elseif(IS_IOS)
...@@ -168,9 +165,9 @@ elseif(IS_IOS) ...@@ -168,9 +165,9 @@ elseif(IS_IOS)
else() else()
add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) add_library(paddle-mobile STATIC ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
endif() endif()
else () else()
add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H}) add_library(paddle-mobile SHARED ${PADDLE_MOBILE_CC} ${PADDLE_MOBILE_H})
endif () endif()
# unit test # unit test
if(DEBUGING) if(DEBUGING)
......
...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and ...@@ -13,6 +13,7 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once #pragma once
#ifdef PADDLE_EXECUTOR_MULTITHREAD #ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <string> #include <string>
#include <unordered_map> #include <unordered_map>
...@@ -60,6 +61,7 @@ class depCore { ...@@ -60,6 +61,7 @@ class depCore {
std::vector<std::vector<int>> deps; std::vector<std::vector<int>> deps;
std::vector<std::vector<int>> next; std::vector<std::vector<int>> next;
}; };
} // namespace paddle_mobile } // namespace paddle_mobile
#endif #endif
...@@ -63,6 +63,9 @@ const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp"; ...@@ -63,6 +63,9 @@ const char *G_OP_TYPE_BILINEAR_INTERP = "bilinear_interp";
const char *G_OP_TYPE_FLATTEN = "flatten"; const char *G_OP_TYPE_FLATTEN = "flatten";
const char *G_OP_TYPE_SHAPE = "shape"; const char *G_OP_TYPE_SHAPE = "shape";
const char *G_OP_TYPE_QUANTIZE = "quantize";
const char *G_OP_TYPE_DEQUANTIZE = "dequantize";
std::unordered_map< std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>> std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key = { op_input_output_key = {
...@@ -111,6 +114,8 @@ std::unordered_map< ...@@ -111,6 +114,8 @@ std::unordered_map<
{G_OP_TYPE_BILINEAR_INTERP, {{"OutSize", "X"}, {"Out"}}}, {G_OP_TYPE_BILINEAR_INTERP, {{"OutSize", "X"}, {"Out"}}},
{G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}}, {G_OP_TYPE_FLATTEN, {{"X"}, {"Out"}}},
{G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}}, {G_OP_TYPE_SHAPE, {{"Input"}, {"Out"}}},
{G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}}}; {G_OP_TYPE_CONV_TRANSPOSE, {{"Input"}, {"Output"}}},
{G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}},
{G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}};
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -79,6 +79,13 @@ enum PMStatus { ...@@ -79,6 +79,13 @@ enum PMStatus {
PMWrongDevice = 0x08 /*!< un-correct device. */ PMWrongDevice = 0x08 /*!< un-correct device. */
}; };
// Rounding strategy used when quantizing float values to fixed-point.
// Names follow the usual rounding-mode terminology; ROUND_UNK marks an
// unset/unknown mode.
enum RoundType {
  ROUND_UNK = 0,                  // not specified
  ROUND_NEAREST_AWAY_ZERO = 1,    // round half away from zero
  ROUND_NEAREST_TOWARDS_ZERO = 2, // round half toward zero (truncating ties)
  ROUND_NEAREST_TO_EVEN = 3       // round half to even (banker's rounding)
};
extern const char *G_OP_TYPE_CONV; extern const char *G_OP_TYPE_CONV;
extern const char *G_OP_TYPE_BATCHNORM; extern const char *G_OP_TYPE_BATCHNORM;
extern const char *G_OP_TYPE_BOX_CODER; extern const char *G_OP_TYPE_BOX_CODER;
...@@ -120,6 +127,9 @@ extern const char *G_OP_TYPE_FUSION_CONV_BN; ...@@ -120,6 +127,9 @@ extern const char *G_OP_TYPE_FUSION_CONV_BN;
extern const char *G_OP_TYPE_CONV_TRANSPOSE; extern const char *G_OP_TYPE_CONV_TRANSPOSE;
extern const char *G_OP_TYPE_PRELU; extern const char *G_OP_TYPE_PRELU;
extern const char *G_OP_TYPE_QUANTIZE;
extern const char *G_OP_TYPE_DEQUANTIZE;
extern std::unordered_map< extern std::unordered_map<
std::string, std::pair<std::vector<std::string>, std::vector<std::string>>> std::string, std::pair<std::vector<std::string>, std::vector<std::string>>>
op_input_output_key; op_input_output_key;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "common/util.h"
namespace paddle_mobile {
// Reads the entire file `filename` into a freshly allocated buffer.
// Returns a heap buffer of exactly the file's size (no NUL terminator is
// appended); the caller owns it and must release it with delete[].
// Enforce failures are raised via PADDLE_MOBILE_ENFORCE for: unopenable
// file, empty file, and short reads.
char *ReadFileToBuff(std::string filename) {
  FILE *file = fopen(filename.c_str(), "rb");
  PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
                        filename.c_str());
  fseek(file, 0, SEEK_END);
  int64_t size = ftell(file);
  // NOTE(review): if PADDLE_MOBILE_ENFORCE unwinds (throws) rather than
  // aborting, `file` leaks on this path -- confirm the macro's semantics.
  PADDLE_MOBILE_ENFORCE(size > 0, "file should not be empty");
  rewind(file);
  char *data = new char[size];
  // Cast once so the comparison below is unsigned/unsigned; the original
  // compared size_t against int64_t (sign-compare warning territory).
  size_t expected = static_cast<size_t>(size);
  size_t bytes_read = fread(data, 1, expected, file);
  PADDLE_MOBILE_ENFORCE(bytes_read == expected,
                        "read binary file bytes do not match with fseek");
  fclose(file);
  return data;
}
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "common/enforce.h"
namespace paddle_mobile {
// Reads the whole file into a heap buffer (no NUL terminator); the caller
// owns the returned buffer and must free it with delete[]. Open/read
// failures and empty files are reported through PADDLE_MOBILE_ENFORCE.
char *ReadFileToBuff(std::string filename);
}  // namespace paddle_mobile
...@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,15 +12,15 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "api.h" #include "fpga/api.h"
#include <fcntl.h> #include <fcntl.h>
#include <sys/ioctl.h> #include <sys/ioctl.h>
#include <sys/mman.h> #include <sys/mman.h>
#include <algorithm> #include <algorithm>
#include <map> #include <map>
#include "bias_scale.h" #include "fpga/bias_scale.h"
#include "filter.h" #include "fpga/filter.h"
#include "image.h" #include "fpga/image.h"
#define FPGA_TEST_MODE #define FPGA_TEST_MODE
#define PADDLE_MOBILE_OS_LINUX #define PADDLE_MOBILE_OS_LINUX
...@@ -263,7 +263,7 @@ void format_image(framework::Tensor *image_tensor) { ...@@ -263,7 +263,7 @@ void format_image(framework::Tensor *image_tensor) {
auto channel = dims[1], height = dims[2], width = dims[3]; auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->data<float>(); auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float); size_t memory_size = channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size); fpga_copy(new_data, data_ptr, memory_size);
image::format_image(&new_data, channel, height, width); image::format_image(&new_data, channel, height, width);
image_tensor->reset_data_ptr(new_data); image_tensor->reset_data_ptr(new_data);
...@@ -334,13 +334,13 @@ int get_aligned_filter_num(int num) { ...@@ -334,13 +334,13 @@ int get_aligned_filter_num(int num) {
void format_filter(framework::Tensor *filter_tensor, float max_value, void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) { int group_num) {
filter_tensor->scale[0] = float(max_value / 127.0); filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
auto dims = filter_tensor->dims(); auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>(); auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float); size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
fpga_copy(new_data, data_ptr, memory_size); fpga_copy(new_data, data_ptr, memory_size);
filter::format_filter(&new_data, num, channel, height, width, group_num, filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value); max_value);
...@@ -381,7 +381,8 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, ...@@ -381,7 +381,8 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg->filter_num = (uint32_t)filter->dims()[0]; arg->filter_num = (uint32_t)filter->dims()[0];
arg->output.address = out_ptr; arg->output.address = out_ptr;
arg->output.scale_address = out->scale; arg->output.scale_address = out->scale;
arg->conv_args = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); arg->conv_args =
(ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT
arg->concat_arg.image_num = arg->split_num; arg->concat_arg.image_num = arg->split_num;
arg->concat_arg.image_out = out_ptr; arg->concat_arg.image_out = out_ptr;
...@@ -390,12 +391,15 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, ...@@ -390,12 +391,15 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg->concat_arg.width = (uint32_t)filter->dims()[3]; arg->concat_arg.width = (uint32_t)filter->dims()[3];
int n = arg->split_num; int n = arg->split_num;
arg->concat_arg.images_in = (half **)fpga_malloc(n * sizeof(int *)); arg->concat_arg.images_in =
arg->concat_arg.scales_in = (float **)fpga_malloc(n * sizeof(float *)); (half **)fpga_malloc(n * sizeof(int *)); // NOLINT
arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); arg->concat_arg.scales_in =
(float **)fpga_malloc(n * sizeof(float *)); // NOLINT
arg->concat_arg.channel_num =
(uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT
arg->concat_arg.image_out = out_ptr; arg->concat_arg.image_out = out_ptr;
auto channel = (int)out->dims()[1]; auto channel = (int)out->dims()[1]; // NOLINT
int filter_num_per_div = get_filter_num_per_div(filter, group_num); int filter_num_per_div = get_filter_num_per_div(filter, group_num);
int element_num = get_aligned_filter_element_num( int element_num = get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
...@@ -415,29 +419,28 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, ...@@ -415,29 +419,28 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg->conv_args[i].image.pad_height = (uint32_t)padding_h; arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
arg->conv_args[i].image.pad_width = (uint32_t)padding_w; arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
arg->conv_args[i].filter_scale_address = filter->scale; arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].filter_address = arg->conv_args[i].filter_address = &(
&((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT
arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_args[i].filter_num = arg->conv_args[i].filter_num = (uint32_t)(
(uint32_t)(i == n - 1 ? channel - (n - 1) * filter_num_per_div i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT
: filter_num_per_div); : filter_num_per_div);
if (n > 1) { if (n > 1) {
arg->conv_args[i].output.scale_address = arg->conv_args[i].output.scale_address =
(float *)fpga_malloc(2 * sizeof(float)); (float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_args[i].output.address = fpga_malloc( arg->conv_args[i].output.address = fpga_malloc(
input->dims()[2] * input->dims()[2] *
align_to_x(input->dims()[3] * arg->conv_args[i].filter_num, align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
IMAGE_ALIGNMENT) * IMAGE_ALIGNMENT) *
sizeof(half)); sizeof(half));
} } else {
else {
arg->conv_args[i].output.scale_address = out->scale; arg->conv_args[i].output.scale_address = out->scale;
arg->conv_args[i].output.address = out_ptr; arg->conv_args[i].output.address = out_ptr;
} }
arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address; arg->concat_arg.images_in[i] =
(half *)arg->conv_args[i].output.address; // NOLINT
arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address; arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num; arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
} }
......
...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "bias_scale.h" #include "fpga/bias_scale.h"
#include <memory.h> #include <memory.h>
#include "api.h" #include "fpga/api.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
...@@ -29,7 +29,8 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { ...@@ -29,7 +29,8 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
int num_element = int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale 2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned = (float *)fpga_malloc(num_element * sizeof(float)); float *ptr_aligned =
(float *)fpga_malloc(num_element * sizeof(float)); // NOLINT
memset(ptr_aligned, 0, num_element * sizeof(float)); memset(ptr_aligned, 0, num_element * sizeof(float));
...@@ -59,7 +60,7 @@ void interleave(float **data_in, int num_after_alignment) { ...@@ -59,7 +60,7 @@ void interleave(float **data_in, int num_after_alignment) {
float *ptr_uninterleaved = *data_in; float *ptr_uninterleaved = *data_in;
float *ptr_interleaved = float *ptr_interleaved =
(float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); (float *)fpga_malloc(2 * num_after_alignment * sizeof(float)); // NOLINT
int num = num_after_alignment / 4; int num = num_after_alignment / 4;
for (int i = 0; i < num; i++) { for (int i = 0; i < num; i++) {
memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i, memcpy(ptr_interleaved + 8 * i, ptr_uninterleaved + 4 * i,
......
...@@ -11,9 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,9 +11,11 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "filter.h"
#include "fpga/filter.h"
#include <memory.h> #include <memory.h>
#include "api.h" #include <algorithm>
#include "fpga/api.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
...@@ -55,7 +57,7 @@ void convert_to_hwc(char **data_in, int num, int channel, int height, ...@@ -55,7 +57,7 @@ void convert_to_hwc(char **data_in, int num, int channel, int height,
int width) { int width) {
char *tmp = *data_in; char *tmp = *data_in;
int chw = channel * height * width; int chw = channel * height * width;
char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT
for (int n = 0; n < num; n++) { for (int n = 0; n < num; n++) {
int64_t amount_per_row = width * channel; int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) { for (int c = 0; c < channel; c++) {
...@@ -102,7 +104,7 @@ void quantize(float **data_in, int data_size, float max) { ...@@ -102,7 +104,7 @@ void quantize(float **data_in, int data_size, float max) {
tmp_data[i] = float_to_int8( tmp_data[i] = float_to_int8(
(*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale); (*data_in)[i] * scale); // (signed char)((*data_in)[i] * scale);
} }
*data_in = (float *)tmp_data; *data_in = (float *)tmp_data; // NOLINT
fpga_free(tmp); fpga_free(tmp);
} }
...@@ -112,7 +114,8 @@ void align_element(char **data_in, int num, int chw) { ...@@ -112,7 +114,8 @@ void align_element(char **data_in, int num, int chw) {
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
if (align_chw != chw) { if (align_chw != chw) {
char *tmp = *data_in; char *tmp = *data_in;
char *data_tmp = (char *)fpga_malloc(num * align_chw * sizeof(char)); char *data_tmp =
(char *)fpga_malloc(num * align_chw * sizeof(char)); // NOLINT
memset(data_tmp, 0, num * align_chw); memset(data_tmp, 0, num * align_chw);
for (j = 0; j < num; j++) { for (j = 0; j < num; j++) {
...@@ -134,7 +137,7 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num, ...@@ -134,7 +137,7 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num,
int div_num = int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw; int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(char)); memset(data_tmp, 0, num_element * sizeof(char));
...@@ -156,7 +159,8 @@ void reorder(char **data_in, int num_after_alignment, int chw) { ...@@ -156,7 +159,8 @@ void reorder(char **data_in, int num_after_alignment, int chw) {
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp = char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char)); (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in; char *tmp = *data_in;
for (index = 0; index < num_after_alignment; index++) { for (index = 0; index < num_after_alignment; index++) {
new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) + new_index = index / 32 * 32 + (index % 16 / 4 * 8) + (index % 16 % 4) +
...@@ -173,10 +177,11 @@ void interleave(char **data_in, int num_after_alignment, int chw) { ...@@ -173,10 +177,11 @@ void interleave(char **data_in, int num_after_alignment, int chw) {
int j = 0; int j = 0;
int k = 0; int k = 0;
int interleave_per_num = 16; int interleave_per_num = 16;
;
int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); int chw_align = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
char *data_tmp = char *data_tmp =
(char *)fpga_malloc(chw_align * num_after_alignment * sizeof(char)); (char *)fpga_malloc(chw_align * num_after_alignment * // NOLINT
sizeof(char));
char *tmp = *data_in; char *tmp = *data_in;
int interleave_num = chw_align * 2 / interleave_per_num; int interleave_num = chw_align * 2 / interleave_per_num;
for (i = 0; i < num_after_alignment; i += 2) { for (i = 0; i < num_after_alignment; i += 2) {
...@@ -209,7 +214,7 @@ void format_filter(float **data_in, int num, int channel, int height, int width, ...@@ -209,7 +214,7 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
quantize(data_in, data_size, max); quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width); convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw); align_element(quantize_data, num, chw);
......
...@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,9 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "image.h" #include "fpga/image.h"
#include <memory.h> #include <memory.h>
#include "api.h" #include <algorithm>
#include "fpga/api.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
...@@ -23,7 +24,7 @@ namespace image { ...@@ -23,7 +24,7 @@ namespace image {
void convert_to_hwc(float **data_in, int channel, int height, int width) { void convert_to_hwc(float **data_in, int channel, int height, int width) {
float *tmp = *data_in; float *tmp = *data_in;
float *data_tmp = float *data_tmp =
(float *)fpga_malloc(channel * height * width * sizeof(float)); (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT
int64_t amount_per_row = width * channel; int64_t amount_per_row = width * channel;
for (int c = 0; c < channel; c++) { for (int c = 0; c < channel; c++) {
for (int h = 0; h < height; h++) { for (int h = 0; h < height; h++) {
...@@ -42,12 +43,14 @@ void align_element_conv(float **data_in, int height, int cw) { ...@@ -42,12 +43,14 @@ void align_element_conv(float **data_in, int height, int cw) {
int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
if (align_cw != cw) { if (align_cw != cw) {
float *tmp = *data_in; float *tmp = *data_in;
float *data_tmp = (float *)fpga_malloc(height * align_cw * sizeof(float)); float *data_tmp =
(float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT
memset(data_tmp, 0, height * align_cw * sizeof(float)); memset(data_tmp, 0, height * align_cw * sizeof(float));
for (h = 0; h < height; h++) { for (h = 0; h < height; h++) {
memcpy((void *)(data_tmp + h * align_cw), (void *)(*data_in + h * cw), memcpy((void *)(data_tmp + h * align_cw), // NOLINT
(void *)(*data_in + h * cw), // NOLINT
cw * sizeof(float)); cw * sizeof(float));
} }
...@@ -95,7 +98,7 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, ...@@ -95,7 +98,7 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
for (i = 0; i < image_num; i++) { for (i = 0; i < image_num; i++) {
align_each_in_area_cw = align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT); align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy((int16_t *)image_out + tmp_channel + memcpy((int16_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ, k * align_each_out_area_cw_differ,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw, images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int16_t)); channel_num[i] * sizeof(int16_t));
......
...@@ -7,6 +7,35 @@ ...@@ -7,6 +7,35 @@
#endif #endif
#include "framework.pb-c.h" #include "framework.pb-c.h"
/* Generated protobuf-c helper: resets a Version message to its
 * default-initialized (INIT) state. */
void paddle_mobile__framework__proto__version__init(
    PaddleMobile__Framework__Proto__Version *message) {
  static const PaddleMobile__Framework__Proto__Version init_value =
      PADDLE_MOBILE__FRAMEWORK__PROTO__VERSION__INIT;
  *message = init_value;
}
/* Generated protobuf-c helper: returns the number of bytes the Version
 * message would occupy on the wire. Asserts the message really is a
 * Version (descriptor check). */
size_t paddle_mobile__framework__proto__version__get_packed_size(
    const PaddleMobile__Framework__Proto__Version *message) {
  assert(message->base.descriptor ==
         &paddle_mobile__framework__proto__version__descriptor);
  return protobuf_c_message_get_packed_size(
      (const ProtobufCMessage *)(message));
}
/* Generated protobuf-c helper: decodes `len` wire-format bytes from `data`
 * into a newly allocated Version message (via `allocator`, or the default
 * allocator when NULL). Release with the matching __free_unpacked(). */
PaddleMobile__Framework__Proto__Version *
paddle_mobile__framework__proto__version__unpack(ProtobufCAllocator *allocator,
                                                 size_t len,
                                                 const uint8_t *data) {
  return (PaddleMobile__Framework__Proto__Version *)protobuf_c_message_unpack(
      &paddle_mobile__framework__proto__version__descriptor, allocator, len,
      data);
}
/* Generated protobuf-c helper: frees a Version message previously produced
 * by __unpack(), using the same allocator. Safe to call with NULL. */
void paddle_mobile__framework__proto__version__free_unpacked(
    PaddleMobile__Framework__Proto__Version *message,
    ProtobufCAllocator *allocator) {
  if (!message) return;
  assert(message->base.descriptor ==
         &paddle_mobile__framework__proto__version__descriptor);
  protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator);
}
void paddle_mobile__framework__proto__op_desc__attr__init( void paddle_mobile__framework__proto__op_desc__attr__init(
PaddleMobile__Framework__Proto__OpDesc__Attr *message) { PaddleMobile__Framework__Proto__OpDesc__Attr *message) {
static const PaddleMobile__Framework__Proto__OpDesc__Attr init_value = static const PaddleMobile__Framework__Proto__OpDesc__Attr init_value =
...@@ -32,7 +61,6 @@ size_t paddle_mobile__framework__proto__op_desc__get_packed_size( ...@@ -32,7 +61,6 @@ size_t paddle_mobile__framework__proto__op_desc__get_packed_size(
return protobuf_c_message_get_packed_size( return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message)); (const ProtobufCMessage *)(message));
} }
PaddleMobile__Framework__Proto__OpDesc * PaddleMobile__Framework__Proto__OpDesc *
paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator,
size_t len, size_t len,
...@@ -74,7 +102,6 @@ size_t paddle_mobile__framework__proto__op_proto__get_packed_size( ...@@ -74,7 +102,6 @@ size_t paddle_mobile__framework__proto__op_proto__get_packed_size(
return protobuf_c_message_get_packed_size( return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message)); (const ProtobufCMessage *)(message));
} }
PaddleMobile__Framework__Proto__OpProto * PaddleMobile__Framework__Proto__OpProto *
paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator, paddle_mobile__framework__proto__op_proto__unpack(ProtobufCAllocator *allocator,
size_t len, size_t len,
...@@ -171,7 +198,6 @@ size_t paddle_mobile__framework__proto__var_desc__get_packed_size( ...@@ -171,7 +198,6 @@ size_t paddle_mobile__framework__proto__var_desc__get_packed_size(
return protobuf_c_message_get_packed_size( return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message)); (const ProtobufCMessage *)(message));
} }
PaddleMobile__Framework__Proto__VarDesc * PaddleMobile__Framework__Proto__VarDesc *
paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator, paddle_mobile__framework__proto__var_desc__unpack(ProtobufCAllocator *allocator,
size_t len, size_t len,
...@@ -201,7 +227,6 @@ size_t paddle_mobile__framework__proto__block_desc__get_packed_size( ...@@ -201,7 +227,6 @@ size_t paddle_mobile__framework__proto__block_desc__get_packed_size(
return protobuf_c_message_get_packed_size( return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message)); (const ProtobufCMessage *)(message));
} }
PaddleMobile__Framework__Proto__BlockDesc * PaddleMobile__Framework__Proto__BlockDesc *
paddle_mobile__framework__proto__block_desc__unpack( paddle_mobile__framework__proto__block_desc__unpack(
ProtobufCAllocator *allocator, size_t len, const uint8_t *data) { ProtobufCAllocator *allocator, size_t len, const uint8_t *data) {
...@@ -230,7 +255,6 @@ size_t paddle_mobile__framework__proto__program_desc__get_packed_size( ...@@ -230,7 +255,6 @@ size_t paddle_mobile__framework__proto__program_desc__get_packed_size(
return protobuf_c_message_get_packed_size( return protobuf_c_message_get_packed_size(
(const ProtobufCMessage *)(message)); (const ProtobufCMessage *)(message));
} }
PaddleMobile__Framework__Proto__ProgramDesc * PaddleMobile__Framework__Proto__ProgramDesc *
paddle_mobile__framework__proto__program_desc__unpack( paddle_mobile__framework__proto__program_desc__unpack(
ProtobufCAllocator *allocator, size_t len, const uint8_t *data) { ProtobufCAllocator *allocator, size_t len, const uint8_t *data) {
...@@ -247,8 +271,46 @@ void paddle_mobile__framework__proto__program_desc__free_unpacked( ...@@ -247,8 +271,46 @@ void paddle_mobile__framework__proto__program_desc__free_unpacked(
&paddle_mobile__framework__proto__program_desc__descriptor); &paddle_mobile__framework__proto__program_desc__descriptor);
protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator); protobuf_c_message_free_unpacked((ProtobufCMessage *)message, allocator);
} }
/* Default for the optional `version` field (proto2: [ default = 0 ]). */
static const int64_t
    paddle_mobile__framework__proto__version__version__default_value = 0ll;
static const ProtobufCFieldDescriptor static const ProtobufCFieldDescriptor
paddle_mobile__framework__proto__op_desc__attr__field_descriptors[12] = { paddle_mobile__framework__proto__version__field_descriptors[1] = {
{
"version", 1, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_INT64,
offsetof(PaddleMobile__Framework__Proto__Version, has_version),
offsetof(PaddleMobile__Framework__Proto__Version, version), NULL,
&paddle_mobile__framework__proto__version__version__default_value,
0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */
},
};
/* Field indices sorted by field name; Version has a single field. */
static const unsigned
    paddle_mobile__framework__proto__version__field_indices_by_name[] = {
        0, /* field[0] = version */
};
/* One contiguous field-number range starting at tag 1. */
static const ProtobufCIntRange
    paddle_mobile__framework__proto__version__number_ranges[1 + 1] = {{1, 0},
                                                                      {0, 1}};
/* Runtime type descriptor for paddle_mobile.framework.proto.Version.
 * Layout follows ProtobufCMessageDescriptor (protobuf-c). */
const ProtobufCMessageDescriptor
    paddle_mobile__framework__proto__version__descriptor = {
        PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
        "paddle_mobile.framework.proto.Version",   /* full name */
        "Version",                                 /* short name */
        "PaddleMobile__Framework__Proto__Version", /* generated C type name */
        "paddle_mobile.framework.proto",           /* package */
        sizeof(PaddleMobile__Framework__Proto__Version),
        1, /* number of fields */
        paddle_mobile__framework__proto__version__field_descriptors,
        paddle_mobile__framework__proto__version__field_indices_by_name,
        1, /* number of field-number ranges */
        paddle_mobile__framework__proto__version__number_ranges,
        (ProtobufCMessageInit)paddle_mobile__framework__proto__version__init,
        NULL,
        NULL,
        NULL /* reserved[123] */
};
static const ProtobufCFieldDescriptor
paddle_mobile__framework__proto__op_desc__attr__field_descriptors[13] = {
{ {
"name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
0, /* quantifier_offset */ 0, /* quantifier_offset */
...@@ -335,11 +397,20 @@ static const ProtobufCFieldDescriptor ...@@ -335,11 +397,20 @@ static const ProtobufCFieldDescriptor
NULL, 0, /* flags */ NULL, 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */ 0, NULL, NULL /* reserved1,reserved2, etc */
}, },
{
"blocks_idx", 14, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_INT32,
offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr,
n_blocks_idx),
offsetof(PaddleMobile__Framework__Proto__OpDesc__Attr, blocks_idx),
NULL, NULL, 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */
},
}; };
static const unsigned static const unsigned
paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = { paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name[] = {
8, /* field[8] = b */ 8, /* field[8] = b */
10, /* field[10] = block_idx */ 10, /* field[10] = block_idx */
12, /* field[12] = blocks_idx */
9, /* field[9] = bools */ 9, /* field[9] = bools */
3, /* field[3] = f */ 3, /* field[3] = f */
6, /* field[6] = floats */ 6, /* field[6] = floats */
...@@ -353,7 +424,7 @@ static const unsigned ...@@ -353,7 +424,7 @@ static const unsigned
}; };
static const ProtobufCIntRange static const ProtobufCIntRange
paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = { paddle_mobile__framework__proto__op_desc__attr__number_ranges[2 + 1] = {
{1, 0}, {10, 8}, {0, 12}}; {1, 0}, {10, 8}, {0, 13}};
const ProtobufCMessageDescriptor const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__op_desc__attr__descriptor = { paddle_mobile__framework__proto__op_desc__attr__descriptor = {
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
...@@ -362,7 +433,7 @@ const ProtobufCMessageDescriptor ...@@ -362,7 +433,7 @@ const ProtobufCMessageDescriptor
"PaddleMobile__Framework__Proto__OpDesc__Attr", "PaddleMobile__Framework__Proto__OpDesc__Attr",
"paddle_mobile.framework.proto", "paddle_mobile.framework.proto",
sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr), sizeof(PaddleMobile__Framework__Proto__OpDesc__Attr),
12, 13,
paddle_mobile__framework__proto__op_desc__attr__field_descriptors, paddle_mobile__framework__proto__op_desc__attr__field_descriptors,
paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name, paddle_mobile__framework__proto__op_desc__attr__field_indices_by_name,
2, 2,
...@@ -500,7 +571,7 @@ static const protobuf_c_boolean ...@@ -500,7 +571,7 @@ static const protobuf_c_boolean
paddle_mobile__framework__proto__op_proto__var__dispensable__default_value = paddle_mobile__framework__proto__op_proto__var__dispensable__default_value =
0; 0;
static const ProtobufCFieldDescriptor static const ProtobufCFieldDescriptor
paddle_mobile__framework__proto__op_proto__var__field_descriptors[5] = { paddle_mobile__framework__proto__op_proto__var__field_descriptors[6] = {
{ {
"name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING, "name", 1, PROTOBUF_C_LABEL_REQUIRED, PROTOBUF_C_TYPE_STRING,
0, /* quantifier_offset */ 0, /* quantifier_offset */
...@@ -546,6 +617,13 @@ static const ProtobufCFieldDescriptor ...@@ -546,6 +617,13 @@ static const ProtobufCFieldDescriptor
0, /* flags */ 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */ 0, NULL, NULL /* reserved1,reserved2, etc */
}, },
{
"reuse", 6, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_STRING,
0, /* quantifier_offset */
offsetof(PaddleMobile__Framework__Proto__OpProto__Var, reuse), NULL,
NULL, 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */
},
}; };
static const unsigned static const unsigned
paddle_mobile__framework__proto__op_proto__var__field_indices_by_name[] = { paddle_mobile__framework__proto__op_proto__var__field_indices_by_name[] = {
...@@ -554,10 +632,11 @@ static const unsigned ...@@ -554,10 +632,11 @@ static const unsigned
2, /* field[2] = duplicable */ 2, /* field[2] = duplicable */
3, /* field[3] = intermediate */ 3, /* field[3] = intermediate */
0, /* field[0] = name */ 0, /* field[0] = name */
5, /* field[5] = reuse */
}; };
static const ProtobufCIntRange static const ProtobufCIntRange
paddle_mobile__framework__proto__op_proto__var__number_ranges[1 + 1] = { paddle_mobile__framework__proto__op_proto__var__number_ranges[1 + 1] = {
{1, 0}, {0, 5}}; {1, 0}, {0, 6}};
const ProtobufCMessageDescriptor const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__op_proto__var__descriptor = { paddle_mobile__framework__proto__op_proto__var__descriptor = {
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
...@@ -566,7 +645,7 @@ const ProtobufCMessageDescriptor ...@@ -566,7 +645,7 @@ const ProtobufCMessageDescriptor
"PaddleMobile__Framework__Proto__OpProto__Var", "PaddleMobile__Framework__Proto__OpProto__Var",
"paddle_mobile.framework.proto", "paddle_mobile.framework.proto",
sizeof(PaddleMobile__Framework__Proto__OpProto__Var), sizeof(PaddleMobile__Framework__Proto__OpProto__Var),
5, 6,
paddle_mobile__framework__proto__op_proto__var__field_descriptors, paddle_mobile__framework__proto__op_proto__var__field_descriptors,
paddle_mobile__framework__proto__op_proto__var__field_indices_by_name, paddle_mobile__framework__proto__op_proto__var__field_indices_by_name,
1, 1,
...@@ -1012,7 +1091,7 @@ const ProtobufCMessageDescriptor ...@@ -1012,7 +1091,7 @@ const ProtobufCMessageDescriptor
NULL /* reserved[123] */ NULL /* reserved[123] */
}; };
static const ProtobufCEnumValue static const ProtobufCEnumValue
paddle_mobile__framework__proto__var_type__type__enum_values_by_number[19] = paddle_mobile__framework__proto__var_type__type__enum_values_by_number[22] =
{ {
{"BOOL", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL", {"BOOL", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__BOOL",
0}, 0},
...@@ -1057,31 +1136,29 @@ static const ProtobufCEnumValue ...@@ -1057,31 +1136,29 @@ static const ProtobufCEnumValue
{"RAW", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW", 17}, {"RAW", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__RAW", 17},
{"TUPLE", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE", {"TUPLE", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__TUPLE",
18}, 18},
{"SIZE_T",
"PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T", 19},
{"UINT8", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8",
20},
{"INT8", "PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8",
21},
}; };
static const ProtobufCIntRange static const ProtobufCIntRange
paddle_mobile__framework__proto__var_type__type__value_ranges[] = {{0, 0}, paddle_mobile__framework__proto__var_type__type__value_ranges[] = {{0, 0},
{0, 19}}; {0, 22}};
static const ProtobufCEnumValueIndex static const ProtobufCEnumValueIndex
paddle_mobile__framework__proto__var_type__type__enum_values_by_name[19] = { paddle_mobile__framework__proto__var_type__type__enum_values_by_name[22] = {
{"BOOL", 0}, {"BOOL", 0}, {"CHANNEL", 16},
{"CHANNEL", 16}, {"FEED_MINIBATCH", 9}, {"FETCH_LIST", 10},
{"FEED_MINIBATCH", 9}, {"FP16", 4}, {"FP32", 5},
{"FETCH_LIST", 10}, {"FP64", 6}, {"INT16", 1},
{"FP16", 4}, {"INT32", 2}, {"INT64", 3},
{"FP32", 5}, {"INT8", 21}, {"LOD_RANK_TABLE", 12},
{"FP64", 6}, {"LOD_TENSOR", 7}, {"LOD_TENSOR_ARRAY", 13},
{"INT16", 1}, {"PLACE_LIST", 14}, {"RAW", 17},
{"INT32", 2}, {"READER", 15}, {"SELECTED_ROWS", 8},
{"INT64", 3}, {"SIZE_T", 19}, {"STEP_SCOPES", 11},
{"LOD_RANK_TABLE", 12}, {"TUPLE", 18}, {"UINT8", 20},
{"LOD_TENSOR", 7},
{"LOD_TENSOR_ARRAY", 13},
{"PLACE_LIST", 14},
{"RAW", 17},
{"READER", 15},
{"SELECTED_ROWS", 8},
{"STEP_SCOPES", 11},
{"TUPLE", 18},
}; };
const ProtobufCEnumDescriptor const ProtobufCEnumDescriptor
paddle_mobile__framework__proto__var_type__type__descriptor = { paddle_mobile__framework__proto__var_type__type__descriptor = {
...@@ -1090,9 +1167,9 @@ const ProtobufCEnumDescriptor ...@@ -1090,9 +1167,9 @@ const ProtobufCEnumDescriptor
"Type", "Type",
"PaddleMobile__Framework__Proto__VarType__Type", "PaddleMobile__Framework__Proto__VarType__Type",
"paddle_mobile.framework.proto", "paddle_mobile.framework.proto",
19, 22,
paddle_mobile__framework__proto__var_type__type__enum_values_by_number, paddle_mobile__framework__proto__var_type__type__enum_values_by_number,
19, 22,
paddle_mobile__framework__proto__var_type__type__enum_values_by_name, paddle_mobile__framework__proto__var_type__type__enum_values_by_name,
1, 1,
paddle_mobile__framework__proto__var_type__type__value_ranges, paddle_mobile__framework__proto__var_type__type__value_ranges,
...@@ -1325,7 +1402,7 @@ const ProtobufCMessageDescriptor ...@@ -1325,7 +1402,7 @@ const ProtobufCMessageDescriptor
NULL /* reserved[123] */ NULL /* reserved[123] */
}; };
static const ProtobufCFieldDescriptor static const ProtobufCFieldDescriptor
paddle_mobile__framework__proto__program_desc__field_descriptors[1] = { paddle_mobile__framework__proto__program_desc__field_descriptors[2] = {
{ {
"blocks", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE, "blocks", 1, PROTOBUF_C_LABEL_REPEATED, PROTOBUF_C_TYPE_MESSAGE,
offsetof(PaddleMobile__Framework__Proto__ProgramDesc, n_blocks), offsetof(PaddleMobile__Framework__Proto__ProgramDesc, n_blocks),
...@@ -1334,14 +1411,23 @@ static const ProtobufCFieldDescriptor ...@@ -1334,14 +1411,23 @@ static const ProtobufCFieldDescriptor
0, /* flags */ 0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */ 0, NULL, NULL /* reserved1,reserved2, etc */
}, },
{
"version", 2, PROTOBUF_C_LABEL_OPTIONAL, PROTOBUF_C_TYPE_MESSAGE,
0, /* quantifier_offset */
offsetof(PaddleMobile__Framework__Proto__ProgramDesc, version),
&paddle_mobile__framework__proto__version__descriptor, NULL,
0, /* flags */
0, NULL, NULL /* reserved1,reserved2, etc */
},
}; };
static const unsigned static const unsigned
paddle_mobile__framework__proto__program_desc__field_indices_by_name[] = { paddle_mobile__framework__proto__program_desc__field_indices_by_name[] = {
0, /* field[0] = blocks */ 0, /* field[0] = blocks */
1, /* field[1] = version */
}; };
static const ProtobufCIntRange static const ProtobufCIntRange
paddle_mobile__framework__proto__program_desc__number_ranges[1 + 1] = { paddle_mobile__framework__proto__program_desc__number_ranges[1 + 1] = {
{1, 0}, {0, 1}}; {1, 0}, {0, 2}};
const ProtobufCMessageDescriptor const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__program_desc__descriptor = { paddle_mobile__framework__proto__program_desc__descriptor = {
PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC, PROTOBUF_C__MESSAGE_DESCRIPTOR_MAGIC,
...@@ -1350,7 +1436,7 @@ const ProtobufCMessageDescriptor ...@@ -1350,7 +1436,7 @@ const ProtobufCMessageDescriptor
"PaddleMobile__Framework__Proto__ProgramDesc", "PaddleMobile__Framework__Proto__ProgramDesc",
"paddle_mobile.framework.proto", "paddle_mobile.framework.proto",
sizeof(PaddleMobile__Framework__Proto__ProgramDesc), sizeof(PaddleMobile__Framework__Proto__ProgramDesc),
1, 2,
paddle_mobile__framework__proto__program_desc__field_descriptors, paddle_mobile__framework__proto__program_desc__field_descriptors,
paddle_mobile__framework__proto__program_desc__field_indices_by_name, paddle_mobile__framework__proto__program_desc__field_indices_by_name,
1, 1,
...@@ -1362,7 +1448,7 @@ const ProtobufCMessageDescriptor ...@@ -1362,7 +1448,7 @@ const ProtobufCMessageDescriptor
NULL /* reserved[123] */ NULL /* reserved[123] */
}; };
static const ProtobufCEnumValue static const ProtobufCEnumValue
paddle_mobile__framework__proto__attr_type__enum_values_by_number[10] = { paddle_mobile__framework__proto__attr_type__enum_values_by_number[11] = {
{"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0}, {"INT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT", 0},
{"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1}, {"FLOAT", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__FLOAT", 1},
{"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2}, {"STRING", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__STRING", 2},
...@@ -1373,15 +1459,16 @@ static const ProtobufCEnumValue ...@@ -1373,15 +1459,16 @@ static const ProtobufCEnumValue
{"BOOLEANS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS", 7}, {"BOOLEANS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS", 7},
{"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8}, {"BLOCK", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK", 8},
{"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9}, {"LONG", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG", 9},
{"BLOCKS", "PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS", 10},
}; };
static const ProtobufCIntRange static const ProtobufCIntRange
paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0}, paddle_mobile__framework__proto__attr_type__value_ranges[] = {{0, 0},
{0, 10}}; {0, 11}};
static const ProtobufCEnumValueIndex static const ProtobufCEnumValueIndex
paddle_mobile__framework__proto__attr_type__enum_values_by_name[10] = { paddle_mobile__framework__proto__attr_type__enum_values_by_name[11] = {
{"BLOCK", 8}, {"BOOLEAN", 6}, {"BOOLEANS", 7}, {"FLOAT", 1}, {"BLOCK", 8}, {"BLOCKS", 10}, {"BOOLEAN", 6}, {"BOOLEANS", 7},
{"FLOATS", 4}, {"INT", 0}, {"INTS", 3}, {"LONG", 9}, {"FLOAT", 1}, {"FLOATS", 4}, {"INT", 0}, {"INTS", 3},
{"STRING", 2}, {"STRINGS", 5}, {"LONG", 9}, {"STRING", 2}, {"STRINGS", 5},
}; };
const ProtobufCEnumDescriptor const ProtobufCEnumDescriptor
paddle_mobile__framework__proto__attr_type__descriptor = { paddle_mobile__framework__proto__attr_type__descriptor = {
...@@ -1390,9 +1477,9 @@ const ProtobufCEnumDescriptor ...@@ -1390,9 +1477,9 @@ const ProtobufCEnumDescriptor
"AttrType", "AttrType",
"PaddleMobile__Framework__Proto__AttrType", "PaddleMobile__Framework__Proto__AttrType",
"paddle_mobile.framework.proto", "paddle_mobile.framework.proto",
10, 11,
paddle_mobile__framework__proto__attr_type__enum_values_by_number, paddle_mobile__framework__proto__attr_type__enum_values_by_number,
10, 11,
paddle_mobile__framework__proto__attr_type__enum_values_by_name, paddle_mobile__framework__proto__attr_type__enum_values_by_name,
1, 1,
paddle_mobile__framework__proto__attr_type__value_ranges, paddle_mobile__framework__proto__attr_type__value_ranges,
......
...@@ -4,16 +4,18 @@ ...@@ -4,16 +4,18 @@
#ifndef PROTOBUF_C_framework_2eproto__INCLUDED #ifndef PROTOBUF_C_framework_2eproto__INCLUDED
#define PROTOBUF_C_framework_2eproto__INCLUDED #define PROTOBUF_C_framework_2eproto__INCLUDED
#include "common/protobuf-c.h" #include <protobuf-c/protobuf-c.h>
PROTOBUF_C__BEGIN_DECLS PROTOBUF_C__BEGIN_DECLS
#if PROTOBUF_C_VERSION_NUMBER < 1000000 #if PROTOBUF_C_VERSION_NUMBER < 1000000
# error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers. # error This file was generated by a newer version of protoc-c which is incompatible with your libprotobuf-c headers. Please update your headers.
#elif 1003000 < PROTOBUF_C_MIN_COMPILER_VERSION #elif 1003001 < PROTOBUF_C_MIN_COMPILER_VERSION
# error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c. # error This file was generated by an older version of protoc-c which is incompatible with your libprotobuf-c headers. Please regenerate this file with a newer version of protoc-c.
#endif #endif
/* Forward declaration for the schema-version message (Version). */
typedef struct _PaddleMobile__Framework__Proto__Version
    PaddleMobile__Framework__Proto__Version;
typedef struct _PaddleMobile__Framework__Proto__OpDesc typedef struct _PaddleMobile__Framework__Proto__OpDesc
PaddleMobile__Framework__Proto__OpDesc; PaddleMobile__Framework__Proto__OpDesc;
typedef struct _PaddleMobile__Framework__Proto__OpDesc__Attr typedef struct _PaddleMobile__Framework__Proto__OpDesc__Attr
...@@ -60,6 +62,12 @@ typedef enum _PaddleMobile__Framework__Proto__VarType__Type { ...@@ -60,6 +62,12 @@ typedef enum _PaddleMobile__Framework__Proto__VarType__Type {
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP16 = 4,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP32 = 5,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6, PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__FP64 = 6,
/*
* Tensor<size_t> is used in C++.
*/
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__SIZE_T = 19,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__UINT8 = 20,
PADDLE_MOBILE__FRAMEWORK__PROTO__VAR_TYPE__TYPE__INT8 = 21,
/* /*
* Other types that may need additional descriptions * Other types that may need additional descriptions
*/ */
...@@ -93,13 +101,32 @@ typedef enum _PaddleMobile__Framework__Proto__AttrType { ...@@ -93,13 +101,32 @@ typedef enum _PaddleMobile__Framework__Proto__AttrType {
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEAN = 6,
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BOOLEANS = 7,
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCK = 8,
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__LONG = 9,
9 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE( PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__BLOCKS =
10 PROTOBUF_C__FORCE_ENUM_TO_BE_INT_SIZE(
PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE) PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE)
} PaddleMobile__Framework__Proto__AttrType; } PaddleMobile__Framework__Proto__AttrType;
/* --- messages --- */ /* --- messages --- */
/*
 * Any incompatible changes to ProgramDesc and its dependencies should
 * raise the version defined in version.h.
 * Serialization and deserialization code should be modified in a way
 * that supports old versions following the version and compatibility policy.
 */
struct _PaddleMobile__Framework__Proto__Version {
  ProtobufCMessage base;          /* protobuf-c bookkeeping; must be first */
  protobuf_c_boolean has_version; /* proto2 presence flag for `version` */
  int64_t version;                /* schema version; defaults to 0 */
};
/* Static initializer matching the member order of the struct above. */
#define PADDLE_MOBILE__FRAMEWORK__PROTO__VERSION__INIT            \
  {                                                               \
    PROTOBUF_C_MESSAGE_INIT(                                      \
        &paddle_mobile__framework__proto__version__descriptor)    \
    , 0, 0ll                                                      \
  }
struct _PaddleMobile__Framework__Proto__OpDesc__Attr { struct _PaddleMobile__Framework__Proto__OpDesc__Attr {
ProtobufCMessage base; ProtobufCMessage base;
char *name; char *name;
...@@ -123,13 +150,15 @@ struct _PaddleMobile__Framework__Proto__OpDesc__Attr { ...@@ -123,13 +150,15 @@ struct _PaddleMobile__Framework__Proto__OpDesc__Attr {
int32_t block_idx; int32_t block_idx;
protobuf_c_boolean has_l; protobuf_c_boolean has_l;
int64_t l; int64_t l;
size_t n_blocks_idx;
int32_t *blocks_idx;
}; };
#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT \ #define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_DESC__ATTR__INIT \
{ \ { \
PROTOBUF_C_MESSAGE_INIT( \ PROTOBUF_C_MESSAGE_INIT( \
&paddle_mobile__framework__proto__op_desc__attr__descriptor) \ &paddle_mobile__framework__proto__op_desc__attr__descriptor) \
, NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \ , NULL, PADDLE_MOBILE__FRAMEWORK__PROTO__ATTR_TYPE__INT, 0, 0, 0, 0, NULL, \
0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0 \ 0, NULL, 0, NULL, 0, NULL, 0, 0, 0, NULL, 0, 0, 0, 0, 0, NULL \
} }
struct _PaddleMobile__Framework__Proto__OpDesc__Var { struct _PaddleMobile__Framework__Proto__OpDesc__Var {
...@@ -181,12 +210,13 @@ struct _PaddleMobile__Framework__Proto__OpProto__Var { ...@@ -181,12 +210,13 @@ struct _PaddleMobile__Framework__Proto__OpProto__Var {
protobuf_c_boolean intermediate; protobuf_c_boolean intermediate;
protobuf_c_boolean has_dispensable; protobuf_c_boolean has_dispensable;
protobuf_c_boolean dispensable; protobuf_c_boolean dispensable;
char *reuse;
}; };
#define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT \ #define PADDLE_MOBILE__FRAMEWORK__PROTO__OP_PROTO__VAR__INIT \
{ \ { \
PROTOBUF_C_MESSAGE_INIT( \ PROTOBUF_C_MESSAGE_INIT( \
&paddle_mobile__framework__proto__op_proto__var__descriptor) \ &paddle_mobile__framework__proto__op_proto__var__descriptor) \
, NULL, NULL, 0, 0, 0, 0, 0, 0 \ , NULL, NULL, 0, 0, 0, 0, 0, 0, NULL \
} }
/* /*
...@@ -375,14 +405,27 @@ struct _PaddleMobile__Framework__Proto__ProgramDesc { ...@@ -375,14 +405,27 @@ struct _PaddleMobile__Framework__Proto__ProgramDesc {
ProtobufCMessage base; ProtobufCMessage base;
size_t n_blocks; size_t n_blocks;
PaddleMobile__Framework__Proto__BlockDesc **blocks; PaddleMobile__Framework__Proto__BlockDesc **blocks;
PaddleMobile__Framework__Proto__Version *version;
}; };
#define PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT \ #define PADDLE_MOBILE__FRAMEWORK__PROTO__PROGRAM_DESC__INIT \
{ \ { \
PROTOBUF_C_MESSAGE_INIT( \ PROTOBUF_C_MESSAGE_INIT( \
&paddle_mobile__framework__proto__program_desc__descriptor) \ &paddle_mobile__framework__proto__program_desc__descriptor) \
, 0, NULL \ , 0, NULL, NULL \
} }
/* PaddleMobile__Framework__Proto__Version methods */
/* Reset *message to its default (unset) state. */
void paddle_mobile__framework__proto__version__init(
    PaddleMobile__Framework__Proto__Version *message);
/* Byte size of the message's wire-format serialization. */
size_t paddle_mobile__framework__proto__version__get_packed_size(
    const PaddleMobile__Framework__Proto__Version *message);
/* Allocate and deserialize a Version from `len` bytes; NULL on error. */
PaddleMobile__Framework__Proto__Version *
paddle_mobile__framework__proto__version__unpack(ProtobufCAllocator *allocator,
                                                 size_t len,
                                                 const uint8_t *data);
/* Free a message obtained from __unpack(), with the same allocator. */
void paddle_mobile__framework__proto__version__free_unpacked(
    PaddleMobile__Framework__Proto__Version *message,
    ProtobufCAllocator *allocator);
/* PaddleMobile__Framework__Proto__OpDesc__Attr methods */ /* PaddleMobile__Framework__Proto__OpDesc__Attr methods */
void paddle_mobile__framework__proto__op_desc__attr__init( void paddle_mobile__framework__proto__op_desc__attr__init(
PaddleMobile__Framework__Proto__OpDesc__Attr *message); PaddleMobile__Framework__Proto__OpDesc__Attr *message);
...@@ -392,10 +435,8 @@ void paddle_mobile__framework__proto__op_desc__var__init( ...@@ -392,10 +435,8 @@ void paddle_mobile__framework__proto__op_desc__var__init(
/* PaddleMobile__Framework__Proto__OpDesc methods */ /* PaddleMobile__Framework__Proto__OpDesc methods */
void paddle_mobile__framework__proto__op_desc__init( void paddle_mobile__framework__proto__op_desc__init(
PaddleMobile__Framework__Proto__OpDesc *message); PaddleMobile__Framework__Proto__OpDesc *message);
size_t paddle_mobile__framework__proto__op_desc__get_packed_size( size_t paddle_mobile__framework__proto__op_desc__get_packed_size(
const PaddleMobile__Framework__Proto__OpDesc *message); const PaddleMobile__Framework__Proto__OpDesc *message);
PaddleMobile__Framework__Proto__OpDesc * PaddleMobile__Framework__Proto__OpDesc *
paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator, paddle_mobile__framework__proto__op_desc__unpack(ProtobufCAllocator *allocator,
size_t len, size_t len,
...@@ -487,6 +528,8 @@ void paddle_mobile__framework__proto__program_desc__free_unpacked( ...@@ -487,6 +528,8 @@ void paddle_mobile__framework__proto__program_desc__free_unpacked(
ProtobufCAllocator *allocator); ProtobufCAllocator *allocator);
/* --- per-message closures --- */ /* --- per-message closures --- */
/* Per-message closure type for Version (generated protobuf-c boilerplate). */
typedef void (*PaddleMobile__Framework__Proto__Version_Closure)(
    const PaddleMobile__Framework__Proto__Version *message, void *closure_data);
typedef void (*PaddleMobile__Framework__Proto__OpDesc__Attr_Closure)( typedef void (*PaddleMobile__Framework__Proto__OpDesc__Attr_Closure)(
const PaddleMobile__Framework__Proto__OpDesc__Attr *message, const PaddleMobile__Framework__Proto__OpDesc__Attr *message,
void *closure_data); void *closure_data);
...@@ -539,6 +582,8 @@ typedef void (*PaddleMobile__Framework__Proto__ProgramDesc_Closure)( ...@@ -539,6 +582,8 @@ typedef void (*PaddleMobile__Framework__Proto__ProgramDesc_Closure)(
extern const ProtobufCEnumDescriptor extern const ProtobufCEnumDescriptor
paddle_mobile__framework__proto__attr_type__descriptor; paddle_mobile__framework__proto__attr_type__descriptor;
/* Descriptor for Version; the definition lives in framework.pb-c.c. */
extern const ProtobufCMessageDescriptor
    paddle_mobile__framework__proto__version__descriptor;
extern const ProtobufCMessageDescriptor extern const ProtobufCMessageDescriptor
paddle_mobile__framework__proto__op_desc__descriptor; paddle_mobile__framework__proto__op_desc__descriptor;
extern const ProtobufCMessageDescriptor extern const ProtobufCMessageDescriptor
......
...@@ -16,6 +16,13 @@ syntax = "proto2"; ...@@ -16,6 +16,13 @@ syntax = "proto2";
option optimize_for = LITE_RUNTIME; option optimize_for = LITE_RUNTIME;
package paddle_mobile.framework.proto; package paddle_mobile.framework.proto;
// Any incompatible changes to ProgramDesc and its dependencies should
// raise the version defined in version.h.
//
// Serialization and deserialization code should be modified in a way
// that supports old versions following the version and compatibility policy.
message Version { optional int64 version = 1 [ default = 0 ]; }
enum AttrType { enum AttrType {
INT = 0; INT = 0;
FLOAT = 1; FLOAT = 1;
...@@ -27,6 +34,7 @@ enum AttrType { ...@@ -27,6 +34,7 @@ enum AttrType {
BOOLEANS = 7; BOOLEANS = 7;
BLOCK = 8; BLOCK = 8;
LONG = 9; LONG = 9;
BLOCKS = 10;
} }
// OpDesc describes an instance of a C++ framework::OperatorBase // OpDesc describes an instance of a C++ framework::OperatorBase
...@@ -46,6 +54,7 @@ message OpDesc { ...@@ -46,6 +54,7 @@ message OpDesc {
repeated bool bools = 11; repeated bool bools = 11;
optional int32 block_idx = 12; optional int32 block_idx = 12;
optional int64 l = 13; optional int64 l = 13;
repeated int32 blocks_idx = 14;
}; };
message Var { message Var {
...@@ -71,6 +80,7 @@ message OpProto { ...@@ -71,6 +80,7 @@ message OpProto {
optional bool duplicable = 3 [ default = false ]; optional bool duplicable = 3 [ default = false ];
optional bool intermediate = 4 [ default = false ]; optional bool intermediate = 4 [ default = false ];
optional bool dispensable = 5 [ default = false ]; optional bool dispensable = 5 [ default = false ];
optional string reuse = 6;
} }
// AttrProto describes the C++ type Attribute. // AttrProto describes the C++ type Attribute.
...@@ -101,6 +111,10 @@ message VarType { ...@@ -101,6 +111,10 @@ message VarType {
FP16 = 4; FP16 = 4;
FP32 = 5; FP32 = 5;
FP64 = 6; FP64 = 6;
// Tensor<size_t> is used in C++.
SIZE_T = 19;
UINT8 = 20;
INT8 = 21;
// Other types that may need additional descriptions // Other types that may need additional descriptions
LOD_TENSOR = 7; LOD_TENSOR = 7;
...@@ -173,4 +187,8 @@ message BlockDesc { ...@@ -173,4 +187,8 @@ message BlockDesc {
// for more details. // for more details.
// TODO(panyx0718): A model can have multiple programs. Need a // TODO(panyx0718): A model can have multiple programs. Need a
// way to distinguish them. Maybe ID or name? // way to distinguish them. Maybe ID or name?
message ProgramDesc { repeated BlockDesc blocks = 1; } message ProgramDesc {
repeated BlockDesc blocks = 1;
optional Version version = 2;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#ifdef PADDLE_MOBILE_CPU
// Force-links the CPU kernel registration for `op_type`.
// Declares the Touch function emitted by the op's registration macro and
// calls it from a static initializer, so the linker cannot discard the
// op's translation unit as unreferenced.
#define LOAD_CPU_OP(op_type) \
extern int TouchOpRegistrar_##op_type##_##cpu(); \
static int use_op_itself_##op_type##_##cpu __attribute__((unused)) = \
TouchOpRegistrar_##op_type##_##cpu()
#else
// CPU support not compiled in: macro expands to nothing.
#define LOAD_CPU_OP(op_type)
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
// Force-links the Mali GPU kernel registration for `op_type`; same
// Touch-from-static-initializer pattern as LOAD_CPU_OP.
#define LOAD_MALI_GPU_OP(op_type) \
extern int TouchOpRegistrar_##op_type##_##mali_gpu(); \
static int use_op_itself_##op_type##_##mali_gpu __attribute__((unused)) = \
TouchOpRegistrar_##op_type##_##mali_gpu()
#else
// Mali GPU support not compiled in: macro expands to nothing.
#define LOAD_MALI_GPU_OP(op_type)
#endif
#ifdef PADDLE_MOBILE_FPGA
// Force-links the FPGA kernel registration for `op_type`; same
// Touch-from-static-initializer pattern as LOAD_CPU_OP.
// BUGFIX: the initializer previously called
// TouchOpRegistrar_##op_type##_##mali_gpu() (copy-paste error), which under
// PADDLE_MOBILE_FPGA either fails to link or touches the wrong registrar,
// leaving FPGA ops unregistered. It must call the fpga registrar it declares.
#define LOAD_FPGA_OP(op_type) \
extern int TouchOpRegistrar_##op_type##_##fpga(); \
static int use_op_itself_##op_type##_##fpga __attribute__((unused)) = \
TouchOpRegistrar_##op_type##_##fpga()
#else
// FPGA support not compiled in: macro expands to nothing.
#define LOAD_FPGA_OP(op_type)
#endif
// Force-links the fusion matcher registration for `op_type` (fusion matchers
// are registered independently of device kernels).
#define LOAD_FUSION_MATCHER(op_type) \
extern int TouchFusionMatcherRegistrar_##op_type(); \
static int use_fusion_matcher_itself_##op_type __attribute__((unused)) = \
TouchFusionMatcherRegistrar_##op_type();
// Load an op for every supported device; each per-device macro is a no-op
// when that device was not compiled in.
#define LOAD_OP(op_type) \
LOAD_CPU_OP(op_type); \
LOAD_MALI_GPU_OP(op_type); \
LOAD_FPGA_OP(op_type);
// Load an op for exactly the listed device(s). `device_type` must be one of
// CPU / MALI_GPU / FPGA so that LOAD_##device_type##_OP resolves.
#define LOAD_OP1(op_type, device_type) LOAD_##device_type##_OP(op_type);
#define LOAD_OP2(op_type, device_type1, device_type2) \
LOAD_OP1(op_type, device_type1) \
LOAD_OP1(op_type, device_type2)
#define LOAD_OP3(op_type, device_type1, device_type2, device_type3) \
LOAD_OP2(op_type, device_type1, device_type2) \
LOAD_OP1(op_type, device_type3)
// load required ops: feed/fetch are always registered for every build.
LOAD_OP(feed)
LOAD_OP(fetch)
// Optional ops below are each guarded by the build flag that compiled the
// corresponding operator in; the device list matches the devices the op
// was registered for.
#ifdef BATCHNORM_OP
LOAD_OP2(batch_norm, CPU, MALI_GPU);
#endif
#ifdef BILINEAR_INTERP_OP
LOAD_OP1(bilinear_interp, CPU);
#endif
#ifdef BOXCODER_OP
LOAD_OP1(box_coder, CPU);
#endif
#ifdef CONCAT_OP
LOAD_OP3(concat, CPU, MALI_GPU, FPGA);
#endif
#ifdef CONV_OP
LOAD_OP3(conv2d, CPU, MALI_GPU, FPGA);
#endif
#ifdef LRN_OP
LOAD_OP2(lrn, CPU, MALI_GPU);
#endif
#ifdef SIGMOID_OP
LOAD_OP1(sigmoid, CPU);
#endif
// Fused ops additionally register their fusion matcher so the graph
// optimizer can fold the matching op sequence into the fused op.
#ifdef FUSION_FC_RELU_OP
LOAD_OP3(fusion_fc_relu, CPU, MALI_GPU, FPGA);
LOAD_FUSION_MATCHER(fusion_fc_relu);
#endif
#ifdef FUSION_ELEMENTWISEADDRELU_OP
LOAD_OP3(fusion_elementwise_add_relu, CPU, MALI_GPU, FPGA);
LOAD_FUSION_MATCHER(fusion_elementwise_add_relu);
#endif
#ifdef SPLIT_OP
LOAD_OP1(split, CPU);
#endif
#ifdef RESIZE_OP
LOAD_OP2(resize, CPU, MALI_GPU);
#endif
#ifdef FUSION_CONVADDBNRELU_OP
LOAD_OP2(fusion_conv_add_bn_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_bn_relu);
#endif
#ifdef RESHAPE_OP
LOAD_OP2(reshape, CPU, MALI_GPU);
#endif
#ifdef TRANSPOSE_OP
LOAD_OP1(transpose, CPU);
#endif
#ifdef PRIORBOX_OP
LOAD_OP1(prior_box, CPU);
#endif
#ifdef FUSION_CONVADDRELU_OP
LOAD_OP2(fusion_conv_add_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_relu);
#endif
#ifdef FUSION_CONVADDADDPRELU_OP
LOAD_OP2(fusion_conv_add_add_prelu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_add_prelu);
#endif
#ifdef FUSION_CONVADD_OP
LOAD_OP2(fusion_conv_add, CPU, MALI_GPU);
LOAD_FUSION_MATCHER(fusion_conv_add);
#endif
#ifdef SOFTMAX_OP
LOAD_OP2(softmax, CPU, MALI_GPU);
#endif
#ifdef SHAPE_OP
LOAD_OP1(shape, CPU);
#endif
#ifdef DEPTHWISECONV_OP
LOAD_OP1(depthwise_conv2d, CPU);
#endif
#ifdef CONV_TRANSPOSE_OP
LOAD_OP1(conv2d_transpose, CPU);
#endif
#ifdef SCALE_OP
LOAD_OP2(scale, CPU, MALI_GPU);
#endif
#ifdef ELEMENTWISEADD_OP
LOAD_OP2(elementwise_add, CPU, MALI_GPU);
#endif
#ifdef PRELU_OP
LOAD_OP2(prelu, CPU, MALI_GPU);
#endif
#ifdef FLATTEN_OP
LOAD_OP1(flatten, CPU);
#endif
#ifdef FUSION_CONVBNADDRELU_OP
LOAD_OP2(fusion_conv_bn_add_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn_add_relu);
#endif
#ifdef FUSION_CONVBNRELU_OP
LOAD_OP2(fusion_conv_bn_relu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn_relu);
#endif
#ifdef GRU_OP
LOAD_OP1(gru, CPU);
#endif
#ifdef FUSION_CONVADDBN_OP
LOAD_OP2(fusion_conv_add_bn, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_bn);
#endif
#ifdef DROPOUT_OP
LOAD_OP2(dropout, CPU, FPGA);
#endif
#ifdef FUSION_CONVADDPRELU_OP
LOAD_OP2(fusion_conv_add_prelu, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_add_prelu);
#endif
#ifdef FUSION_DWCONVBNRELU_OP
LOAD_OP1(fusion_dwconv_bn_relu, CPU);
LOAD_FUSION_MATCHER(fusion_dwconv_bn_relu);
#endif
#ifdef CRF_OP
LOAD_OP1(crf_decoding, CPU);
#endif
#ifdef MUL_OP
LOAD_OP2(mul, CPU, MALI_GPU);
#endif
#ifdef RELU_OP
LOAD_OP2(relu, CPU, MALI_GPU);
#endif
#ifdef IM2SEQUENCE_OP
LOAD_OP1(im2sequence, CPU);
#endif
#ifdef LOOKUP_OP
LOAD_OP1(lookup_table, CPU);
#endif
#ifdef FUSION_FC_OP
LOAD_OP3(fusion_fc, CPU, MALI_GPU, FPGA);
LOAD_FUSION_MATCHER(fusion_fc);
#endif
#ifdef POOL_OP
LOAD_OP3(pool2d, CPU, MALI_GPU, FPGA);
#endif
#ifdef MULTICLASSNMS_OP
LOAD_OP1(multiclass_nms, CPU);
#endif
#ifdef SLICE_OP
LOAD_OP2(slice, CPU, MALI_GPU);
#endif
#ifdef FUSION_CONVBN_OP
LOAD_OP2(fusion_conv_bn, CPU, FPGA);
LOAD_FUSION_MATCHER(fusion_conv_bn);
#endif
// quantize/dequantize are loaded unconditionally (no build-flag guard),
// CPU only.
LOAD_OP1(quantize, CPU);
LOAD_OP1(dequantize, CPU);
...@@ -97,6 +97,7 @@ class OpRegistry { ...@@ -97,6 +97,7 @@ class OpRegistry {
}; };
#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \ #define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \
template class op_class<device_type, float>; \
template <typename Dtype, typename T> \ template <typename Dtype, typename T> \
class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \ class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \
public: \ public: \
...@@ -119,16 +120,5 @@ class OpRegistry { ...@@ -119,16 +120,5 @@ class OpRegistry {
#define REGISTER_OPERATOR_FPGA(op_type, op_class) \ #define REGISTER_OPERATOR_FPGA(op_type, op_class) \
REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA); REGISTER_OPERATOR(op_type, op_class, fpga, paddle_mobile::FPGA);
#define USE_OP(op_type, device_name) \
extern int TouchOpRegistrar_##op_type##_##device_name(); \
static int use_op_itself_##op_type##_##device_name __attribute__((unused)) = \
TouchOpRegistrar_##op_type##_##device_name()
#define USE_OP_CPU(op_type) USE_OP(op_type, cpu);
#define USE_OP_MALI_GPU(op_type) USE_OP(op_type, mali_gpu);
#define USE_OP_FPGA(op_type) USE_OP(op_type, fpga);
} // namespace framework } // namespace framework
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -67,7 +67,16 @@ class FusionOpRegistrar { ...@@ -67,7 +67,16 @@ class FusionOpRegistrar {
explicit FusionOpRegistrar(FusionOpMatcher* matcher) { explicit FusionOpRegistrar(FusionOpMatcher* matcher) {
FusionOpRegister::Instance()->regist(matcher); FusionOpRegister::Instance()->regist(matcher);
} }
void Touch() {}
}; };
} // namespace framework } // namespace framework
} // namespace paddle_mobile } // namespace paddle_mobile
#define REGISTER_FUSION_MATCHER(op_type, matcher) \
static paddle_mobile::framework::FusionOpRegistrar \
__fusion_matcher_registrar_##op_type(new matcher()); \
int TouchFusionMatcherRegistrar_##op_type() { \
__fusion_matcher_registrar_##op_type.Touch(); \
return 0; \
}
...@@ -33,8 +33,6 @@ class Program { ...@@ -33,8 +33,6 @@ class Program {
bool quantification = false; bool quantification = false;
size_t combined_params_len; size_t combined_params_len;
const uint8_t *combined_params_buf; const uint8_t *combined_params_buf;
private:
}; };
} // namespace framework } // namespace framework
......
...@@ -40,7 +40,10 @@ enum VarType_Type { ...@@ -40,7 +40,10 @@ enum VarType_Type {
VARTYPE_TYPE_READER = 15, VARTYPE_TYPE_READER = 15,
VARTYPE_TYPE_CHANNEL = 16, VARTYPE_TYPE_CHANNEL = 16,
VARTYPE_TYPE_RAW = 17, VARTYPE_TYPE_RAW = 17,
VARTYPE_TYPE_TUPLE = 18 VARTYPE_TYPE_TUPLE = 18,
VARTYPE_TYPE_SIZE_T = 19,
VARTYPE_TYPE_UINT8 = 20,
VARTYPE_TYPE_INT8 = 21,
}; };
class TensorDesc { class TensorDesc {
...@@ -58,8 +61,9 @@ class TensorDesc { ...@@ -58,8 +61,9 @@ class TensorDesc {
} }
data_type_ = (VarType_Type)desc->data_type; data_type_ = (VarType_Type)desc->data_type;
} }
// return tensor dim as a vector
std::vector<int64_t> Dims() const { return dims_; }; std::vector<int64_t> Dims() const { return dims_; };
// return tensor data type
VarType_Type DataType() const { return data_type_; } VarType_Type DataType() const { return data_type_; }
private: private:
......
...@@ -31,6 +31,7 @@ class VarDesc { ...@@ -31,6 +31,7 @@ class VarDesc {
this->tensor_desc_ = var_desc.tensor_desc_; this->tensor_desc_ = var_desc.tensor_desc_;
this->type_ = var_desc.type_; this->type_ = var_desc.type_;
} }
VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) { VarDesc(PaddleMobile__Framework__Proto__VarDesc *desc) {
type_ = (VarType_Type)desc->type->type; type_ = (VarType_Type)desc->type->type;
name_ = std::string(desc->name); name_ = std::string(desc->name);
...@@ -44,9 +45,7 @@ class VarDesc { ...@@ -44,9 +45,7 @@ class VarDesc {
tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor); tensor_desc_ = TensorDesc(desc->type->lod_tensor->tensor);
break; break;
case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY: case VARTYPE_TYPE_STEP_LOD_TENSOR_ARRAY:
desc->type->tensor_array->tensor->data_type;
tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor); tensor_desc_ = TensorDesc(desc->type->tensor_array->tensor);
break; break;
default: default:
break; break;
...@@ -60,6 +59,7 @@ class VarDesc { ...@@ -60,6 +59,7 @@ class VarDesc {
break; break;
} }
} }
std::string Name() const { return name_; } std::string Name() const { return name_; }
VarType_Type Type() const { return type_; } VarType_Type Type() const { return type_; }
......
...@@ -319,10 +319,11 @@ class Tensor { ...@@ -319,10 +319,11 @@ class Tensor {
* begins. * begins.
*/ */
size_t offset_; size_t offset_;
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
public: public: // NOLINT
inline void reset_data_ptr(void *p) { inline void reset_data_ptr(void *p) {
((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT
} }
float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX
#endif #endif
...@@ -335,11 +336,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) { ...@@ -335,11 +336,12 @@ inline Print &operator<<(Print &printer, const Tensor &tensor) {
stride = stride > 0 ? stride : 1; stride = stride > 0 ? stride : 1;
#ifndef PADDLE_MOBILE_FPGA #ifndef PADDLE_MOBILE_FPGA
for (int i = 0; i < tensor.numel(); i += stride) { for (int i = 0; i < tensor.numel(); i += stride) {
// 这不一定是float的
if (tensor.type() == typeid(float)) { if (tensor.type() == typeid(float)) {
printer << tensor.data<float>()[i] << " "; printer << tensor.data<float>()[i] << " ";
} else if (tensor.type() == typeid(int64_t)) { } else if (tensor.type() == typeid(int64_t)) {
printer << tensor.data<int64_t>()[i] << " "; printer << tensor.data<int64_t>()[i] << " ";
} else if (tensor.type() == typeid(int8_t)) {
printer << tensor.data<int8_t>()[i] << " ";
} }
} }
#endif #endif
......
...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and ...@@ -13,8 +13,8 @@ See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "io/executor.h" #include "io/executor.h"
#include <operators/math/gemm.h>
#include <algorithm> #include <algorithm>
#include <utility>
#include <vector> #include <vector>
#include "common/enforce.h" #include "common/enforce.h"
#include "common/log.h" #include "common/log.h"
...@@ -26,74 +26,45 @@ limitations under the License. */ ...@@ -26,74 +26,45 @@ limitations under the License. */
#include "framework/program/var_desc.h" #include "framework/program/var_desc.h"
#include "framework/scope.h" #include "framework/scope.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD #include "operators/math/gemm.h"
#include <queue>
#include <utility>
#include "common/threadpool.h"
#endif
namespace paddle_mobile { namespace paddle_mobile {
using framework::Variable;
char *Get_binary_data(std::string filename) { using framework::Variable;
FILE *file = fopen(filename.c_str(), "rb");
PADDLE_MOBILE_ENFORCE(file != nullptr, "can't open file: %s ",
filename.c_str());
fseek(file, 0, SEEK_END);
int64_t size = ftell(file);
PADDLE_MOBILE_ENFORCE(size > 0, "size is too small");
rewind(file);
char *data = new char[size];
size_t bytes_read = fread(data, 1, size, file);
PADDLE_MOBILE_ENFORCE(bytes_read == size,
"read binary file bytes do not match with fseek");
fclose(file);
return data;
}
#pragma mark - executor
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size, Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
bool use_optimize, bool loddable) const bool use_optimize, const bool loddable)
: program_(p), : program_(p),
batch_size_(batch_size), batch_size_(batch_size),
use_optimize_(use_optimize), use_optimize_(use_optimize),
loddable_(loddable) { loddable_(loddable) {
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
Variable *variable_ptr = program_.scope->Var("batch_size"); Variable *variable_ptr = program_.scope->Var("batch_size");
variable_ptr[0].SetValue<int>(batch_size); variable_ptr->SetValue<int>(batch_size);
to_predict_program_ =
use_optimize_ ? program_.optimizeProgram : program_.originProgram;
PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr, PADDLE_MOBILE_ENFORCE(to_predict_program_ != nullptr,
"to_predict_program_ == NULL!"); "to_predict_program_ == NULL!");
const std::vector<std::shared_ptr<framework::BlockDesc>> blocks = const std::vector<std::shared_ptr<framework::BlockDesc>> &blocks =
to_predict_program_->Blocks(); to_predict_program_->Blocks();
#ifdef PADDLE_EXECUTOR_MULTITHREAD
depManager.resize(blocks.size()); DLOG << "executor in loaddable mode: " << loddable_;
#endif
DLOG << "executer in loaddable mode: " << loddable_;
for (int i = 0; i < blocks.size(); ++i) { for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<framework::BlockDesc> block_desc = blocks[i]; std::shared_ptr<framework::BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops(); std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
for (int j = 0; j < ops.size(); ++j) { for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<framework::OpDesc> op = ops[j]; std::shared_ptr<framework::OpDesc> op = ops[j];
DLOG << "create op: " << j << " " << op->Type(); DLOG << "create op: " << op->Type();
auto op_base = framework::OpRegistry<Dtype>::CreateOp( auto op_base = framework::OpRegistry<Dtype>::CreateOp(
op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(), op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
program_.scope); program_.scope);
// use pre_infershape to pre resize , but if u use an lod mode tensor u // infer shape to reshape tensor before predict,
// need to resize in runtime // but for lod tensor, it will need to reshape in runtime
if (!loddable_) { if (!loddable_) {
op_base->InferShape(); op_base->InferShape();
} }
ops_of_block_[*block_desc.get()].push_back(op_base); ops_of_block_[*block_desc.get()].push_back(op_base);
#ifdef PADDLE_EXECUTOR_MULTITHREAD
depManager[i].analysisDep(ops_of_block_[*block_desc.get()]);
#endif
} }
DLOG << "Total " << ops.size() << " ops have been created ";
} }
if (program_.combined) { if (program_.combined) {
InitCombineMemory(); InitCombineMemory();
...@@ -103,118 +74,81 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size, ...@@ -103,118 +74,81 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
std::shared_ptr<framework::BlockDesc> to_predict_block = std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0); to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()]; auto &ops = ops_of_block_[*to_predict_block.get()];
int i = 0;
for (const auto &op : ops) { for (const auto &op : ops) {
DLOG << "Init op: " << i++ << " " << op->Type();
op->Init(); op->Init();
} }
} }
template <typename Dtype, Precision P> template <typename Dtype>
void Executor<Dtype, P>::LoadMemory(const framework::VarDesc var_desc, void LoadMemInternal(void **data, framework::LoDTensor *tensor) {
framework::LoDTensor *tensor, char **data) { char **data_buf = reinterpret_cast<char **>(data);
// 1. version int64_t size = tensor->numel();
uint32_t version = *reinterpret_cast<uint32_t *>(*data); Dtype *tensor_data = tensor->mutable_data<Dtype>();
if (0) {
(*data) += sizeof(uint32_t); // TODO(hjchen2) should be moved into operator init function
float min_value;
// 2 Lod information float max_value;
uint64_t *lod_level_ptr = new uint64_t(); memcpy(&min_value, data_buf, sizeof(float));
memcpy(lod_level_ptr, (*data), sizeof(uint64_t)); memcpy(&max_value, data_buf + sizeof(float), sizeof(float));
uint64_t lod_level = *lod_level_ptr; data_buf += 2 * sizeof(float);
delete lod_level_ptr; const float factor = (max_value - min_value) / 255.0;
(*data) += sizeof(uint64_t); const uint8_t *uint8_data = reinterpret_cast<uint8_t *>(data_buf);
for (int k = 0; k < size; ++k) {
auto &lod = *tensor->mutable_lod(); tensor_data[k] = uint8_data[k] * factor + min_value;
lod.resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *reinterpret_cast<uint64_t *>(*data);
(*data) += sizeof(uint64_t);
std::vector<size_t> tmp(size / sizeof(size_t));
for (int k = 0; k < tmp.size(); ++k) {
tmp[k] = *reinterpret_cast<size_t *>(*data);
(*data) += sizeof(size_t);
}
for (auto j : tmp) {
LOG(kLOG_DEBUG1) << " lod - " << j;
} }
lod[i] = tmp; data_buf += size * sizeof(uint8_t);
} } else {
memcpy(tensor_data, *data_buf, size * sizeof(Dtype));
// 3. tensor version *data_buf += size * sizeof(Dtype);
uint32_t tensor_version = *reinterpret_cast<uint32_t *>(*data);
(*data) += sizeof(uint32_t);
// 4. tensor desc
int32_t size = *reinterpret_cast<int32_t *>(*data);
(*data) += sizeof(int32_t);
std::unique_ptr<char[]> buf(new char[size]);
for (int m = 0; m < size; ++m) {
buf.get()[m] = (*data)[m];
} }
(*data) += (sizeof(char) * size); }
const framework::TensorDesc &desc = var_desc.Tensor_desc(); template <typename Dtype, Precision P>
int memory_size = 1; void Executor<Dtype, P>::LoadMemory(
for (auto l : desc.Dims()) { void **data, const std::shared_ptr<framework::VarDesc> var_desc,
memory_size *= l; framework::LoDTensor *tensor) {
char **data_buf = reinterpret_cast<char **>(data);
// version
uint32_t version = *(reinterpret_cast<uint32_t *>(*data_buf));
*data_buf += sizeof(uint32_t);
// lod information
uint64_t lod_level = *(reinterpret_cast<uint64_t *>(*data_buf));
*data_buf += sizeof(uint64_t);
auto *lod = tensor->mutable_lod();
lod->resize(lod_level);
for (uint64_t i = 0; i < lod_level; ++i) {
uint64_t size = *(reinterpret_cast<uint64_t *>(*data_buf));
*data_buf += sizeof(uint64_t);
std::vector<size_t> tmp_dim(size / sizeof(size_t));
memcpy(tmp_dim.data(), *data_buf, size);
(*lod)[i] = std::move(tmp_dim);
*data_buf += size;
} }
// tensor version
tensor->Resize(framework::make_ddim(desc.Dims())); uint32_t tensor_version = *(reinterpret_cast<uint32_t *>(*data_buf));
*data_buf += sizeof(uint32_t);
void *memory = nullptr; // tensor desc size
int type_size = 0; int32_t tensor_desc_size = *(reinterpret_cast<int32_t *>(*data_buf));
switch (desc.DataType()) { *data_buf += sizeof(int32_t);
case framework::VARTYPE_TYPE_FP16: // skip tensor desc
type_size = 2; *data_buf += tensor_desc_size;
break;
const framework::TensorDesc &tensor_desc = var_desc->Tensor_desc();
tensor->Resize(framework::make_ddim(tensor_desc.Dims()));
// parse tensor from stream
switch (tensor_desc.DataType()) {
case framework::VARTYPE_TYPE_FP32: case framework::VARTYPE_TYPE_FP32:
type_size = 4; LoadMemInternal<float>(reinterpret_cast<void **>(data_buf), tensor);
memory = tensor->mutable_data<float>();
break; break;
case framework::VARTYPE_TYPE_FP64: case framework::VARTYPE_TYPE_INT8:
type_size = 8; LoadMemInternal<int8_t>(reinterpret_cast<void **>(data_buf), tensor);
break; break;
case framework::VARTYPE_TYPE_INT32: case framework::VARTYPE_TYPE_INT32:
memory = tensor->mutable_data<int32_t>(); LoadMemInternal<int>(reinterpret_cast<void **>(data_buf), tensor);
type_size = 4;
break;
case framework::VARTYPE_TYPE_INT64:
type_size = 8;
break;
case framework::VARTYPE_TYPE_BOOL:
type_size = 1;
break; break;
default: default:
break; LOG(kLOG_ERROR) << "data type is not supported";
}
if (program_.quantification) {
float min_value;
float max_value;
memcpy(&min_value, *data, sizeof(float));
memcpy(&max_value, *data + sizeof(float), sizeof(float));
*data += 2 * sizeof(float);
const float factor = (max_value - min_value) / 255.0;
uint8_t *uint8_data = reinterpret_cast<uint8_t *>(*data);
for (int k = 0; k < memory_size; ++k) {
static_cast<float *>(memory)[k] = uint8_data[k] * factor + min_value;
}
*data += (memory_size * sizeof(uint8_t));
} else {
for (int n = 0; n < memory_size; n++) {
float value;
memcpy(&value, *data + n * type_size, type_size);
if (value < 1e-30 && value > -1e-30) {
static_cast<float *>(memory)[n] = 0.0;
} else {
static_cast<float *>(memory)[n] = value;
}
}
(*data) += (sizeof(char) * memory_size * type_size);
} }
} }
...@@ -223,35 +157,19 @@ void Executor<Dtype, P>::InitMemory() { ...@@ -223,35 +157,19 @@ void Executor<Dtype, P>::InitMemory() {
for (const auto &block : to_predict_program_->Blocks()) { for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) { for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name()); auto var = program_.scope->Var(var_desc->Name());
auto tensor = var->template GetMutable<framework::LoDTensor>();
if (var_desc->Persistable()) { if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue; continue;
} }
char *origin_data = char *origin_data =
Get_binary_data(program_.model_path + "/" + var_desc->Name()); ReadFileToBuff(program_.model_path + "/" + var_desc->Name());
char *data = origin_data; char *data = origin_data;
LoadMemory(*var_desc, tensor, &data); LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
delete[] origin_data;
// DLOG << "----- " << var_desc->Name();
// DLOG << "----- " << tensor->dims();
// float *pDouble = tensor->template data<float>();
// for (int i = 0; i < tensor->numel() && i < 30; ++i) {
// std::cout << pDouble[i] << std::endl;
// }
delete origin_data;
} else { } else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
bool is_mute_match; varInputMemory(var_desc, var, tensor);
framework::LoDTensor *tensor = nullptr;
is_mute_match = varInputMemory(var_desc, var, tensor);
PADDLE_MOBILE_ENFORCE(
is_mute_match,
"got unhandled var_desc->Tensor_desc().DataType(): %d",
var_desc->Tensor_desc().DataType());
} }
} }
} }
...@@ -260,84 +178,65 @@ void Executor<Dtype, P>::InitMemory() { ...@@ -260,84 +178,65 @@ void Executor<Dtype, P>::InitMemory() {
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::InitCombineMemory() { void Executor<Dtype, P>::InitCombineMemory() {
char *origin_data; char *origin_data = nullptr;
bool self_alloc = false;
if (program_.combined_params_buf && program_.combined_params_len) { if (program_.combined_params_buf && program_.combined_params_len) {
LOG(kLOG_INFO) << "use outter memory"; origin_data = reinterpret_cast<char *>(
origin_data = (char *)program_.combined_params_buf; const_cast<uint8_t *>(program_.combined_params_buf));
} else { } else {
LOG(kLOG_INFO) << " begin init combine memory"; self_alloc = true;
origin_data = Get_binary_data(program_.para_path); origin_data = ReadFileToBuff(program_.para_path);
} }
PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "origin_data==nullptr!!!"); PADDLE_MOBILE_ENFORCE(origin_data != nullptr, "data == nullptr");
char *data = origin_data; char *data = origin_data;
for (const auto &block : to_predict_program_->Blocks()) { for (const auto &block : to_predict_program_->Blocks()) {
for (const auto &var_desc : block->Vars()) { for (const auto &var_desc : block->Vars()) {
auto var = program_.scope->Var(var_desc->Name()); auto var = program_.scope->Var(var_desc->Name());
auto tensor = var->template GetMutable<framework::LoDTensor>();
if (var_desc->Persistable()) { if (var_desc->Persistable()) {
auto tensor = var->template GetMutable<framework::LoDTensor>();
if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") { if (var_desc->Name() == "feed" || var_desc->Name() == "fetch") {
continue; continue;
} }
LoadMemory(*var_desc, tensor, &data); LoadMemory(reinterpret_cast<void **>(&data), var_desc, tensor);
} else { } else {
if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) { if (var_desc->Type() == framework::VARTYPE_TYPE_LOD_TENSOR) {
bool is_mute_match = false; varInputMemory(var_desc, var, tensor);
framework::LoDTensor *tensor;
is_mute_match = varInputMemory(var_desc, var, tensor);
PADDLE_MOBILE_ENFORCE(
is_mute_match,
"got unhandled var_desc->Tensor_desc().DataType(): %d",
var_desc->Tensor_desc().DataType());
} }
} }
} }
} }
delete origin_data; if (self_alloc) {
LOG(kLOG_INFO) << " end init combine memory "; delete[] origin_data;
}
LOG(kLOG_INFO) << "init combine memory finish";
} }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
bool Executor<Dtype, P>::varInputMemory( bool Executor<Dtype, P>::varInputMemory(
const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var, const std::shared_ptr<framework::VarDesc> &var_desc, Variable *var,
framework::LoDTensor *tensor) const { framework::LoDTensor *tensor) const {
bool is_mute_match = false; auto type = var_desc->Tensor_desc().DataType();
switch (var_desc->Tensor_desc().DataType()) { switch (type) {
case framework::VARTYPE_TYPE_FP16: { case framework::VARTYPE_TYPE_FP32:
tensor->mutable_data<float>();
break; break;
} case framework::VARTYPE_TYPE_INT8:
tensor->mutable_data<int8_t>();
case framework::VARTYPE_TYPE_FP32: {
tensor = var->template GetMutable<framework::LoDTensor>();
tensor->template mutable_data<Ptype>();
is_mute_match = true;
break; break;
} case framework::VARTYPE_TYPE_INT32:
tensor->mutable_data<int32_t>();
case framework::VARTYPE_TYPE_FP64: {
break;
}
case framework::VARTYPE_TYPE_INT32: {
tensor = var->template GetMutable<framework::LoDTensor>();
tensor->template mutable_data<int32_t>();
is_mute_match = true;
break; break;
} case framework::VARTYPE_TYPE_INT64:
tensor->mutable_data<int64_t>();
case framework::VARTYPE_TYPE_INT64: {
tensor = var->template GetMutable<framework::LoDTensor>();
tensor->template mutable_data<int64_t>();
is_mute_match = true;
break; break;
} default:
case framework::VARTYPE_TYPE_BOOL: {
break; break;
}
default: { break; }
} }
bool is_mute_match = (type == framework::VARTYPE_TYPE_FP32) ||
(type == framework::VARTYPE_TYPE_INT8) ||
(type == framework::VARTYPE_TYPE_INT32) ||
(type == framework::VARTYPE_TYPE_INT64);
PADDLE_MOBILE_ENFORCE(is_mute_match, "got unhandled data type : %d", type);
return is_mute_match; return is_mute_match;
} }
...@@ -356,61 +255,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict( ...@@ -356,61 +255,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops.size()); std::vector<ProfInfo> profile(ops.size());
#endif #endif
#ifdef PADDLE_EXECUTOR_MULTITHREAD
std::mutex m;
std::condition_variable cv;
std::queue<int> next;
next.push(0);
int rsize = ops.size();
std::vector<int> status(rsize, 0);
auto &threadPool = ThreadPool::getThreadPool();
auto &dep = depManager[0];
auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) {
std::lock_guard<std::mutex> lk(m);
rsize--;
status[opi] = 2;
for (int i : dep.getNext(opi)) {
bool ok = true;
for (int j : dep.getDeps(i)) {
if (status[j] != 2) {
ok = false;
break;
}
}
if (ok && (status[i] == 0)) {
next.push(i);
}
}
cv.notify_one();
};
for (;;) {
std::unique_lock<std::mutex> lk(m);
cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); });
if (rsize == 0) {
break;
}
while (next.size() > 0) {
int opi = next.front();
next.pop();
status[opi] = 1;
threadPool.enqueue([opi, &ops, &finishF, &profile] {
auto &op = ops[opi];
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
profile[opi].tid = ThreadPool::getThreadPoolThreadId();
#endif
ops[opi]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
finishF(opi);
});
}
}
#else
for (int i = 0; i < ops.size(); i++) { for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
struct timespec ts; struct timespec ts;
...@@ -424,7 +268,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict( ...@@ -424,7 +268,6 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif #endif
} }
#endif
auto last_op = ops.rbegin(); auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs(); auto output_map = (*last_op)->Outputs();
std::vector<std::string> out_keys = (*last_op)->GetOutKeys(); std::vector<std::string> out_keys = (*last_op)->GetOutKeys();
...@@ -433,34 +276,12 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict( ...@@ -433,34 +276,12 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::Predict(
framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map, framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
*(program_.scope)); *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
#ifdef PADDLE_EXECUTOR_MULTITHREAD
// TODO(haipeng): expose profile info as an interface, user can get them to
// analysis
// the performance of their deepnet.
FILE *df = fopen("net.dot", "w");
fprintf(df, "digraph {\n");
for (int i = 0; i < ops.size(); i++) {
for (int j : dep.getNext(i)) {
fprintf(df, "op_%d -> op_%d\n", i, j);
}
}
for (int i = 0; i < ops.size(); i++) {
fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i);
}
fprintf(df, "}\n");
fclose(df);
#endif
// FILE *pf = fopen("profile.out", "w");
std::unordered_map<std::string, uint64_t> _tp; std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) { for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i]; const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost; _tp[ops[i]->Type()] += timeCost;
// fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i,
// ops[i]->Type().c_str(),
// pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost);
} }
// fclose(pf);
printf("====================[ profile ]======================\n"); printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>; using prof_t = std::pair<std::string, uint64_t>;
std::vector<prof_t> _tv(_tp.begin(), _tp.end()); std::vector<prof_t> _tv(_tp.begin(), _tp.end());
...@@ -501,61 +322,6 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod( ...@@ -501,61 +322,6 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
std::vector<ProfInfo> profile(ops.size()); std::vector<ProfInfo> profile(ops.size());
#endif #endif
#ifdef PADDLE_EXECUTOR_MULTITHREAD
std::mutex m;
std::condition_variable cv;
std::queue<int> next;
next.push(0);
int rsize = ops.size();
std::vector<int> status(rsize, 0);
auto &threadPool = ThreadPool::getThreadPool();
auto &dep = depManager[0];
auto finishF = [&ops, &m, &cv, &next, &status, &rsize, &dep](int opi) {
std::lock_guard<std::mutex> lk(m);
rsize--;
status[opi] = 2;
for (int i : dep.getNext(opi)) {
bool ok = true;
for (int j : dep.getDeps(i)) {
if (status[j] != 2) {
ok = false;
break;
}
}
if (ok && (status[i] == 0)) {
next.push(i);
}
}
cv.notify_one();
};
for (;;) {
std::unique_lock<std::mutex> lk(m);
cv.wait(lk, [&next, &rsize] { return rsize == 0 || !next.empty(); });
if (rsize == 0) {
break;
}
while (next.size() > 0) {
int opi = next.front();
next.pop();
status[opi] = 1;
threadPool.enqueue([opi, &ops, &finishF, &profile] {
auto &op = ops[opi];
#ifdef PADDLE_MOBILE_PROFILE
struct timespec ts;
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
profile[opi].tid = ThreadPool::getThreadPoolThreadId();
#endif
ops[opi]->Run();
#ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts);
profile[opi].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif
finishF(opi);
});
}
}
#else
for (int i = 0; i < ops.size(); i++) { for (int i = 0; i < ops.size(); i++) {
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
struct timespec ts; struct timespec ts;
...@@ -565,14 +331,12 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod( ...@@ -565,14 +331,12 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
if (loddable_) { if (loddable_) {
ops[i]->InferShape(); ops[i]->InferShape();
} }
// to Run
ops[i]->Run(); ops[i]->Run();
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
clock_gettime(CLOCK_MONOTONIC, &ts); clock_gettime(CLOCK_MONOTONIC, &ts);
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif #endif
} }
#endif
auto last_op = ops.rbegin(); auto last_op = ops.rbegin();
auto output_map = (*last_op)->Outputs(); auto output_map = (*last_op)->Outputs();
...@@ -582,34 +346,12 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod( ...@@ -582,34 +346,12 @@ std::shared_ptr<framework::LoDTensor> Executor<Dtype, P>::PredictLod(
framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map, framework::GetVarValue<framework::LoDTensor>(out_keys[0], output_map,
*(program_.scope)); *(program_.scope));
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
#ifdef PADDLE_EXECUTOR_MULTITHREAD
// TODO(haipeng): expose profile info as an interface, user can get them to
// analysis
// the performance of their deepnet.
FILE *df = fopen("net.dot", "w");
fprintf(df, "digraph {\n");
for (int i = 0; i < ops.size(); i++) {
for (int j : dep.getNext(i)) {
fprintf(df, "op_%d -> op_%d\n", i, j);
}
}
for (int i = 0; i < ops.size(); i++) {
fprintf(df, "op_%d[label=\"%s (%d)\"]\n", i, ops[i]->Type().c_str(), i);
}
fprintf(df, "}\n");
fclose(df);
#endif
// FILE *pf = fopen("profile.out", "w");
std::unordered_map<std::string, uint64_t> _tp; std::unordered_map<std::string, uint64_t> _tp;
for (int i = 0; i < profile.size(); i++) { for (int i = 0; i < profile.size(); i++) {
const auto &pInfo = profile[i]; const auto &pInfo = profile[i];
uint64_t timeCost = pInfo.runEnd - pInfo.runBegin; uint64_t timeCost = pInfo.runEnd - pInfo.runBegin;
_tp[ops[i]->Type()] += timeCost; _tp[ops[i]->Type()] += timeCost;
// fprintf(pf, "%d\t%s\t%d\t%llu\t%llu\t%llu\n", i,
// ops[i]->Type().c_str(),
// pInfo.tid, pInfo.runBegin, pInfo.runEnd, timeCost);
} }
// fclose(pf);
printf("====================[ profile ]======================\n"); printf("====================[ profile ]======================\n");
using prof_t = std::pair<std::string, uint64_t>; using prof_t = std::pair<std::string, uint64_t>;
std::vector<prof_t> _tv(_tp.begin(), _tp.end()); std::vector<prof_t> _tv(_tp.begin(), _tp.end());
...@@ -654,7 +396,6 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict( ...@@ -654,7 +396,6 @@ std::vector<typename Executor<Dtype, P>::Ptype> Executor<Dtype, P>::Predict(
} }
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t, void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
string var_name) { string var_name) {
...@@ -663,12 +404,12 @@ void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t, ...@@ -663,12 +404,12 @@ void Executor<Dtype, P>::InjectVariable(const framework::Tensor &t,
g_feed_value->GetMutable<framework::LoDTensor>(); g_feed_value->GetMutable<framework::LoDTensor>();
feed_tensor->Resize(t.dims()); feed_tensor->Resize(t.dims());
feed_tensor->ShareDataWith(t); feed_tensor->ShareDataWith(t);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::FeedData(const framework::Tensor &t) { void Executor<Dtype, P>::FeedData(const framework::Tensor &t) {
InjectVariable(t, "feed"); InjectVariable(t, "feed");
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) { std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
...@@ -684,14 +425,14 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) { ...@@ -684,14 +425,14 @@ std::shared_ptr<framework::Tensor> Executor<Dtype, P>::FetchResult(int id) {
auto *output_tensor = framework::GetVarValue<framework::LoDTensor>( auto *output_tensor = framework::GetVarValue<framework::LoDTensor>(
out_keys[0], output_map, *(program_.scope)); out_keys[0], output_map, *(program_.scope));
return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor)); return std::make_shared<framework::Tensor>(framework::Tensor(*output_tensor));
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From_To(int start, int end) { void Executor<Dtype, P>::Predict_From_To(int start, int end) {
std::shared_ptr<framework::BlockDesc> to_predict_block = std::shared_ptr<framework::BlockDesc> to_predict_block =
to_predict_program_->Block(0); to_predict_program_->Block(0);
auto &ops = ops_of_block_[*to_predict_block.get()]; auto &ops = ops_of_block_[*to_predict_block.get()];
end = end < 0 ? (int)ops.size() : end; end = end < 0 ? static_cast<int>(ops.size()) : end;
PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(), PADDLE_MOBILE_ENFORCE(start >= 0 && start < end && end <= ops.size(),
"start or end parameter is wrong"); "start or end parameter is wrong");
...@@ -712,17 +453,17 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) { ...@@ -712,17 +453,17 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec; profile[i].runEnd = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
#endif #endif
} }
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_From(int start) { void Executor<Dtype, P>::Predict_From(int start) {
Predict_From_To(start); Predict_From_To(start);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void Executor<Dtype, P>::Predict_To(int end) { void Executor<Dtype, P>::Predict_To(int end) {
Predict_From_To(0, end); Predict_From_To(0, end);
}; }
#endif #endif
template class Executor<CPU, Precision::FP32>; template class Executor<CPU, Precision::FP32>;
......
...@@ -18,19 +18,12 @@ limitations under the License. */ ...@@ -18,19 +18,12 @@ limitations under the License. */
#include <memory> #include <memory>
#include <string> #include <string>
#include <vector> #include <vector>
#include "common/types.h" #include "common/types.h"
#include "common/util.h"
#include "framework/lod_tensor.h" #include "framework/lod_tensor.h"
#include "framework/operator.h" #include "framework/operator.h"
#include "framework/program/program.h" #include "framework/program/program.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#ifdef PADDLE_EXECUTOR_MULTITHREAD
#include <condition_variable>
#include <mutex>
#include <thread>
#include "common/dep_core.h"
#endif
using std::string;
namespace paddle_mobile { namespace paddle_mobile {
...@@ -38,50 +31,61 @@ template <typename Dtype = CPU, Precision P = Precision::FP32> ...@@ -38,50 +31,61 @@ template <typename Dtype = CPU, Precision P = Precision::FP32>
class Executor { class Executor {
public: public:
typedef typename PrecisionTrait<P>::ptype Ptype; typedef typename PrecisionTrait<P>::ptype Ptype;
// exector constructor
// @param program program converted from proto program in PaddlePaddle
// @param use_optimize bool whether use operator fusion to speed up or not
// @param loddable bool
Executor(const framework::Program<Dtype> program, int batch_size = 1,
const bool use_optimize = true, const bool loddable = false);
/* // predict with tensor input
* @b init executor with program load by Loader class // @param t input tensor to do prediction
* @b 用 loader load 的 program 实例化 executor // @return predicted tensor
* */
Executor(const framework::Program<Dtype> p, int batch_size = 1,
bool use_optimize = true, bool loddable = false);
/*
* @b to predict
* */
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t); std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
/*
* @b to predict // predict with lod tensor input
* */ // @param t input lod tensor to do prediction
// @return predicted lod tensor
std::shared_ptr<framework::LoDTensor> PredictLod( std::shared_ptr<framework::LoDTensor> PredictLod(
const framework::LoDTensor &t); const framework::LoDTensor &t);
/*
* @b to predict with vector and dim // predict with vector input and dims
* // @param input vector whose elements will be formed
* @b 使用 输入 和 输入的维度信息 进行预测 // @param input lod tensor to do prediction
* */ // @param dims vector whose elements will be formed
// @param input tensor shape
// @return vector which is flatted from predicted tensor
std::vector<Ptype> Predict(const std::vector<Ptype> &input, std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims); const std::vector<int64_t> &dims);
#ifdef PADDLE_MOBILE_FPGA
void InjectVariable(const framework::Tensor &t, string var_name);
void FeedData(const framework::Tensor &t);
std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
void Predict_To(int end);
#endif
protected: protected:
Executor() = default; Executor() = default;
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t,
int block_id);
bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc,
framework::Variable *var,
framework::LoDTensor *tensor) const;
void InitMemory(); void InitMemory();
void LoadMemory(const framework::VarDesc var_desc,
framework::LoDTensor *tensor, char **data);
void InitCombineMemory(); void InitCombineMemory();
void LoadMemory(void **data,
const std::shared_ptr<framework::VarDesc> var_desc,
framework::LoDTensor *tensor);
framework::Program<Dtype> program_; framework::Program<Dtype> program_;
int batch_size_ = 1; int batch_size_ = 1;
std::shared_ptr<framework::ProgramDesc> to_predict_program_; std::shared_ptr<framework::ProgramDesc> to_predict_program_;
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t,
int block_id);
std::map<framework::BlockDesc, std::map<framework::BlockDesc,
std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>> std::vector<std::shared_ptr<framework::OperatorBase<Dtype>>>>
ops_of_block_; ops_of_block_;
bool use_optimize_ = false;
bool loddable_ = false;
#ifdef PADDLE_EXECUTOR_MULTITHREAD
std::vector<depCore> depManager;
#endif
#ifdef PADDLE_MOBILE_PROFILE #ifdef PADDLE_MOBILE_PROFILE
struct ProfInfo { struct ProfInfo {
int tid = 0; int tid = 0;
...@@ -89,21 +93,8 @@ class Executor { ...@@ -89,21 +93,8 @@ class Executor {
uint64_t runEnd = 0UL; uint64_t runEnd = 0UL;
}; };
#endif #endif
bool use_optimize_ = false;
bool varInputMemory(const std::shared_ptr<framework::VarDesc> &var_desc, bool loddable_ = false;
framework::Variable *var,
framework::LoDTensor *tensor) const;
#ifdef PADDLE_MOBILE_FPGA
public:
void InjectVariable(const framework::Tensor &t, string var_name);
void FeedData(const framework::Tensor &t);
std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
void Predict_From_To(int start = 0, int end = -1);
void Predict_From(int start);
void Predict_To(int end);
#endif
}; };
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -27,8 +27,8 @@ using framework::Variable; ...@@ -27,8 +27,8 @@ using framework::Variable;
* @param scope * @param scope
*/ */
void InitMemoryFromProgram( void InitMemoryFromProgram(
std::shared_ptr<framework::ProgramDesc> &originProgramDesc, std::shared_ptr<framework::ProgramDesc> &originProgramDesc, // NOLINT
std::shared_ptr<framework::Scope> &scope) { std::shared_ptr<framework::Scope> &scope) { // NOLINT
for (const auto &block : originProgramDesc.get()->Blocks()) { for (const auto &block : originProgramDesc.get()->Blocks()) {
for (const auto &var_desc : block->Vars()) { for (const auto &var_desc : block->Vars()) {
auto var = scope.get()->Var(var_desc->Name()); auto var = scope.get()->Var(var_desc->Name());
...@@ -61,12 +61,16 @@ void InitMemoryFromProgram( ...@@ -61,12 +61,16 @@ void InitMemoryFromProgram(
*/ */
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void FusionAndPrintInfos( void FusionAndPrintInfos(
bool &optimize, bool &can_add_split, framework::Program<Dtype, P> &program, bool optimize, bool can_add_split,
framework::Program<Dtype, P> &program, // NOLINT
const std::shared_ptr<framework::ProgramDesc> &originProgramDesc) { const std::shared_ptr<framework::ProgramDesc> &originProgramDesc) {
if (optimize) { if (optimize) {
framework::ProgramOptimize program_optimize; framework::ProgramOptimize program_optimize;
program.optimizeProgram = program.optimizeProgram =
program_optimize.FusionOptimize(originProgramDesc, can_add_split); program_optimize.FusionOptimize(originProgramDesc, can_add_split);
if (!program.optimizeProgram) {
program.optimizeProgram = originProgramDesc;
}
} }
if (optimize) { if (optimize) {
program.optimizeProgram->Description("optimize: "); program.optimizeProgram->Description("optimize: ");
...@@ -74,6 +78,7 @@ void FusionAndPrintInfos( ...@@ -74,6 +78,7 @@ void FusionAndPrintInfos(
originProgramDesc->Description("program: "); originProgramDesc->Description("program: ");
} }
} }
static size_t ReadBuffer(const char *file_name, uint8_t **out) { static size_t ReadBuffer(const char *file_name, uint8_t **out) {
FILE *fp; FILE *fp;
fp = fopen(file_name, "rb"); fp = fopen(file_name, "rb");
......
...@@ -24,19 +24,11 @@ namespace paddle_mobile { ...@@ -24,19 +24,11 @@ namespace paddle_mobile {
template <typename Dtype = CPU, Precision P = Precision::FP32> template <typename Dtype = CPU, Precision P = Precision::FP32>
class Loader { class Loader {
public: public:
/*
* @b load separate format fluid model
* @b 加载分开形式的 fluid 模型
* */
const framework::Program<Dtype, P> Load(const std::string &dirname, const framework::Program<Dtype, P> Load(const std::string &dirname,
bool optimize = false, bool optimize = false,
bool quantification = false, bool quantification = false,
bool can_add_split = false); bool can_add_split = false);
/*
* @b load combine format fluid mode
* @b 加载结合在一起格式的模型
* */
const framework::Program<Dtype, P> Load(const std::string &model_path, const framework::Program<Dtype, P> Load(const std::string &model_path,
const std::string &para_path, const std::string &para_path,
bool optimize = false, bool optimize = false,
......
...@@ -19,10 +19,9 @@ namespace paddle_mobile { ...@@ -19,10 +19,9 @@ namespace paddle_mobile {
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::SetThreadNum(int num) { void PaddleMobile<Dtype, P>::SetThreadNum(int num) {
#ifdef _OPENMP #ifdef _OPENMP
// omp_set_dynamic(0);
omp_set_num_threads(num); omp_set_num_threads(num);
#endif #endif
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize, bool PaddleMobile<Dtype, P>::Load(const std::string &dirname, bool optimize,
...@@ -135,33 +134,31 @@ void PaddleMobile<Dtype, P>::InjectVariable(const framework::Tensor &t, ...@@ -135,33 +134,31 @@ void PaddleMobile<Dtype, P>::InjectVariable(const framework::Tensor &t,
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) { void PaddleMobile<Dtype, P>::FeedData(const framework::Tensor &t) {
executor_->FeedData(t); executor_->FeedData(t);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult(int id) { std::shared_ptr<framework::Tensor> PaddleMobile<Dtype, P>::FetchResult(int id) {
return executor_->FetchResult(id); return executor_->FetchResult(id);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_From_To(int start, int end) { void PaddleMobile<Dtype, P>::Predict_From_To(int start, int end) {
executor_->Predict_From_To(start, end); executor_->Predict_From_To(start, end);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_From(int start) { void PaddleMobile<Dtype, P>::Predict_From(int start) {
executor_->Predict_From(start); executor_->Predict_From(start);
}; }
template <typename Dtype, Precision P> template <typename Dtype, Precision P>
void PaddleMobile<Dtype, P>::Predict_To(int end) { void PaddleMobile<Dtype, P>::Predict_To(int end) {
executor_->Predict_To(end); executor_->Predict_To(end);
}; }
#endif #endif
template class PaddleMobile<CPU, Precision::FP32>; template class PaddleMobile<CPU, Precision::FP32>;
template class PaddleMobile<FPGA, Precision::FP32>; template class PaddleMobile<FPGA, Precision::FP32>;
template class PaddleMobile<GPU_MALI, Precision::FP32>; template class PaddleMobile<GPU_MALI, Precision::FP32>;
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -22,6 +22,7 @@ limitations under the License. */ ...@@ -22,6 +22,7 @@ limitations under the License. */
#endif // _OPENMP #endif // _OPENMP
#include "common/types.h" #include "common/types.h"
#include "framework/load_ops.h"
#include "framework/tensor.h" #include "framework/tensor.h"
#include "io/executor.h" #include "io/executor.h"
#include "io/loader.h" #include "io/loader.h"
...@@ -34,67 +35,31 @@ class PaddleMobile { ...@@ -34,67 +35,31 @@ class PaddleMobile {
public: public:
PaddleMobile() {} PaddleMobile() {}
/*
* @b load separate format fluid model
* @b 加载分开形式的 fluid 模型
* */
bool Load(const std::string &dirname, bool optimize = false, bool Load(const std::string &dirname, bool optimize = false,
bool quantification = false, int batch_size = 1, bool quantification = false, int batch_size = 1,
bool loddable = false); bool loddable = false);
/*
* @b load combine format fluid mode
* @b 加载结合在一起格式的模型
* */
bool Load(const std::string &model_path, const std::string &para_path, bool Load(const std::string &model_path, const std::string &para_path,
bool optimize = false, bool quantification = false, bool optimize = false, bool quantification = false,
int batch_size = 1, bool loddable = false); int batch_size = 1, bool loddable = false);
/*
* @b 设置线程数, 当 cmake 中开启 openmp 时生效
* */
void SetThreadNum(int num);
/*
* @b to predict
* */
std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t); std::shared_ptr<framework::Tensor> Predict(const framework::Tensor &t);
/*
* @b to predict
* */
std::shared_ptr<framework::Tensor> PredictLod(const framework::LoDTensor &t); std::shared_ptr<framework::Tensor> PredictLod(const framework::LoDTensor &t);
/*
* @b to predict with vector and dim
*
* @b 使用 输入 和 输入的维度信息 进行预测
* */
std::vector<Ptype> Predict(const std::vector<Ptype> &input, std::vector<Ptype> Predict(const std::vector<Ptype> &input,
const std::vector<int64_t> &dims); const std::vector<int64_t> &dims);
/**
* 从内存加载model 以及 combinedparams的接口
*
* @param model_len model 文件的内存大小
* @param model_buf model文件的内存
* @param combined_params_len params文件的内存大小
* @param combined_params_buf params文件的内存
* @return
*/
bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf, bool LoadCombinedMemory(size_t model_len, const uint8_t *model_buf,
size_t combined_params_len, size_t combined_params_len,
const uint8_t *combined_params_buf); const uint8_t *combined_params_buf);
void SetThreadNum(int num);
void Clear(); void Clear();
~PaddleMobile(); ~PaddleMobile();
private:
std::shared_ptr<Loader<Dtype, P>> loader_;
std::shared_ptr<Executor<Dtype, P>> executor_;
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
public:
void InjectVariable(const framework::Tensor &t, string var_name); void InjectVariable(const framework::Tensor &t, string var_name);
void FeedData(const framework::Tensor &t); void FeedData(const framework::Tensor &t);
std::shared_ptr<framework::Tensor> FetchResult(int id = -1); std::shared_ptr<framework::Tensor> FetchResult(int id = -1);
...@@ -102,6 +67,10 @@ class PaddleMobile { ...@@ -102,6 +67,10 @@ class PaddleMobile {
void Predict_From(int start); void Predict_From(int start);
void Predict_To(int end); void Predict_To(int end);
#endif #endif
private:
std::shared_ptr<Loader<Dtype, P>> loader_;
std::shared_ptr<Executor<Dtype, P>> executor_;
}; };
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -13,15 +13,12 @@ ...@@ -13,15 +13,12 @@
limitations under the License. */ limitations under the License. */
#import "PaddleMobileCPU.h" #import "PaddleMobileCPU.h"
#import "framework/load_ops.h"
#import "op_symbols.h" #import "framework/tensor.h"
#include "framework/tensor.h"
#import "io/paddle_mobile.h" #import "io/paddle_mobile.h"
#import <memory> #import <memory>
#import <vector> #import <vector>
@interface PaddleMobileCPUResult() @interface PaddleMobileCPUResult()
-(void)toSetOutput:(float *)output; -(void)toSetOutput:(float *)output;
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "operators/batchnorm_op.h"
#include "operators/bilinear_interp_op.h"
#include "operators/box_coder_op.h"
#include "operators/concat_op.h"
#include "operators/conv_op.h"
#include "operators/conv_transpose_op.h"
#include "operators/crf_op.h"
#include "operators/depthwise_conv_op.h"
#include "operators/dropout_op.h"
#include "operators/elementwise_add_op.h"
#include "operators/feed_op.h"
#include "operators/fetch_op.h"
#include "operators/flatten_op.h"
#include "operators/fusion_conv_add.h"
#include "operators/fusion_conv_add_add_prelu_op.h"
#include "operators/fusion_conv_add_bn_op.h"
#include "operators/fusion_conv_add_bn_relu_op.h"
#include "operators/fusion_conv_add_prelu_op.h"
#include "operators/fusion_conv_add_relu_op.h"
#include "operators/fusion_conv_bn_add_relu_op.h"
#include "operators/fusion_conv_bn_relu_op.h"
#include "operators/fusion_dwconv_bn_relu_op.h"
#include "operators/fusion_elementwise_add_relu_op.h"
#include "operators/fusion_fc_op.h"
#include "operators/fusion_fc_relu_op.h"
#include "operators/gru_op.h"
#include "operators/im2sequence_op.h"
#include "operators/lookup_op.h"
#include "operators/lrn_op.h"
#include "operators/mul_op.h"
#include "operators/multiclass_nms_op.h"
#include "operators/pool_op.h"
#include "operators/prelu_op.h"
#include "operators/prior_box_op.h"
#include "operators/relu_op.h"
#include "operators/reshape_op.h"
#include "operators/resize_op.h"
#include "operators/scale_op.h"
#include "operators/shape_op.h"
#include "operators/sigmoid_op.h"
#include "operators/slice_op.h"
#include "operators/softmax_op.h"
#include "operators/split_op.h"
#include "operators/transpose_op.h"
...@@ -84,14 +84,14 @@ Java_com_baidu_paddle_PML_loadnlp(JNIEnv *env, jclass thiz, jstring modelPath) { ...@@ -84,14 +84,14 @@ Java_com_baidu_paddle_PML_loadnlp(JNIEnv *env, jclass thiz, jstring modelPath) {
#ifdef ENABLE_EXCEPTION #ifdef ENABLE_EXCEPTION
try { try {
isLoadOk = getPaddleMobileInstance()->Load( isLoadOk = getPaddleMobileInstance()->Load(
jstring2cppstring(env, modelPath), optimize, false, 1, true); jstring2cppstring(env, modelPath), optimize, false, true);
} catch (paddle_mobile::PaddleMobileException &e) { } catch (paddle_mobile::PaddleMobileException &e) {
ANDROIDLOGE("jni got an PaddleMobileException! ", e.what()); ANDROIDLOGE("jni got an PaddleMobileException! ", e.what());
isLoadOk = false; isLoadOk = false;
} }
#else #else
isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath), isLoadOk = getPaddleMobileInstance()->Load(jstring2cppstring(env, modelPath),
optimize, false, 1, true); optimize, false, true);
#endif #endif
return static_cast<jboolean>(isLoadOk); return static_cast<jboolean>(isLoadOk);
} }
......
...@@ -46,13 +46,4 @@ class BatchNormOp ...@@ -46,13 +46,4 @@ class BatchNormOp
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(batch_norm);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(batch_norm);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -50,12 +50,4 @@ class BilinearOp : public framework::OperatorWithKernel< ...@@ -50,12 +50,4 @@ class BilinearOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(bilinear_interp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -51,12 +51,4 @@ class BoxCoderOp : public framework::OperatorWithKernel< ...@@ -51,12 +51,4 @@ class BoxCoderOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(box_coder);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -46,14 +46,4 @@ class ConcatOp : public framework::OperatorWithKernel< ...@@ -46,14 +46,4 @@ class ConcatOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(concat);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(concat);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(concat);
#endif
#endif #endif
...@@ -46,14 +46,4 @@ class ConvOp : public framework::OperatorWithKernel< ...@@ -46,14 +46,4 @@ class ConvOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d);
#endif
#endif #endif
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_TRANSPOSE #ifdef CONV_TRANSPOSE_OP
#include "operators/conv_transpose_op.h" #include "operators/conv_transpose_op.h"
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_TRANSPOSE #ifdef CONV_TRANSPOSE_OP
#pragma once #pragma once
...@@ -88,14 +88,4 @@ class ConvOpTranspose : public framework::OperatorWithKernel< ...@@ -88,14 +88,4 @@ class ConvOpTranspose : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(conv2d_transpose);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(conv2d_transpose);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(conv2d_transpose);
#endif
#endif #endif
...@@ -47,12 +47,4 @@ class CrfOp : public framework::OperatorWithKernel< ...@@ -47,12 +47,4 @@ class CrfOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(crf_decoding);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -56,9 +56,5 @@ namespace ops = paddle_mobile::operators; ...@@ -56,9 +56,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp); REGISTER_OPERATOR_CPU(depthwise_conv2d, ops::DepthwiseConvOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -48,12 +48,4 @@ class DepthwiseConvOp : public framework::OperatorWithKernel< ...@@ -48,12 +48,4 @@ class DepthwiseConvOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(depthwise_conv2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/dequantize_op.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
void DequantizeOp<DeviceType, T>::InferShape() const {
  // Dequantization is element-wise, so the output tensor has exactly
  // the same shape as the quantized input tensor.
  this->param_.out_->Resize(this->param_.input_->dims());
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
// Only a CPU kernel is registered for dequantize; no MALI_GPU/FPGA
// registration blocks exist for this op.
REGISTER_OPERATOR_CPU(dequantize, ops::DequantizeOp);
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/dequantize_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Dequantize operator. Presumably maps a quantized tensor back to a
// floating-point tensor — the actual math lives in DequantizeKernel
// (operators/kernel/dequantize_kernel.h); confirm semantics there.
// This class only wires DequantizeParam and DequantizeKernel into the
// OperatorWithKernel framework and provides shape inference.
template <typename DeviceType, typename T>
class DequantizeOp
    : public framework::OperatorWithKernel<DeviceType,
                                           DequantizeParam<DeviceType>,
                                           DequantizeKernel<DeviceType, T>> {
 public:
  // Forwards all construction arguments (op type name, input/output
  // variable maps, attributes, scope) straight to the base class.
  DequantizeOp(const std::string &type, const VariableNameMap &inputs,
               const VariableNameMap &outputs,
               const framework::AttributeMap &attrs,
               std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, DequantizeParam<DeviceType>,
                                      DequantizeKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  // Infers the output shape; defined in dequantize_op.cc, where the
  // output is resized to the input's dims.
  void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
...@@ -30,8 +30,6 @@ namespace ops = paddle_mobile::operators; ...@@ -30,8 +30,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp); REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp); REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp);
#endif #endif
......
...@@ -50,13 +50,4 @@ class DropoutOp : public framework::OperatorWithKernel< ...@@ -50,13 +50,4 @@ class DropoutOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(dropout);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(dropout);
#endif
#endif #endif
...@@ -35,7 +35,5 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp); ...@@ -35,7 +35,5 @@ REGISTER_OPERATOR_CPU(elementwise_add, ops::ElementwiseAddOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp); REGISTER_OPERATOR_MALI_GPU(elementwise_add, ops::ElementwiseAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -48,13 +48,4 @@ class ElementwiseAddOp : public framework::OperatorWithKernel< ...@@ -48,13 +48,4 @@ class ElementwiseAddOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(elementwise_add);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(elementwise_add);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,12 +12,10 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "feed_op.h" #include "operators/feed_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(feed, ops::FeedOp); REGISTER_OPERATOR_CPU(feed, ops::FeedOp);
#endif #endif
......
...@@ -20,11 +20,11 @@ limitations under the License. */ ...@@ -20,11 +20,11 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using std::string;
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class FeedOp : public framework::OperatorBase<DeviceType> { class FeedOp : public framework::OperatorBase<DeviceType> {
public: public:
FeedOp(const string &type, const VariableNameMap &inputs, FeedOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const framework::AttributeMap attrs, const VariableNameMap &outputs, const framework::AttributeMap attrs,
std::shared_ptr<framework::Scope> scope) std::shared_ptr<framework::Scope> scope)
: framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs, : framework::OperatorBase<DeviceType>(type, inputs, outputs, attrs,
...@@ -35,10 +35,6 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -35,10 +35,6 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
auto out_dims = param_.Out()->dims(); auto out_dims = param_.Out()->dims();
out_dims[0] = param_.BatchSize(); out_dims[0] = param_.BatchSize();
param_.Out()->Resize(out_dims); param_.Out()->Resize(out_dims);
// note : mobile infershape iscalled when executer is created. so do not
// pass lod here .
// it is empty
} }
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
...@@ -49,7 +45,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -49,7 +45,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
} }
void RunImpl() const { void RunImpl() const {
auto input = (Tensor *)const_cast<LoDTensor *>(param_.InputX()); auto input = (Tensor *)const_cast<LoDTensor *>(param_.InputX()); // NOLINT
fpga::format_image(input); fpga::format_image(input);
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
Tensor *output = param_.Out(); Tensor *output = param_.Out();
...@@ -61,7 +57,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -61,7 +57,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
args.output_data_type = fpga::DATA_TYPE_FP16; args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW; args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC; args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = (void *)input_ptr; args.image.address = (void *)input_ptr; // NOLINT
args.image.channels = (uint32_t)input->dims()[1]; args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2]; args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3]; args.image.width = (uint32_t)input->dims()[3];
...@@ -86,13 +82,3 @@ class FeedOp : public framework::OperatorBase<DeviceType> { ...@@ -86,13 +82,3 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(feed);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(feed);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(feed);
#endif
...@@ -12,10 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,10 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "fetch_op.h" #include "operators/fetch_op.h"
namespace paddle_mobile {
namespace operators {}
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
......
...@@ -46,13 +46,3 @@ class FetchOp : public framework::OperatorBase<DeviceType> { ...@@ -46,13 +46,3 @@ class FetchOp : public framework::OperatorBase<DeviceType> {
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fetch);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fetch);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fetch);
#endif
...@@ -53,8 +53,6 @@ namespace ops = paddle_mobile::operators; ...@@ -53,8 +53,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(flatten, ops::FlattenOp); REGISTER_OPERATOR_CPU(flatten, ops::FlattenOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -63,12 +63,4 @@ class FlattenOp : public framework::OperatorWithKernel< ...@@ -63,12 +63,4 @@ class FlattenOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(flatten);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDADDPRELU_OP #ifdef FUSION_CONVADDADDPRELU_OP
#include "fusion_conv_add_add_prelu_op.h" #include "operators/fusion_conv_add_add_prelu_op.h"
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -48,13 +48,14 @@ void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const { ...@@ -48,13 +48,14 @@ void FusionConvAddAddPReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_add_prelu,
ops::FusionConvAddAddPReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); REGISTER_OPERATOR_CPU(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_add_add_prelu, ops::FusionConvAddAddPReluOp);
#endif #endif
#endif #endif // FUSION_CONVADDADDPRELU_OP
...@@ -76,37 +76,7 @@ class FusionConvAddAddPReluOp ...@@ -76,37 +76,7 @@ class FusionConvAddAddPReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_ADD_PRELU_REGISTER
#define CONV_ADD_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar(
new FusionConvAddAddPReluOpMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef CONV_ADD_ADD_PRELU_REGISTER
#define CONV_ADD_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_add_prelu_registrar(
new FusionConvAddAddPReluOpMatcher());
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_add_prelu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_add_prelu);
#endif
#endif #endif
...@@ -49,11 +49,11 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const { ...@@ -49,11 +49,11 @@ void FusionConvAddBNOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_bn, ops::FusionConvAddBNMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_bn, ops::FusionConvAddBNOp); REGISTER_OPERATOR_CPU(fusion_conv_add_bn, ops::FusionConvAddBNOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_bn, ops::FusionConvAddBNOp); REGISTER_OPERATOR_FPGA(fusion_conv_add_bn, ops::FusionConvAddBNOp);
#endif #endif
......
...@@ -70,46 +70,7 @@ class FusionConvAddBNOp : public framework::OperatorWithKernel< ...@@ -70,46 +70,7 @@ class FusionConvAddBNOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_ADD_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_registrar(
new FusionConvAddBNMatcher());
#define FUSION_CONV_ADD_BN_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_CONV_ADD_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_registrar(
new FusionConvAddBNMatcher());
#define FUSION_CONV_ADD_BN_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_ADD_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_registrar(
new FusionConvAddBNMatcher());
#define FUSION_CONV_ADD_BN_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_bn);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_bn);
#endif
#endif #endif
...@@ -49,11 +49,12 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const { ...@@ -49,11 +49,12 @@ void FusionConvAddBNReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_bn_relu,
ops::FusionConvAddBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); REGISTER_OPERATOR_CPU(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_add_bn_relu, ops::FusionConvAddBNReluOp);
#endif #endif
......
...@@ -75,46 +75,7 @@ class FusionConvAddBNReluOp ...@@ -75,46 +75,7 @@ class FusionConvAddBNReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
new FusionConvAddBNReluMatcher());
#define FUSION_CONV_ADD_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
new FusionConvAddBNReluMatcher());
#define FUSION_CONV_ADD_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_ADD_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_bn_relu_registrar(
new FusionConvAddBNReluMatcher());
#define FUSION_CONV_ADD_BN_RELU_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_bn_relu);
#endif
#endif #endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef FUSION_CONVADD_OP #ifdef FUSION_CONVADD_OP
#include "operators/fusion_conv_add.h" #include "operators/fusion_conv_add_op.h"
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -49,13 +49,13 @@ void FusionConvAddOp<Dtype, T>::InferShape() const { ...@@ -49,13 +49,13 @@ void FusionConvAddOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add, ops::FusionConvAddMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp); REGISTER_OPERATOR_CPU(fusion_conv_add, ops::FusionConvAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp); REGISTER_OPERATOR_MALI_GPU(fusion_conv_add, ops::FusionConvAddOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -65,40 +65,7 @@ class FusionConvAddOp : public framework::OperatorWithKernel< ...@@ -65,40 +65,7 @@ class FusionConvAddOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef CONV_ADD_REGISTER
static framework::FusionOpRegistrar convadd_registrar(
new FusionConvAddMatcher());
#define CONV_ADD_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fusion_conv_add);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDPRELU_OP #ifdef FUSION_CONVADDPRELU_OP
#include "fusion_conv_add_prelu_op.h" #include "operators/fusion_conv_add_prelu_op.h"
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -48,11 +48,12 @@ void FusionConvAddPReluOp<Dtype, T>::InferShape() const { ...@@ -48,11 +48,12 @@ void FusionConvAddPReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_prelu,
ops::FusionConvAddPReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); REGISTER_OPERATOR_CPU(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_add_prelu, ops::FusionConvAddPReluOp);
#endif #endif
......
...@@ -71,37 +71,7 @@ class FusionConvAddPReluOp ...@@ -71,37 +71,7 @@ class FusionConvAddPReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_PRELU_REGISTER
#define CONV_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar(
new FusionConvAddPReluOpMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef CONV_ADD_PRELU_REGISTER
#define CONV_ADD_PRELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_prelu_registrar(
new FusionConvAddPReluOpMatcher());
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_prelu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_prelu);
#endif
#endif #endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef FUSION_CONVADDRELU_OP #ifdef FUSION_CONVADDRELU_OP
#include "fusion_conv_add_relu_op.h" #include "operators/fusion_conv_add_relu_op.h"
#include "operators/math/conv_func.h" #include "operators/math/conv_func.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -48,11 +48,11 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const { ...@@ -48,11 +48,11 @@ void FusionConvAddReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_add_relu, ops::FusionConvAddReluOpMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp); REGISTER_OPERATOR_CPU(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_add_relu, ops::FusionConvAddReluOp);
#endif #endif
......
...@@ -65,37 +65,7 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel< ...@@ -65,37 +65,7 @@ class FusionConvAddReluOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef CONV_ADD_RELU_REGISTER
#define CONV_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
new FusionConvAddReluOpMatcher());
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef CONV_ADD_RELU_REGISTER
#define CONV_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_add_relu_registrar(
new FusionConvAddReluOpMatcher());
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_add_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_add_relu);
#endif
#endif #endif
...@@ -49,11 +49,12 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const { ...@@ -49,11 +49,12 @@ void FusionConvBNAddReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_bn_add_relu,
ops::FusionConvBNAddReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif #endif
......
...@@ -80,46 +80,7 @@ class FusionConvBNAddReluOp ...@@ -80,46 +80,7 @@ class FusionConvBNAddReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_BN_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_add_relu_registrar(
new FusionConvBNAddReluMatcher());
#define FUSION_CONV_BN_ADD_RELU_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn_add_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_bn_add_relu);
#endif
#endif #endif
...@@ -48,11 +48,11 @@ void FusionConvBNOp<Dtype, T>::InferShape() const { ...@@ -48,11 +48,11 @@ void FusionConvBNOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_bn, ops::FusionConvBNMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn, ops::FusionConvBNOp); REGISTER_OPERATOR_CPU(fusion_conv_bn, ops::FusionConvBNOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_bn, ops::FusionConvBNOp); REGISTER_OPERATOR_FPGA(fusion_conv_bn, ops::FusionConvBNOp);
#endif #endif
......
...@@ -67,39 +67,7 @@ class FusionConvBNOp : public framework::OperatorWithKernel< ...@@ -67,39 +67,7 @@ class FusionConvBNOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_registrar(
new FusionConvBNMatcher());
#define FUSION_CONV_BN_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_BN_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_registrar(
new FusionConvBNMatcher());
#define FUSION_CONV_BN_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_bn);
#endif
#endif #endif
...@@ -49,11 +49,11 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const { ...@@ -49,11 +49,11 @@ void FusionConvBNReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_conv_bn_relu, ops::FusionConvBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp); REGISTER_OPERATOR_CPU(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_bn_relu, ops::FusionConvBNReluOp); REGISTER_OPERATOR_FPGA(fusion_conv_bn_relu, ops::FusionConvBNReluOp);
#endif #endif
......
...@@ -72,39 +72,7 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel< ...@@ -72,39 +72,7 @@ class FusionConvBNReluOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_CONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar(
new FusionConvBNReluMatcher());
#define FUSION_CONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_CONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_conv_bn_relu_registrar(
new FusionConvBNReluMatcher());
#define FUSION_CONV_BN_RELU_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_conv_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_conv_bn_relu);
#endif
#endif #endif
...@@ -49,11 +49,11 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const { ...@@ -49,11 +49,11 @@ void FusionDWConvBNReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp); REGISTER_OPERATOR_CPU(fusion_dwconv_bn_relu, ops::FusionDWConvBNReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -73,38 +73,7 @@ class FusionDWConvBNReluOp ...@@ -73,38 +73,7 @@ class FusionDWConvBNReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_DWCONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
new FusionDWConvBNReluMatcher());
#define FUSION_DWCONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_DWCONV_BN_RELU_REGISTER
static framework::FusionOpRegistrar fusion_dwconv_bn_relu_registrar(
new FusionDWConvBNReluMatcher());
#define FUSION_DWCONV_BN_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_dwconv_bn_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef FUSION_ELEMENTWISEADDRELU_OP #ifdef FUSION_ELEMENTWISEADDRELU_OP
#include "fusion_elementwise_add_relu_op.h" #include "operators/fusion_elementwise_add_relu_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -29,6 +29,9 @@ void FusionElementwiseAddReluOp<Dtype, T>::InferShape() const { ...@@ -29,6 +29,9 @@ void FusionElementwiseAddReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_elementwise_add_relu,
ops::FusioneElementwiseAddReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
// REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu, // REGISTER_OPERATOR_CPU(fusion_elementwise_add_relu,
// ops::FusionElementwiseAddReluOp); // ops::FusionElementwiseAddReluOp);
......
...@@ -61,39 +61,7 @@ class FusionElementwiseAddReluOp ...@@ -61,39 +61,7 @@ class FusionElementwiseAddReluOp
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_elementwise_relu_registrar(
new FusioneElementwiseAddReluMatcher());
#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_elementwise_relu_registrar(
new FusioneElementwiseAddReluMatcher());
#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_ELEMENTWISE_ADD_RELU_REGISTER
static framework::FusionOpRegistrar fusion_elementwise_relu_registrar(
new FusioneElementwiseAddReluMatcher());
#define FUSION_ELEMENTWISE_ADD_RELU_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_elementwise_add_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_elementwise_add_relu);
#endif
#endif #endif
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_FC_OP #ifdef FUSION_FC_OP
#include "operators/fusion_fc_op.h" #include "operators/fusion_fc_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -54,6 +55,8 @@ void FusionFcOp<Dtype, T>::InferShape() const { ...@@ -54,6 +55,8 @@ void FusionFcOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_fc, ops::FusionFcMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp); REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
#endif #endif
...@@ -64,4 +67,4 @@ REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp); ...@@ -64,4 +67,4 @@ REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp);
REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp); REGISTER_OPERATOR_FPGA(fusion_fc, ops::FusionFcOp);
#endif #endif
#endif #endif // FUSION_FC_OP
...@@ -25,8 +25,7 @@ limitations under the License. */ ...@@ -25,8 +25,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using std::string;
using std::vector;
class FusionFcMatcher : public framework::FusionOpMatcher { class FusionFcMatcher : public framework::FusionOpMatcher {
public: public:
FusionFcMatcher() { FusionFcMatcher() {
...@@ -49,7 +48,7 @@ class FusionFcOp : public framework::OperatorWithKernel< ...@@ -49,7 +48,7 @@ class FusionFcOp : public framework::OperatorWithKernel<
DeviceType, FusionFcParam<DeviceType>, DeviceType, FusionFcParam<DeviceType>,
operators::FusionFcKernel<DeviceType, T>> { operators::FusionFcKernel<DeviceType, T>> {
public: public:
FusionFcOp(const string &type, const VariableNameMap &inputs, FusionFcOp(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs, const VariableNameMap &outputs,
const framework::AttributeMap &attrs, const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope) std::shared_ptr<framework::Scope> scope)
...@@ -60,42 +59,11 @@ class FusionFcOp : public framework::OperatorWithKernel< ...@@ -60,42 +59,11 @@ class FusionFcOp : public framework::OperatorWithKernel<
using framework::OperatorWithKernel< using framework::OperatorWithKernel<
DeviceType, FusionFcParam<DeviceType>, DeviceType, FusionFcParam<DeviceType>,
operators::FusionFcKernel<DeviceType, T>>::OperatorWithKernel; operators::FusionFcKernel<DeviceType, T>>::OperatorWithKernel;
void InferShape() const override;
protected: void InferShape() const override;
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_FC_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#define FUSION_FC_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_FC_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#define FUSION_FC_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_FC_REGISTER
static framework::FusionOpRegistrar fc_registrar(new FusionFcMatcher());
#define FUSION_FC_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU #endif // FUSION_FC_OP
USE_OP_CPU(fusion_fc);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fusion_fc);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_fc);
#endif
#endif
...@@ -54,6 +54,9 @@ void FusionFcReluOp<Dtype, T>::InferShape() const { ...@@ -54,6 +54,9 @@ void FusionFcReluOp<Dtype, T>::InferShape() const {
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
REGISTER_FUSION_MATCHER(fusion_fc_relu, ops::FusionFcReluMatcher);
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp); REGISTER_OPERATOR_CPU(fusion_fc_relu, ops::FusionFcReluOp);
#endif #endif
......
...@@ -64,39 +64,7 @@ class FusionFcReluOp : public framework::OperatorWithKernel< ...@@ -64,39 +64,7 @@ class FusionFcReluOp : public framework::OperatorWithKernel<
protected: protected:
}; };
#ifdef PADDLE_MOBILE_CPU
#ifndef FUSION_FC_RELU_REGISTER
static framework::FusionOpRegistrar fc_relu_registrar(
new FusionFcReluMatcher());
#define FUSION_FC_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#ifndef FUSION_FC_RELU_REGISTER
static framework::FusionOpRegistrar fc_relu_registrar(
new FusionFcReluMatcher());
#define FUSION_FC_RELU_REGISTER
#endif
#endif
#ifdef PADDLE_MOBILE_FPGA
#ifndef FUSION_FC_RELU_REGISTER
static framework::FusionOpRegistrar fc_relu_registrar(
new FusionFcReluMatcher());
#define FUSION_FC_RELU_REGISTER
#endif
#endif
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(fusion_fc_relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(fusion_fc_relu);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(fusion_fc_relu);
#endif
#endif // FUSION_FC_RELU_OP #endif // FUSION_FC_RELU_OP
...@@ -64,8 +64,6 @@ namespace ops = paddle_mobile::operators; ...@@ -64,8 +64,6 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(gru, ops::GruOp); REGISTER_OPERATOR_CPU(gru, ops::GruOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
#endif #endif
......
...@@ -47,12 +47,4 @@ class GruOp : public framework::OperatorWithKernel< ...@@ -47,12 +47,4 @@ class GruOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(gru);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -50,12 +50,4 @@ class Im2SequenceOp : public framework::OperatorWithKernel< ...@@ -50,12 +50,4 @@ class Im2SequenceOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(im2sequence);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#ifdef BILINEAR_INTERP_OP #ifdef BILINEAR_INTERP_OP
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_TRANSPOSE #ifdef CONV_TRANSPOSE_OP
#include "operators/kernel/conv_transpose_kernel.h" #include "operators/kernel/conv_transpose_kernel.h"
#include "operators/kernel/central-arm-func/conv_transpose_arm_func.h" #include "operators/kernel/central-arm-func/conv_transpose_arm_func.h"
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CPU
#include "operators/kernel/dequantize_kernel.h"
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
// One-time setup for the CPU float dequantize kernel. No precomputation or
// parameter validation is needed, so initialization always succeeds.
template <>
bool DequantizeKernel<CPU, float>::Init(DequantizeParam<CPU> *param) {
  return true;
}
// Dequantize the int32 `input_` tensor back to float:
//   y[i] = x[i] / (activation_scale * weight_scale)
// The division is folded into a single multiplicative `scale` so both the
// NEON and scalar paths are pure multiplies.
template <>
void DequantizeKernel<CPU, float>::Compute(
    const DequantizeParam<CPU> &param) const {
  const Tensor *input = param.input_;
  Tensor *output = param.out_;
  // The runtime activation scale is stored in a tensor; only element 0 is read.
  float activation_scale = param.activation_scale_->data<float>()[0];
  float weight_scale = param.weight_scale_;
  const int32_t *x = input->data<const int32_t>();
  float *y = output->mutable_data<float>();
  size_t size = output->numel();
  float scale = 1.f / (activation_scale * weight_scale);
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // NEON fast path: 16 elements per iteration (4 registers x 4 lanes).
  size_t loop = size >> 4;
  size_t remain = size & 0xF;  // tail handled by the scalar loop below
  float32x4_t s = vdupq_n_f32(scale);
  for (size_t i = 0; i < loop; ++i) {
    int32x4_t r0 = vld1q_s32(x);
    int32x4_t r1 = vld1q_s32(x + 4);
    int32x4_t r2 = vld1q_s32(x + 8);
    int32x4_t r3 = vld1q_s32(x + 12);
    // Convert each int32 lane to float, then apply the scale.
    float32x4_t f0 = vcvtq_f32_s32(r0);
    float32x4_t f1 = vcvtq_f32_s32(r1);
    float32x4_t f2 = vcvtq_f32_s32(r2);
    float32x4_t f3 = vcvtq_f32_s32(r3);
    f0 = vmulq_f32(f0, s);
    f1 = vmulq_f32(f1, s);
    f2 = vmulq_f32(f2, s);
    f3 = vmulq_f32(f3, s);
    vst1q_f32(y, f0);
    vst1q_f32(y + 4, f1);
    vst1q_f32(y + 8, f2);
    vst1q_f32(y + 12, f3);
    x += 16;
    y += 16;
  }
  // x/y now point at the tail; shrink `size` so the scalar loop finishes it.
  size = remain;
#endif
  for (size_t i = 0; i < size; ++i) {
    y[i] = x[i] * scale;
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#ifdef FLATTEN_OP #ifdef FLATTEN_OP
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef PADDLE_MOBILE_CPU
#include "operators/kernel/quantize_kernel.h"
#include <cmath>
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#ifndef __aarch64__
// Horizontal max over the 4 lanes of `r`. AArch32 NEON has no vmaxvq_f32
// intrinsic (it is AArch64-only), so emulate it with pairwise maxima.
float32_t vmaxvq_f32(float32x4_t r) {
  float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
  return vget_lane_f32(vpmax_f32(v, v), 0);
}
#endif
// Per-lane round toward zero: vcvtq_s32_f32 truncates the fraction.
int32x4_t vrnd_towards_zero(float32x4_t r) { return vcvtq_s32_f32(r); }
// Per-lane round half away from zero: add +0.5 to positive lanes and -0.5 to
// non-positive lanes, then truncate toward zero with the float->int convert.
int32x4_t vrnd_away_zero(float32x4_t r) {
  float32x4_t plus = vdupq_n_f32(0.5);
  float32x4_t minus = vdupq_n_f32(-0.5);
  float32x4_t zero = vdupq_n_f32(0);
  uint32x4_t more_than_zero = vcgtq_f32(r, zero);
  // Select +0.5 where r > 0, otherwise -0.5.
  float32x4_t temp = vbslq_f32(more_than_zero, plus, minus);
  temp = vaddq_f32(r, temp);
  int32x4_t ret = vcvtq_s32_f32(temp);
  return ret;
}
// Per-lane round to nearest integer, ties to even (banker's rounding).
// Strategy: start from round-half-away-from-zero, then nudge the exact-tie
// lanes that landed on an odd integer by one step toward zero/even.
int32x4_t vrnd_to_even(float32x4_t r) {
#if 0
  // Disabled scalar reference implementation, kept as documentation of the
  // intended semantics.
  int32x4_t ret;
  float value[4];
  vst1q_f32(value, r);
  for (int i = 0; i < 4; ++i) {
    float v = round(value[i]);
    int32_t q = (int32_t)v;
    if (abs(abs(v - value[i]) - 0.5) > 0) {
      ret[i] = q;
    } else {
      if (abs(q) % 2 == 0) {
        ret[i] = q;
      } else {
        ret[i] = q + ((q > 0) ? -1 : 1);
      }
    }
  }
  return ret;
#else
  float32x4_t point5 = vdupq_n_f32(0.5);
  int32x4_t one = vdupq_n_s32(1);
  int32x4_t zero = vdupq_n_s32(0);
  // Provisional result: round half away from zero.
  int32x4_t rnd = vrnd_away_zero(r);
  float32x4_t frnd = vcvtq_f32_s32(rnd);
  frnd = vsubq_f32(frnd, r);
  frnd = vabsq_f32(frnd);
  // equal_point5: all-ones in lanes where |rnd - r| == 0.5 (an exact tie).
  uint32x4_t equal_point5 = vceqq_f32(frnd, point5);
  int32x4_t abs_rnd = vabsq_s32(rnd);
  abs_rnd = vandq_s32(abs_rnd, one);
  // not_mod2: 1 in lanes where the provisional result is odd, else 0.
  uint32x4_t not_mod2 = vreinterpretq_u32_s32(abs_rnd);
  // mask: 1 in lanes that are both a tie and odd, i.e. need a correction.
  uint32x4_t mask = vandq_u32(equal_point5, not_mod2);
  uint32x4_t more_than_zero = vcgtq_s32(rnd, zero);
  more_than_zero = vandq_u32(more_than_zero, vreinterpretq_u32_s32(one));
  // Fold the sign into the correction so that after `mask - 1` below:
  //   positive lane needing fix  -> -1 (step down toward even)
  //   negative lane needing fix  -> +1 (step up toward even)
  //   lanes needing no fix       ->  0
  mask = veorq_u32(more_than_zero, mask);
  more_than_zero = veorq_u32(more_than_zero, vreinterpretq_u32_s32(one));
  mask = vaddq_u32(more_than_zero, mask);
  int32x4_t smask = vreinterpretq_s32_u32(mask);
  smask = vsubq_s32(smask, one);
  rnd = vaddq_s32(rnd, smask);
  return rnd;
#endif
}
#endif
namespace paddle_mobile {
namespace operators {
// Return max(|x|) over all elements of `input`; used to derive the dynamic
// quantization scale.
static float find_abs_max(const Tensor *input) {
  float max_abs = 0.f;
  const float *x = input->data<const float>();
  size_t size = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Vector path: 16 elements per iteration; tail falls to the scalar loop.
  size_t loop = size >> 4;
  size_t remain = size & 0xF;
  for (size_t i = 0; i < loop; ++i) {
    float32x4_t max;
    float32x4_t r0 = vld1q_f32(x);
    float32x4_t r1 = vld1q_f32(x + 4);
    float32x4_t r2 = vld1q_f32(x + 8);
    float32x4_t r3 = vld1q_f32(x + 12);
    r0 = vabsq_f32(r0);
    r1 = vabsq_f32(r1);
    r2 = vabsq_f32(r2);
    r3 = vabsq_f32(r3);
    // Gather the horizontal max of each register into one vector, then
    // reduce that vector again. Lane writes like max[0] rely on GCC/Clang
    // vector-extension subscripting.
    max[0] = vmaxvq_f32(r0);
    max[1] = vmaxvq_f32(r1);
    max[2] = vmaxvq_f32(r2);
    max[3] = vmaxvq_f32(r3);
    max[0] = vmaxvq_f32(max);
    if (max[0] > max_abs) {
      max_abs = max[0];
    }
    x += 16;
  }
  size = remain;
#endif
  for (size_t i = 0; i < size; ++i) {
    float value = std::abs(x[i]);
    if (value > max_abs) {
      max_abs = value;
    }
  }
  return max_abs;
}
// Quantize float `input` into int8 `output` with round-half-to-even:
//   y[i] = rint(x[i] * scale), ties broken toward the even integer.
// FIX: the scalar tail used unqualified abs() on float expressions; on
// toolchains where ::abs binds to the C int abs(int), abs(abs(q - value) -
// 0.5) truncated to an integer, classified every element as a tie and
// wrongly nudged odd results by +/-1. std::abs overloads are now used
// explicitly so the float overload is always selected.
static void quantize_round_to_even(const Tensor *input, const float scale,
                                   Tensor *output) {
  const float *x = input->data<const float>();
  int8_t *y = output->mutable_data<int8_t>();
  size_t size = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Vector path: 16 elements per iteration; tail falls to the scalar loop.
  size_t loop = size >> 4;
  size_t remain = size & 0xF;
  for (size_t i = 0; i < loop; ++i) {
    float32x4_t r0 = vld1q_f32(x);
    float32x4_t r1 = vld1q_f32(x + 4);
    float32x4_t r2 = vld1q_f32(x + 8);
    float32x4_t r3 = vld1q_f32(x + 12);
    r0 = vmulq_n_f32(r0, scale);
    r1 = vmulq_n_f32(r1, scale);
    r2 = vmulq_n_f32(r2, scale);
    r3 = vmulq_n_f32(r3, scale);
    int32x4_t q0 = vrnd_to_even(r0);
    int32x4_t q1 = vrnd_to_even(r1);
    int32x4_t q2 = vrnd_to_even(r2);
    int32x4_t q3 = vrnd_to_even(r3);
    // Narrow 32 -> 16 -> 8 bits for the int8 store.
    int16x4_t d0 = vmovn_s32(q0);
    int16x4_t d1 = vmovn_s32(q1);
    int16x4_t d2 = vmovn_s32(q2);
    int16x4_t d3 = vmovn_s32(q3);
    int16x8_t q5 = vcombine_s16(d0, d1);
    int16x8_t q6 = vcombine_s16(d2, d3);
    int8x8_t d5 = vmovn_s16(q5);
    int8x8_t d6 = vmovn_s16(q6);
    vst1_s8(y, d5);
    vst1_s8(y + 8, d6);
    x += 16;
    y += 16;
  }
  size = remain;
#endif
  for (size_t i = 0; i < size; ++i) {
    float value = x[i] * scale;
    float v = round(value);  // round() rounds halves away from zero
    int32_t q = static_cast<int32_t>(v);
    // Not an exact .5 tie: keep the round() result.
    if (std::abs(std::abs(q - value) - 0.5f) > 0.f) {
      y[i] = q;
    } else {
      // Exact tie: keep even results, pull odd results one step toward zero.
      if (std::abs(q) % 2 == 0) {
        y[i] = q;
      } else {
        y[i] = q + ((q > 0) ? -1 : 1);
      }
    }
  }
}
// Quantize float `input` into int8 `output` with round toward zero:
//   y[i] = trunc(x[i] * scale).
// FIX: the guard was `#ifdef defined(__ARM_NEON__) || defined(__ARM_NEON)`;
// #ifdef takes a single macro name, so the extra tokens made the directive
// malformed and the NEON fast path was never compiled. Use #if defined(...)
// as in the rest of this file.
static void quantize_round_to_zero(const Tensor *input, const float scale,
                                   Tensor *output) {
  const float *x = input->data<const float>();
  int8_t *y = output->mutable_data<int8_t>();
  size_t size = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Vector path: 16 elements per iteration; tail falls to the scalar loop.
  size_t loop = size >> 4;
  size_t remain = size & 0xF;
  for (size_t i = 0; i < loop; ++i) {
    float32x4_t r0 = vld1q_f32(x);
    float32x4_t r1 = vld1q_f32(x + 4);
    float32x4_t r2 = vld1q_f32(x + 8);
    float32x4_t r3 = vld1q_f32(x + 12);
    r0 = vmulq_n_f32(r0, scale);
    r1 = vmulq_n_f32(r1, scale);
    r2 = vmulq_n_f32(r2, scale);
    r3 = vmulq_n_f32(r3, scale);
    int32x4_t q0 = vrnd_towards_zero(r0);
    int32x4_t q1 = vrnd_towards_zero(r1);
    int32x4_t q2 = vrnd_towards_zero(r2);
    int32x4_t q3 = vrnd_towards_zero(r3);
    // Narrow 32 -> 16 -> 8 bits for the int8 store.
    int16x4_t d0 = vmovn_s32(q0);
    int16x4_t d1 = vmovn_s32(q1);
    int16x4_t d2 = vmovn_s32(q2);
    int16x4_t d3 = vmovn_s32(q3);
    int16x8_t q5 = vcombine_s16(d0, d1);
    int16x8_t q6 = vcombine_s16(d2, d3);
    int8x8_t d5 = vmovn_s16(q5);
    int8x8_t d6 = vmovn_s16(q6);
    vst1_s8(y, d5);
    vst1_s8(y + 8, d6);
    x += 16;
    y += 16;
  }
  size = remain;
#endif
  for (size_t i = 0; i < size; ++i) {
    y[i] = trunc(x[i] * scale);
  }
}
// Quantize float `input` into int8 `output` with round-half-away-from-zero:
//   y[i] = round(x[i] * scale).
// FIX: same malformed `#ifdef defined(...)` guard as quantize_round_to_zero;
// replaced with `#if defined(...)` so the NEON fast path actually compiles.
static void quantize_round_to_nearest(const Tensor *input, const float scale,
                                      Tensor *output) {
  const float *x = input->data<const float>();
  int8_t *y = output->mutable_data<int8_t>();
  size_t size = input->numel();
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
  // Vector path: 16 elements per iteration; tail falls to the scalar loop.
  size_t loop = size >> 4;
  size_t remain = size & 0xF;
  for (size_t i = 0; i < loop; ++i) {
    float32x4_t r0 = vld1q_f32(x);
    float32x4_t r1 = vld1q_f32(x + 4);
    float32x4_t r2 = vld1q_f32(x + 8);
    float32x4_t r3 = vld1q_f32(x + 12);
    r0 = vmulq_n_f32(r0, scale);
    r1 = vmulq_n_f32(r1, scale);
    r2 = vmulq_n_f32(r2, scale);
    r3 = vmulq_n_f32(r3, scale);
    int32x4_t q0 = vrnd_away_zero(r0);
    int32x4_t q1 = vrnd_away_zero(r1);
    int32x4_t q2 = vrnd_away_zero(r2);
    int32x4_t q3 = vrnd_away_zero(r3);
    // Narrow 32 -> 16 -> 8 bits for the int8 store.
    int16x4_t d0 = vmovn_s32(q0);
    int16x4_t d1 = vmovn_s32(q1);
    int16x4_t d2 = vmovn_s32(q2);
    int16x4_t d3 = vmovn_s32(q3);
    int16x8_t q5 = vcombine_s16(d0, d1);
    int16x8_t q6 = vcombine_s16(d2, d3);
    int8x8_t d5 = vmovn_s16(q5);
    int8x8_t d6 = vmovn_s16(q6);
    vst1_s8(y, d5);
    vst1_s8(y + 8, d6);
    x += 16;
    y += 16;
  }
  size = remain;
#endif
  for (size_t i = 0; i < size; ++i) {
    y[i] = round(x[i] * scale);
  }
}
// One-time setup for the CPU float quantize kernel. No precomputation or
// parameter validation is needed, so initialization always succeeds.
template <>
bool QuantizeKernel<CPU, float>::Init(QuantizeParam<CPU> *param) {
  return true;
}
// Quantize the float input tensor to int8 and record the scale used.
// The scale is either a statically calibrated value (param.is_static_) or
// derived online from the tensor's absolute maximum, then the configured
// rounding mode is applied.
// FIX: the ROUND_NEAREST_AWAY_ZERO case was missing its `break`, so a valid
// round type fell through into `default` and logged "round type is not
// supported." Also, the local `output_scale` was declared but never used;
// it is now used for the scale store instead of re-reading the param.
template <>
void QuantizeKernel<CPU, float>::Compute(
    const QuantizeParam<CPU> &param) const {
  float max_abs = 0.f;
  const Tensor *input = param.input_;
  Tensor *output = param.out_;
  Tensor *output_scale = param.online_scale_;
  if (param.is_static_) {
    // Static quantization: scale was calibrated ahead of time.
    max_abs = param.static_scale_;
  } else {
    // Dynamic quantization: derive the scale from this tensor's range.
    max_abs = find_abs_max(input);
  }
  // Guard against division by zero for all-zero tensors.
  max_abs = std::max(max_abs, 1e-6f);
  // only support int8 currently
  float online_scale = 127 / max_abs;
  output_scale->mutable_data<float>()[0] = online_scale;
  switch (param.round_type_) {
    case ROUND_NEAREST_TO_EVEN:
      quantize_round_to_even(input, online_scale, output);
      break;
    case ROUND_NEAREST_TOWARDS_ZERO:
      quantize_round_to_zero(input, online_scale, output);
      break;
    case ROUND_NEAREST_AWAY_ZERO:
      quantize_round_to_nearest(input, online_scale, output);
      break;
    default:
      LOG(kLOG_ERROR) << "round type is not supported.";
      break;
  }
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#ifdef SHAPE_OP #ifdef SHAPE_OP
......
...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS, ...@@ -11,7 +11,6 @@ distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#pragma once
#ifdef SPLIT_OP #ifdef SPLIT_OP
......
...@@ -12,18 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,18 +12,17 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_TRANSPOSE #pragma once
#include <vector> #ifdef CONV_TRANSPOSE_OP
#include <vector>
#include "framework/ddim.h" #include "framework/ddim.h"
#include "operators/math/im2col.h" #include "operators/math/im2col.h"
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include "operators/math/vol2col.h" #include "operators/math/vol2col.h"
#include "operators/op_param.h" #include "operators/op_param.h"
#pragma once
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -24,7 +24,9 @@ limitations under the License. */ ...@@ -24,7 +24,9 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using framework::DDim; using framework::DDim;
void sigmoid(const Tensor *X, Tensor *Y) { void sigmoid(const Tensor *X, Tensor *Y) {
#ifdef __ARM_NEON #ifdef __ARM_NEON
const float *input = X->data<float>(); const float *input = X->data<float>();
......
...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef CONV_TRANSPOSE #ifdef CONV_TRANSPOSE_OP
#pragma once #pragma once
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Kernel declaration for the dequantize op. Device-specific specializations
// (e.g. the CPU float one) provide the implementations.
template <typename DeviceType, typename T>
class DequantizeKernel
    : public framework::OpKernelBase<DeviceType, DequantizeParam<DeviceType>> {
 public:
  // Runs dequantization for one op invocation, as described by `param`.
  void Compute(const DequantizeParam<DeviceType> &param) const;
  // One-time kernel setup; returns true on success.
  bool Init(DequantizeParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
...@@ -24,10 +24,12 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) { ...@@ -24,10 +24,12 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
auto inputs = param->Inputs(); auto inputs = param->Inputs();
auto out = param->Out(); auto out = param->Out();
auto image_num = inputs.size(); auto image_num = inputs.size();
auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(int *)); auto images_in =
auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *)); (half **)fpga::fpga_malloc(image_num * sizeof(int *)); // NOLINT
auto scales_in =
(float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT
auto channel_num = auto channel_num =
(uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); // NOLINT
auto height = inputs[0]->dims()[2]; auto height = inputs[0]->dims()[2];
auto width = inputs[0]->dims()[3]; auto width = inputs[0]->dims()[3];
...@@ -36,22 +38,21 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) { ...@@ -36,22 +38,21 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE( PADDLE_MOBILE_ENFORCE(
input->dims()[2] == height && input->dims()[3] == width, input->dims()[2] == height && input->dims()[3] == width,
"Image height & width should be unified"); "Image height & width should be unified");
images_in[i] = (half *)input->data<float>(); images_in[i] = (half *)input->data<float>(); // NOLINT
channel_num[i] = (uint32_t)inputs[i]->dims()[1]; channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT
scales_in[i] = input->scale; scales_in[i] = input->scale;
} }
fpga::format_concat_output(out, (int)height, (int)width, (int)image_num, fpga::format_concat_output(out, height, width, image_num, channel_num);
channel_num);
fpga::ConcatArgs concatArgs = {0}; fpga::ConcatArgs concatArgs = {0};
concatArgs.image_num = (uint32_t)image_num; concatArgs.image_num = image_num;
concatArgs.images_in = images_in; concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in; concatArgs.scales_in = scales_in;
concatArgs.image_out = (half *)out->data<float>(); concatArgs.image_out = (half *)out->data<float>(); // NOLINT
concatArgs.scale_out = out->scale; concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num; concatArgs.channel_num = channel_num;
concatArgs.height = (uint32_t)height; concatArgs.height = height;
concatArgs.width = (uint32_t)width; concatArgs.width = width;
param->SetFpgaArgs(concatArgs); param->SetFpgaArgs(concatArgs);
return true; return true;
} }
......
...@@ -38,7 +38,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init( ...@@ -38,7 +38,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
const int channel = out->dims()[1]; const int channel = out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
auto new_scale = new Tensor(); auto new_scale = new Tensor();
auto new_bias = new Tensor(); auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel}); auto new_scale_ptr = new_scale->mutable_data<float>({channel});
......
...@@ -31,7 +31,8 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) { ...@@ -31,7 +31,8 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
int channel = out->dims()[1]; int channel = out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) { for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1; bs_ptr[i + channel] = 1;
bs_ptr[i] = bias_ptr[i]; bs_ptr[i] = bias_ptr[i];
......
...@@ -33,7 +33,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) { ...@@ -33,7 +33,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
const int channel = out->dims()[1]; const int channel = out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // // NOLINT
auto new_scale = new Tensor(); auto new_scale = new Tensor();
auto new_bias = new Tensor(); auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel}); auto new_scale_ptr = new_scale->mutable_data<float>({channel});
......
...@@ -33,7 +33,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) { ...@@ -33,7 +33,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
"Output channel should be equal to bias number"); "Output channel should be equal to bias number");
const int channel = out->dims()[1]; const int channel = out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
auto new_scale = new Tensor(); auto new_scale = new Tensor();
auto new_bias = new Tensor(); auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel}); auto new_scale_ptr = new_scale->mutable_data<float>({channel});
......
...@@ -28,7 +28,8 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) { ...@@ -28,7 +28,8 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number"); "Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1]; int channel = (uint32_t)out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) { for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1; bs_ptr[i + channel] = 1;
bs_ptr[i] = input_z_ptr[i]; bs_ptr[i] = input_z_ptr[i];
...@@ -61,7 +62,7 @@ template <> ...@@ -61,7 +62,7 @@ template <>
void FusionFcReluKernel<FPGA, float>::Compute( void FusionFcReluKernel<FPGA, float>::Compute(
const FusionFcReluParam<FPGA> &param) const { const FusionFcReluParam<FPGA> &param) const {
fpga::ComputeFpgaConv(param.FpgaArgs()); fpga::ComputeFpgaConv(param.FpgaArgs());
}; }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -30,7 +30,8 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { ...@@ -30,7 +30,8 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number"); "Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1]; int channel = (uint32_t)out->dims()[1];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) { for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1; bs_ptr[i + channel] = 1;
bs_ptr[i] = input_z_ptr[i]; bs_ptr[i] = input_z_ptr[i];
......
...@@ -54,8 +54,8 @@ void SoftmaxKernel<FPGA, float>::Compute( ...@@ -54,8 +54,8 @@ void SoftmaxKernel<FPGA, float>::Compute(
fpga::PerformBypass(param.FpgaArgs()); fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate( fpga::fpga_invalidate(
(void *)in_x->data<float>(), (void *)in_x->data<float>(), // NOLINT
(size_t)fpga::get_align_image_cw((int)in_x->dims()[1]) * sizeof(float)); fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x, out); math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size()); fpga::fpga_flush(out->data<float>(), out->memory_size());
......
...@@ -30,6 +30,7 @@ class FusionFcKernel ...@@ -30,6 +30,7 @@ class FusionFcKernel
void Compute(const FusionFcParam<DeviceType>& param) const; void Compute(const FusionFcParam<DeviceType>& param) const;
bool Init(FusionFcParam<DeviceType>* param); bool Init(FusionFcParam<DeviceType>* param);
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include "framework/operator.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Kernel declaration for the quantize op. Device-specific specializations
// (e.g. the CPU float one) provide the implementations.
template <typename DeviceType, typename T>
class QuantizeKernel
    : public framework::OpKernelBase<DeviceType, QuantizeParam<DeviceType>> {
 public:
  // Runs quantization for one op invocation, as described by `param`.
  void Compute(const QuantizeParam<DeviceType> &param) const;
  // One-time kernel setup; returns true on success.
  bool Init(QuantizeParam<DeviceType> *param);
};
} // namespace operators
} // namespace paddle_mobile
...@@ -23,6 +23,7 @@ limitations under the License. */ ...@@ -23,6 +23,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename DeviceType> template <typename DeviceType>
inline framework::DDim CalOutputShape(const ResizeParam<DeviceType> &param) { inline framework::DDim CalOutputShape(const ResizeParam<DeviceType> &param) {
const auto *input_x = param.InputX(); const auto *input_x = param.InputX();
......
...@@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,16 +12,18 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#ifdef SIGMOID_OP
#pragma once #pragma once
#ifdef SIGMOID_OP
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
using framework::OpKernelBase; using framework::OpKernelBase;
void sigmoid(const Tensor* X, Tensor* Y);
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class SigmoidKernel class SigmoidKernel
: public OpKernelBase<DeviceType, SigmoidParam<DeviceType>> { : public OpKernelBase<DeviceType, SigmoidParam<DeviceType>> {
...@@ -29,6 +31,7 @@ class SigmoidKernel ...@@ -29,6 +31,7 @@ class SigmoidKernel
void Compute(const SigmoidParam<DeviceType>& param) const override; void Compute(const SigmoidParam<DeviceType>& param) const override;
bool Init(SigmoidParam<DeviceType>* param); bool Init(SigmoidParam<DeviceType>* param);
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
...@@ -47,12 +47,4 @@ class LookupOp : public framework::OperatorWithKernel< ...@@ -47,12 +47,4 @@ class LookupOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(lookup_table);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -35,7 +35,5 @@ REGISTER_OPERATOR_CPU(lrn, ops::LrnOp); ...@@ -35,7 +35,5 @@ REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp); REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -47,13 +47,4 @@ class LrnOp : public framework::OperatorWithKernel< ...@@ -47,13 +47,4 @@ class LrnOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(lrn);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(lrn);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -1465,7 +1465,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, ...@@ -1465,7 +1465,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
Tensor *output, const Tensor *new_scale, Tensor *output, const Tensor *new_scale,
const Tensor *new_bias, bool if_relu) { const Tensor *new_bias, bool if_relu) {
#if __ARM_NEON #if __ARM_NEON
//#ifdef _OPENMP // #ifdef _OPENMP
// const float *newscale_data = new_scale->data<float>(); // const float *newscale_data = new_scale->data<float>();
// const float *newbias_data = new_bias->data<float>(); // const float *newbias_data = new_bias->data<float>();
// //
...@@ -1645,7 +1645,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, ...@@ -1645,7 +1645,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
// } // }
// } // }
// //
//#else // #else
const float *input_data = input->data<float>(); const float *input_data = input->data<float>();
const float *filter_data = filter->data<float>(); const float *filter_data = filter->data<float>();
...@@ -1877,7 +1877,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter, ...@@ -1877,7 +1877,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const Tensor *input, const Tensor *filter,
input_data += inhxw * c; input_data += inhxw * c;
output_data += outhxw * c; output_data += outhxw * c;
} }
//#endif // #endif
#endif #endif
} }
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "operators/math/math_function.h" #include "operators/math/math_function.h"
#include <cstring> #include <cstring>
#include <string>
#include "operators/math/gemm.h" #include "operators/math/gemm.h"
namespace paddle_mobile { namespace paddle_mobile {
...@@ -40,7 +41,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a, ...@@ -40,7 +41,7 @@ void matmul<float>(const framework::Tensor &matrix_a, bool trans_a,
int numel = matrix_a.numel(); int numel = matrix_a.numel();
int m = matrix_a.dims()[0]; int m = matrix_a.dims()[0];
int n = matrix_a.dims()[1]; int n = matrix_a.dims()[1];
float *tmp = (float *)(matrix_a.data<float>()); float *tmp = (float *)(matrix_a.data<float>()); // NOLINT
float *a = static_cast<float *>( float *a = static_cast<float *>(
paddle_mobile::memory::Alloc(sizeof(float) * numel)); paddle_mobile::memory::Alloc(sizeof(float) * numel));
int index = 0; int index = 0;
...@@ -126,7 +127,7 @@ struct ClearTensor<CPU, T> { ...@@ -126,7 +127,7 @@ struct ClearTensor<CPU, T> {
void operator()(framework::Tensor *tensor) { void operator()(framework::Tensor *tensor) {
auto size = tensor->numel(); auto size = tensor->numel();
auto *tensor_data = tensor->data<float>(); auto *tensor_data = tensor->data<float>();
memset((void *)tensor_data, 0, sizeof(T) * size); memset((void *)tensor_data, 0, sizeof(T) * size); // NOLINT
} }
}; };
......
...@@ -225,7 +225,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) { ...@@ -225,7 +225,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
// //
// const float coef = 1.0 / 9.0; // const float coef = 1.0 / 9.0;
// for (int k = 0; k < batch_size; ++k) { // for (int k = 0; k < batch_size; ++k) {
//#pragma omp parallel for // #pragma omp parallel for
// for (int c = 0; c < output_channels; ++c) { // for (int c = 0; c < output_channels; ++c) {
// const float *input_seg = input_data + c * inputdata_channel_stride; // const float *input_seg = input_data + c * inputdata_channel_stride;
// float *output_seg = out_data + c * outputdata_channel_stride; // float *output_seg = out_data + c * outputdata_channel_stride;
......
...@@ -61,7 +61,5 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp); ...@@ -61,7 +61,5 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp); REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -47,13 +47,4 @@ class MulOp : public framework::OperatorWithKernel< ...@@ -47,13 +47,4 @@ class MulOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(mul);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(mul);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -42,9 +42,5 @@ namespace ops = paddle_mobile::operators; ...@@ -42,9 +42,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp); REGISTER_OPERATOR_CPU(multiclass_nms, ops::MultiClassNMSOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -52,12 +52,4 @@ class MultiClassNMSOp : public framework::OperatorWithKernel< ...@@ -52,12 +52,4 @@ class MultiClassNMSOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(multiclass_nms);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -40,30 +40,6 @@ using std::vector; ...@@ -40,30 +40,6 @@ using std::vector;
template <typename Dtype> template <typename Dtype>
struct DtypeTensorTrait { struct DtypeTensorTrait {
typedef void ptype;
typedef void rtype;
};
template <>
struct DtypeTensorTrait<CPU> {
// This is the type we obtained in variable.
typedef framework::LoDTensor gtype;
// This type will be the parent class type
// or the same type.
typedef framework::Tensor rtype;
};
template <>
struct DtypeTensorTrait<FPGA> {
// This is the type we obtained in variable.
typedef framework::LoDTensor gtype;
// This type will be the parent class type
// or the same type.
typedef framework::Tensor rtype;
};
template <>
struct DtypeTensorTrait<GPU_MALI> {
// This is the type we obtained in variable. // This is the type we obtained in variable.
typedef framework::LoDTensor gtype; typedef framework::LoDTensor gtype;
// This type will be the parent class type // This type will be the parent class type
...@@ -1935,7 +1911,7 @@ class DropoutParam : public OpParam { ...@@ -1935,7 +1911,7 @@ class DropoutParam : public OpParam {
}; };
#endif #endif
#ifdef CONV_TRANSPOSE #ifdef CONV_TRANSPOSE_OP
template <typename Dtype> template <typename Dtype>
class ConvTransposeParam : public OpParam { class ConvTransposeParam : public OpParam {
typedef typename DtypeTensorTrait<Dtype>::gtype GType; typedef typename DtypeTensorTrait<Dtype>::gtype GType;
...@@ -2151,5 +2127,75 @@ class ShapeParam : public OpParam { ...@@ -2151,5 +2127,75 @@ class ShapeParam : public OpParam {
}; };
#endif #endif
// Parameter pack for the quantize op: binds the input/output variables from
// the scope and reads the quantization attributes.
template <typename Dtype>
class QuantizeParam : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
 public:
  QuantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
                const AttributeMap &attrs, const Scope &scope) {
    input_ = InputXFrom<GType>(inputs, scope);
    out_ = OutFrom<GType>(outputs, scope);
    // "is_static" selects offline (static) vs online quantization;
    // defaults to online when the attribute is absent.
    if (HasAttr("is_static", attrs)) {
      is_static_ = GetAttr<bool>("is_static", attrs);
    }
    // online mode: the kernel writes the runtime scale into this output
    // scale = max(abs(x))
    online_scale_ = GetVarValue<GType>("OutScale", outputs, scope);
    // offline mode: a precomputed scale supplied as an attribute
    if (HasAttr("static_scale", attrs)) {
      static_scale_ = GetAttr<float>("static_scale", attrs);
    }
    // rounding policy used for x = round(scale * x)
    if (HasAttr("round_type", attrs)) {
      round_type_ = GetAttr<RoundType>("round_type", attrs);
    }
  }
 public:
  // op input
  RType *input_;
  // op output
  RType *out_;
  // output variable receiving the online-computed scale
  RType *online_scale_;
  // true -> use static_scale_; false -> compute the scale online
  bool is_static_ = false;
  // quantize scale used in static mode
  float static_scale_ = 1.0f;
  // round method type
  // nearest_zero and nearest_even are valid currently
  RoundType round_type_ = ROUND_NEAREST_TO_EVEN;
};
// Parameter pack for the dequantize op: binds input/output variables plus
// the online activation scale, and reads the static weight scale attribute.
template <typename Dtype>
class DequantizeParam : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
  typedef typename DtypeTensorTrait<Dtype>::rtype RType;
 public:
  DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs,
                  const AttributeMap &attrs, const Scope &scope) {
    input_ = InputXFrom<GType>(inputs, scope);
    out_ = OutFrom<GType>(outputs, scope);
    // per-run (online) scale produced by the matching quantize op
    activation_scale_ = GetVarValue<GType>("Scale", inputs, scope);
    // dequantization is performed as x = x / static_scale / online_scale
    if (HasAttr("weight_scale", attrs)) {
      weight_scale_ = GetAttr<float>("weight_scale", attrs);
    } else {
      // NOTE(review): falls back to "max_range"; if neither attribute is
      // present, GetAttr is invoked on a missing key — confirm that the
      // model converter always sets one of the two.
      weight_scale_ = GetAttr<float>("max_range", attrs);
    }
  }
 public:
  // op input
  RType *input_;
  // op output
  RType *out_;
  // online scale tensor read from the "Scale" input
  RType *activation_scale_;
  // static (weight) scale divisor
  float weight_scale_;
};
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -48,14 +48,4 @@ class PoolOp : public OperatorWithKernel<DeviceType, PoolParam<DeviceType>, ...@@ -48,14 +48,4 @@ class PoolOp : public OperatorWithKernel<DeviceType, PoolParam<DeviceType>,
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(pool2d);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(pool2d);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(pool2d);
#endif
#endif #endif
...@@ -39,7 +39,5 @@ REGISTER_OPERATOR_CPU(prelu, ops::PReluOp); ...@@ -39,7 +39,5 @@ REGISTER_OPERATOR_CPU(prelu, ops::PReluOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp); REGISTER_OPERATOR_MALI_GPU(prelu, ops::PReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -50,14 +50,4 @@ class PReluOp : public framework::OperatorWithKernel< ...@@ -50,14 +50,4 @@ class PReluOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(prelu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(prelu);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(prelu);
#endif
#endif #endif
...@@ -54,7 +54,5 @@ REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp); ...@@ -54,7 +54,5 @@ REGISTER_OPERATOR_CPU(prior_box, ops::PriorBoxOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -51,12 +51,4 @@ class PriorBoxOp : public framework::OperatorWithKernel< ...@@ -51,12 +51,4 @@ class PriorBoxOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(prior_box);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/quantize_op.h"
#include <vector>
namespace paddle_mobile {
namespace operators {
// Shape inference for the quantize op: the quantized output keeps the
// input's shape, while the online scale output is a single value.
template <typename DeviceType, typename T>
void QuantizeOp<DeviceType, T>::InferShape() const {
  const auto& input_dims = this->param_.input_->dims();
  this->param_.out_->Resize(input_dims);
  // one scale for the whole tensor, hence shape {1}
  auto scale_dims = framework::make_ddim(std::vector<int>{1});
  this->param_.online_scale_->Resize(scale_dims);
}
}  // namespace operators
}  // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
// Registered for the CPU backend only; no GPU/FPGA kernel is provided here.
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(quantize, ops::QuantizeOp);
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/quantize_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
// Operator wrapper for the quantize op. Ties QuantizeParam (parameter
// extraction) and QuantizeKernel (per-backend computation) together via
// OperatorWithKernel; the constructor simply forwards to the base class.
template <typename DeviceType, typename T>
class QuantizeOp : public framework::OperatorWithKernel<
                       DeviceType, QuantizeParam<DeviceType>,
                       operators::QuantizeKernel<DeviceType, T>> {
 public:
  QuantizeOp(const std::string &type, const VariableNameMap &inputs,
             const VariableNameMap &outputs,
             const framework::AttributeMap &attrs,
             std::shared_ptr<framework::Scope> scope)
      : framework::OperatorWithKernel<DeviceType, QuantizeParam<DeviceType>,
                                      operators::QuantizeKernel<DeviceType, T>>(
            type, inputs, outputs, attrs, scope) {}
  // infer the output shape from the input shape (defined in quantize_op.cpp)
  void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
...@@ -39,7 +39,5 @@ REGISTER_OPERATOR_CPU(relu, ops::ReluOp); ...@@ -39,7 +39,5 @@ REGISTER_OPERATOR_CPU(relu, ops::ReluOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -53,13 +53,4 @@ class ReluOp : public framework::OperatorWithKernel< ...@@ -53,13 +53,4 @@ class ReluOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(relu);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(relu);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -38,7 +38,5 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp); ...@@ -38,7 +38,5 @@ REGISTER_OPERATOR_CPU(reshape, ops::ReshapeOp);
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp); REGISTER_OPERATOR_MALI_GPU(reshape, ops::ReshapeOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -51,14 +51,4 @@ class ReshapeOp : public framework::OperatorWithKernel< ...@@ -51,14 +51,4 @@ class ReshapeOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(reshape);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(reshape);
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -30,14 +30,10 @@ void ResizeOp<Dtype, T>::InferShape() const { ...@@ -30,14 +30,10 @@ void ResizeOp<Dtype, T>::InferShape() const {
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(resize);
REGISTER_OPERATOR_CPU(resize, ops::ResizeOp); REGISTER_OPERATOR_CPU(resize, ops::ResizeOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(resize);
REGISTER_OPERATOR_MALI_GPU(resize, ops::ResizeOp); REGISTER_OPERATOR_MALI_GPU(resize, ops::ResizeOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -30,14 +30,10 @@ void ScaleOp<Dtype, T>::InferShape() const { ...@@ -30,14 +30,10 @@ void ScaleOp<Dtype, T>::InferShape() const {
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(scale);
REGISTER_OPERATOR_CPU(scale, ops::ScaleOp); REGISTER_OPERATOR_CPU(scale, ops::ScaleOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(scale);
REGISTER_OPERATOR_MALI_GPU(scale, ops::ScaleOp); REGISTER_OPERATOR_MALI_GPU(scale, ops::ScaleOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -36,7 +36,5 @@ REGISTER_OPERATOR_CPU(shape, ops::ShapeOp); ...@@ -36,7 +36,5 @@ REGISTER_OPERATOR_CPU(shape, ops::ShapeOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -48,12 +48,4 @@ class ShapeOp : public framework::OperatorWithKernel< ...@@ -48,12 +48,4 @@ class ShapeOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(shape);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -18,6 +18,7 @@ limitations under the License. */ ...@@ -18,6 +18,7 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
void SigmoidOp<DeviceType, T>::InferShape() const { void SigmoidOp<DeviceType, T>::InferShape() const {
this->param_.Out()->Resize(this->param_.InputX()->dims()); this->param_.Out()->Resize(this->param_.InputX()->dims());
...@@ -30,9 +31,5 @@ namespace ops = paddle_mobile::operators; ...@@ -30,9 +31,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp); REGISTER_OPERATOR_CPU(sigmoid, ops::SigmoidOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -17,13 +17,13 @@ limitations under the License. */ ...@@ -17,13 +17,13 @@ limitations under the License. */
#pragma once #pragma once
#include <string> #include <string>
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/kernel/sigmoid_kernel.h" #include "operators/kernel/sigmoid_kernel.h"
#include "operators/op_param.h" #include "operators/op_param.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
template <typename DeviceType, typename T> template <typename DeviceType, typename T>
class SigmoidOp : public framework::OperatorWithKernel< class SigmoidOp : public framework::OperatorWithKernel<
DeviceType, SigmoidParam<DeviceType>, DeviceType, SigmoidParam<DeviceType>,
...@@ -43,15 +43,8 @@ class SigmoidOp : public framework::OperatorWithKernel< ...@@ -43,15 +43,8 @@ class SigmoidOp : public framework::OperatorWithKernel<
void InferShape() const override; void InferShape() const override;
}; };
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(sigmoid);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -29,14 +29,10 @@ void SliceOp<Dtype, T>::InferShape() const { ...@@ -29,14 +29,10 @@ void SliceOp<Dtype, T>::InferShape() const {
namespace ops = paddle_mobile::operators; namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(slice);
REGISTER_OPERATOR_CPU(slice, ops::SliceOp); REGISTER_OPERATOR_CPU(slice, ops::SliceOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(slice);
REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp); REGISTER_OPERATOR_MALI_GPU(slice, ops::SliceOp);
#endif #endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -48,14 +48,4 @@ class SoftmaxOp : public framework::OperatorWithKernel< ...@@ -48,14 +48,4 @@ class SoftmaxOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(softmax);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
USE_OP_MALI_GPU(softmax);
#endif
#ifdef PADDLE_MOBILE_FPGA
USE_OP_FPGA(softmax);
#endif
#endif #endif
...@@ -83,9 +83,5 @@ namespace ops = paddle_mobile::operators; ...@@ -83,9 +83,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(split, ops::SplitOp); REGISTER_OPERATOR_CPU(split, ops::SplitOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif // SPLIT_OP
...@@ -47,12 +47,4 @@ class SplitOp : public framework::OperatorWithKernel< ...@@ -47,12 +47,4 @@ class SplitOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(split);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -55,9 +55,5 @@ namespace ops = paddle_mobile::operators; ...@@ -55,9 +55,5 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp); REGISTER_OPERATOR_CPU(transpose, ops::TransposeOp);
#endif #endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif // TRANSPOSE_OP
...@@ -50,12 +50,4 @@ class TransposeOp : public framework::OperatorWithKernel< ...@@ -50,12 +50,4 @@ class TransposeOp : public framework::OperatorWithKernel<
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
#ifdef PADDLE_MOBILE_CPU
USE_OP_CPU(transpose);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
#endif
#ifdef PADDLE_MOBILE_FPGA
#endif
#endif #endif
...@@ -212,6 +212,14 @@ if (NOT FOUND_MATCH) ...@@ -212,6 +212,14 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h) ADD_EXECUTABLE(test-fc-op operators/test_fusion_fc_op.cpp test_helper.h test_include.h)
target_link_libraries(test-fc-op paddle-mobile) target_link_libraries(test-fc-op paddle-mobile)
# test quantize op: builds the reference-comparison unit test for the new
# quantize operator against the paddle-mobile library
ADD_EXECUTABLE(test-quantize-op operators/test_quantize_op.cpp test_helper.h test_include.h)
target_link_libraries(test-quantize-op paddle-mobile)
# test dequantize op: same pattern for the matching dequantize operator
ADD_EXECUTABLE(test-dequantize-op operators/test_dequantize_op.cpp test_helper.h test_include.h)
target_link_libraries(test-dequantize-op paddle-mobile)
# gen test log # gen test log
ADD_EXECUTABLE(test-log common/test_log.cpp) ADD_EXECUTABLE(test-log common/test_log.cpp)
target_link_libraries(test-log paddle-mobile) target_link_libraries(test-log paddle-mobile)
...@@ -316,6 +324,4 @@ if (NOT FOUND_MATCH) ...@@ -316,6 +324,4 @@ if (NOT FOUND_MATCH)
#add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
endif () endif ()
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/dequantize_op.h"
namespace paddle_mobile {
// Reference dequantization: out[i] = in[i] * scale, converting the int32
// quantized values back to floats.
void dequantize(const Tensor* input, const float scale, Tensor* output) {
  const int32_t* in_ptr = input->data<const int32_t>();
  float* out_ptr = output->mutable_data<float>();
  const size_t count = output->numel();
  for (size_t idx = 0; idx < count; ++idx) {
    out_ptr[idx] = static_cast<float>(in_ptr[idx]) * scale;
  }
}
// Builds a dequantize op on random int32 input, runs it, and compares the
// float output element-wise against the dequantize() reference above.
int TestDequqntizeOp() {  // NOTE(review): "Dequqntize" typo kept — main() calls it
  framework::DDim dim = framework::make_ddim({1, 3, 224, 224});
  VariableNameMap inputs;
  VariableNameMap outputs;
  auto scope = std::make_shared<framework::Scope>();
  inputs["X"] = std::vector<std::string>({"input"});
  inputs["Scale"] = std::vector<std::string>({"scale"});
  outputs["Out"] = std::vector<std::string>({"output"});
  // random int32 quantized input in [-1000, 1000]
  auto input_var = scope.get()->Var("input");
  auto input = input_var->template GetMutable<framework::LoDTensor>();
  SetupTensor<int32_t>(input, dim, -1000, 1000);
  // online (activation) scale fed in as a 1-element tensor
  auto scale_var = scope.get()->Var("scale");
  auto scale = scale_var->template GetMutable<framework::LoDTensor>();
  scale->Resize(framework::make_ddim({1}));
  scale->mutable_data<float>()[0] = 1.27;
  auto output_var = scope.get()->Var("output");
  framework::AttributeMap attrs;
  // static weight scale attribute consumed by DequantizeParam
  attrs["weight_scale"].Set<float>(1.74);
  auto* op = new operators::DequantizeOp<CPU, float>("dequantize", inputs,
                                                     outputs, attrs, scope);
  op->InferShape();
  op->Run();
  auto output = output_var->template Get<framework::LoDTensor>();
  const float* output_data = output->data<float>();
  framework::Tensor output_cmp;
  output_cmp.Resize(dim);
  // expected: x / weight_scale / online_scale == x * 1/(1.27 * 1.74)
  float dequant_scale = 1.f / (1.27 * 1.74);
  dequantize(input, dequant_scale, &output_cmp);
  const float* output_cmp_data = output_cmp.data<float>();
  // exact float equality is intentional: op and reference should perform
  // the identical multiplication
  for (int i = 0; i < output->numel(); ++i) {
    PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
                          "output[%d] = %.6f, output_cmp[%d] = %.6f", i,
                          output_data[i], i, output_cmp_data[i]);
  }
  delete op;
  return 0;
}
} // namespace paddle_mobile
// Entry point: process exit code is the test result (0 on success).
int main() { return paddle_mobile::TestDequqntizeOp(); }
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <cmath>
#include <cstdlib>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/quantize_op.h"
namespace paddle_mobile {
// static float g_test_data[50] = {
// -5.55, -5.5, -5.45, -5.0, -4.55, -4.5, -4.45, -4.0, -3.55, -3.5,
// -3.45, -3.01, -2.75, -2.5, -2.501, -2.49, -2.01, -1.75, -1.5, -1.25,
// -1.0, -0.75, -0.5, -0.25, 0.0, 0.25, 0.5, 0.75, 1.0, 1.25,
// 1.5, 1.75, 2.01, 2.49, 2.501, 2.5, 2.75, 3.01, 3.45, 3.5,
// 3.55, 4.0, 4.45, 4.5, 4.55, 5.0, 5.45, 5.5, 5.55, 6.0,
// };
// Scans the tensor and returns the largest absolute value, 0 for an empty
// tensor. Used to derive the quantization scale (127 / max_abs).
static float find_abs_max(const Tensor *input) {
  const float *data = input->data<const float>();
  const size_t count = input->numel();
  float result = 0.f;
  for (size_t i = 0; i < count; ++i) {
    const float magnitude = std::abs(data[i]);
    if (magnitude > result) {
      result = magnitude;
    }
  }
  return result;
}
// Reference int8 quantization with round-half-to-even ("banker's rounding"):
// y[i] = round_to_even(x[i] * scale), truncated into int8 by assignment.
// Fix: the original called unqualified abs()/round() on float expressions.
// Whether ::abs gains floating-point overloads from <cmath> is
// implementation-defined, so `int abs(int)` could be selected and silently
// truncate the fractional part, breaking both the tie detection and the
// rounding itself. Use the std::-qualified functions instead.
static void quantize_round_to_even(const Tensor *input, const float scale,
                                   Tensor *output) {
  const float *x = input->data<const float>();
  int8_t *y = output->mutable_data<int8_t>();
  size_t size = input->numel();
  for (size_t i = 0; i < size; ++i) {
    float value = x[i] * scale;
    // std::round rounds halfway cases away from zero; ties are corrected
    // toward the even neighbor below
    float v = std::round(value);
    int32_t q = static_cast<int32_t>(v);
    if (std::abs(std::abs(q - value) - 0.5f) > 0.f) {
      // not an exact .5 tie: keep the nearest integer
      y[i] = q;
    } else {
      // exact .5 tie: choose the even neighbor
      if (std::abs(q) % 2 == 0) {
        y[i] = q;
      } else {
        y[i] = q + ((q > 0) ? -1 : 1);
      }
    }
  }
}
// Builds a quantize op on random float input, runs it, and checks both the
// produced online scale (127 / max|x|) and the int8 output against the
// quantize_round_to_even reference above.
int TestQuqntizeOp() {  // NOTE(review): "Quqntize" typo kept — main() calls it
  framework::DDim dim = framework::make_ddim({1, 3, 224, 224});
  VariableNameMap inputs;
  VariableNameMap outputs;
  auto scope = std::make_shared<framework::Scope>();
  inputs["X"] = std::vector<std::string>({"input"});
  outputs["Out"] = std::vector<std::string>({"output"});
  outputs["OutScale"] = std::vector<std::string>({"output_scale"});
  // random float input in [-100, 100]
  auto input_var = scope.get()->Var("input");
  auto input = input_var->template GetMutable<framework::LoDTensor>();
  SetupTensor<float>(input, dim, -100.f, 100.f);
  auto output_var = scope.get()->Var("output");
  auto output_scale_var = scope.get()->Var("output_scale");
  // no attributes: exercises the default online-quantization path
  framework::AttributeMap attrs;
  auto *op = new operators::QuantizeOp<CPU, float>("quantize", inputs, outputs,
                                                   attrs, scope);
  op->InferShape();
  op->Run();
  auto output = output_var->template Get<framework::LoDTensor>();
  const int8_t *output_data = output->data<int8_t>();
  auto output_scale = output_scale_var->template Get<framework::LoDTensor>();
  const float *output_scale_data = output_scale->data<float>();
  // expected online scale maps max|x| onto the int8 range
  float max_abs = find_abs_max(input);
  float output_scale_cmp = 127 / max_abs;
  // exact float equality is intentional: op and reference should perform
  // the identical division
  PADDLE_MOBILE_ENFORCE(output_scale_cmp == output_scale_data[0],
                        "output_scale = %.6f, output_scale_cmp = %.6f",
                        output_scale_cmp, output_scale_data[0]);
  framework::Tensor output_cmp;
  output_cmp.Resize(dim);
  quantize_round_to_even(input, output_scale_cmp, &output_cmp);
  int8_t *output_cmp_data = output_cmp.data<int8_t>();
  for (int i = 0; i < output->numel(); ++i) {
    PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
                          "output[%d] = %d, output_cmp[%d] = %d", i,
                          static_cast<int>(output_data[i]), i,
                          static_cast<int>(output_cmp_data[i]));
  }
  delete op;
  return 0;
}
} // namespace paddle_mobile
// Entry point: process exit code is the test result (0 on success).
int main() { return paddle_mobile::TestQuqntizeOp(); }
...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include "../../src/operators/kernel/central-arm-func/sigmoid_arm_func.h"
#include "../../src/operators/kernel/sigmoid_kernel.h" #include "../../src/operators/kernel/sigmoid_kernel.h"
#include "../test_helper.h" #include "../test_helper.h"
#include "io/executor.h" #include "io/executor.h"
......
...@@ -356,7 +356,7 @@ if (FUSION_CONVBN_OP) ...@@ -356,7 +356,7 @@ if (FUSION_CONVBN_OP)
endif() endif()
if (CONV_TRANSPOSE_OP) if (CONV_TRANSPOSE_OP)
add_definitions(-DCONV_TRANSPOSE) add_definitions(-DCONV_TRANSPOSE_OP)
endif() endif()
if (LOOKUP_OP) if (LOOKUP_OP)
...@@ -386,4 +386,4 @@ endif() ...@@ -386,4 +386,4 @@ endif()
if (SHAPE_OP) if (SHAPE_OP)
add_definitions(-DSHAPE_OP) add_definitions(-DSHAPE_OP)
endif() endif()
\ No newline at end of file
...@@ -3,7 +3,9 @@ ...@@ -3,7 +3,9 @@
TOTAL_ERRORS=0 TOTAL_ERRORS=0
# The trick to remove deleted files: https://stackoverflow.com/a/2413151 # The trick to remove deleted files: https://stackoverflow.com/a/2413151
for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | grep -v ".pb.cpp" | grep -v ".pb.h"); do for file in $(git diff --cached --name-status | awk '$1 != "D" {print $2}' | \
grep -v ".pb.cpp" | grep -v ".pb.h" | grep -v ".pb-c.h" | grep -v ".pb-c.c" | \
grep -v "protobuf-c.h" | grep -v "protobuf-c.c"); do
cpplint $file; cpplint $file;
TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?); TOTAL_ERRORS=$(expr $TOTAL_ERRORS + $?);
done done
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册