Commit 81ff3830 authored by chenhoujiang

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into dev-latest

......@@ -24,6 +24,7 @@ const char *G_OP_TYPE_CONCAT = "concat";
const char *G_OP_TYPE_ELEMENTWISE_ADD = "elementwise_add";
const char *G_OP_TYPE_FILL_CONSTANT = "fill_constant";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU = "fusion_conv_add_relu";
const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8 = "fusion_conv_add_relu_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU = "fusion_conv_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU = "fusion_conv_add_add_prelu";
const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU = "fusion_conv_add_bn_relu";
......@@ -31,6 +32,7 @@ const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU = "fusion_conv_bn_add_relu";
const char *G_OP_TYPE_FUSION_DWCONV_BN_RELU = "fusion_dwconv_bn_relu";
const char *G_OP_TYPE_FUSION_CONV_BN_RELU = "fusion_conv_bn_relu";
const char *G_OP_TYPE_FC = "fusion_fc";
const char *G_OP_TYPE_FC_INT8 = "fusion_fc_int8";
const char *G_OP_TYPE_FUSION_CONV_ADD = "fusion_conv_add";
const char *G_OP_TYPE_LRN = "lrn";
const char *G_OP_TYPE_MUL = "mul";
......@@ -117,11 +119,13 @@ std::unordered_map<
{G_OP_TYPE_MULTICLASS_NMS, {{"BBoxes", "Scores"}, {"Out"}}},
{G_OP_TYPE_POLYGON_BOX_TRANSFORM, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FC, {{"X", "Y", "Z"}, {"Out"}}},
{G_OP_TYPE_FC_INT8, {{"X", "Y", "Z", "Scale"}, {"Out"}}},
{G_OP_TYPE_RESHAPE, {{"X"}, {"Out"}}},
{G_OP_TYPE_RESHAPE2, {{"X"}, {"Out", "XShape"}}},
{G_OP_TYPE_DEPTHWISE_CONV, {{"Input"}, {"Output"}}},
{G_OP_TYPE_FILL_CONSTANT, {{}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8, {{"Input", "Scale"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU, {{"Input"}, {"Out"}}},
{G_OP_TYPE_IM2SEQUENCE, {{"X"}, {"Out"}}},
......
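The table above maps each op type to its declared input and output keys; this hunk adds entries for the two new int8 fused ops. A minimal standalone sketch (table reduced to just those two entries, names taken from the hunk) of how such a map is consulted:

#include <iostream>
#include <string>
#include <unordered_map>
#include <utility>
#include <vector>

using OpIOMap = std::unordered_map<
    std::string,
    std::pair<std::vector<std::string>, std::vector<std::string>>>;

int main() {
  // Reduced copy of the table above: op type -> ({inputs}, {outputs}).
  OpIOMap op_input_output_key = {
      {"fusion_fc_int8", {{"X", "Y", "Z", "Scale"}, {"Out"}}},
      {"fusion_conv_add_relu_int8", {{"Input", "Scale"}, {"Out"}}},
  };
  const auto &io = op_input_output_key.at("fusion_fc_int8");
  for (const auto &in : io.first) std::cout << in << " ";  // X Y Z Scale
  std::cout << "-> " << io.second[0] << std::endl;         // -> Out
  return 0;
}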
......@@ -108,9 +108,11 @@ extern const char *G_OP_TYPE_BOX_CODER;
extern const char *G_OP_TYPE_CONCAT;
extern const char *G_OP_TYPE_ELEMENTWISE_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_RELU_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_PRELU;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_ADD_PRELU;
extern const char *G_OP_TYPE_FC;
extern const char *G_OP_TYPE_FC_INT8;
extern const char *G_OP_TYPE_FUSION_CONV_ADD;
extern const char *G_OP_TYPE_FUSION_CONV_ADD_BN_RELU;
extern const char *G_OP_TYPE_FUSION_CONV_BN_ADD_RELU;
......
......@@ -14,6 +14,7 @@ limitations under the License. */
#include "fpga/V1/api.h"
#include "fpga/V1/bias_scale.h"
#include "fpga/V1/deconv_filter.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
......@@ -124,6 +125,32 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) {
max_value);
filter_tensor->reset_data_ptr(new_data);
}
void format_deconv_filter(framework::Tensor *filter_tensor, float max_value,
int group_num, int stride) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size); // NOLINT
memcpy(new_data, data_ptr, memory_size);
int hw = height * width;
deconv_filter::deconv_NC_convert(&new_data, num, channel, hw);
num = dims[1];
channel = dims[0];
deconv_filter::deconv_format_filter(
&new_data, (int)num, (int)channel, // NOLINT
(int)height, // NOLINT
(int)width, group_num, max_value, stride); // NOLINT
framework::DDim dims_new =
framework::make_ddim({num, channel, height, width});
filter_tensor->Resize(dims_new);
filter_tensor->reset_data_ptr(new_data);
}
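format_deconv_filter stores the symmetric int8 quantization pair scale[0] = max/127 and scale[1] = 127/max before handing the weights to deconv_format_filter. A self-contained sketch of the arithmetic those two scales imply (not filter::quantize itself):

#include <cmath>
#include <cstdint>
#include <cstdio>

// Maps a weight to [-127, 127]; the multiplier is filter_tensor->scale[1].
int8_t quantize_weight(float w, float max_value) {
  float q = std::round(w * (127.0f / max_value));
  if (q > 127.0f) q = 127.0f;
  if (q < -127.0f) q = -127.0f;
  return static_cast<int8_t>(q);
}

int main() {
  float max_value = 2.0f;
  int8_t q = quantize_weight(1.0f, max_value);
  // Dequantize with scale[0] = max_value / 127.
  std::printf("q = %d, dequantized = %f\n", q, q * (max_value / 127.0f));
  return 0;
}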
void format_bias_scale_array(float **bias_scale_array,
int element_num_per_division, int num) {
......@@ -240,6 +267,100 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
filter->reset_data_ptr(nullptr);
fpga_free(bs_ptr);
}
void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num;
arg->sub_conv_num = stride_h;
arg->filter_num = (uint32_t)filter->dims()[0];
int sub_conv_num = arg->sub_conv_num;
int sub_stride = 1;
int sub_pad = deconv_filter::deconv_calc_sub_pad(filter->dims()[3], padding_w,
stride_w);
int sub_filter_width =
deconv_filter::deconv_get_sub_filter_axis(filter->dims()[3], stride_w);
int sub_output_width = deconv_filter::deconv_get_sub_out_axis(
input->dims()[3], sub_pad, sub_filter_width);
int sub_output_height = deconv_filter::deconv_get_sub_out_axis(
input->dims()[2], sub_pad, sub_filter_width);
arg->sub_output_width = sub_output_width;
arg->sub_output_height = sub_output_height;
arg->omit_size =
deconv_filter::deconv_get_omit(stride_w, filter->dims()[3], padding_w);
arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));
int sub_channels = (int32_t)input->dims()[1];
int omit_size = arg->omit_size;
int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
int sub_filter_num = sub_conv_num * (arg->filter_num);
int conv_output_size =
(align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) *
sub_output_height;
  int output_size = conv_output_size * sub_conv_num;
int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT);
int align_sub_filter_count =
align_to_x(sub_filter_width * sub_filter_width * sub_channels,
FILTER_ELEMENT_ALIGNMENT);
int align_conv_sub_filter_count =
align_sub_filter_count * align_sub_filter_num;
for (int i = 0; i < sub_conv_num; ++i) {
arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num);
arg->conv_args[i].group_num = group_num;
arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].relu_enabled = relu_enabled;
arg->conv_args[i].kernel.width = sub_filter_width;
arg->conv_args[i].kernel.height = sub_filter_width;
arg->conv_args[i].kernel.stride_w = 1;
arg->conv_args[i].kernel.stride_h = 1;
// DeconvParam.conv_args[i].image.address = (void*)ptr_image;
arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.channels = sub_channels;
arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
arg->conv_args[i].image.pad_width = sub_pad;
arg->conv_args[i].image.pad_height = sub_pad;
arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].sb_address = (void *)bs_ptr;
char *filter_sub_space =
(char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
fpga_copy(filter_sub_space,
(char *)filter_ptr + i * align_conv_sub_filter_count,
align_conv_sub_filter_count);
arg->conv_args[i].filter_address = (void *)(filter_sub_space);
fpga_flush(filter_sub_space, align_conv_sub_filter_count);
if (sub_conv_num == 1) {
arg->conv_args[i].output.address = out_ptr;
arg->conv_args[i].output.scale_address = out->scale;
} else {
half *ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
arg->conv_args[i].output.address = (void *)((half *)ptr_output);
float *ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.scale_address = ptr_output_scale;
}
}
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
// fpga_free(filter_ptr);
}
} // namespace fpga
} // namespace paddle_mobile
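fill_deconv_arg above decomposes a stride-s deconvolution into s stride-1 sub-convolutions, using the deconv_filter helpers defined further down. A quick numeric check of those formulas for a hypothetical 4x4 kernel, stride 2, pad 1, 8-wide input axis:

#include <cstdio>

int main() {
  int k = 4, pad = 1, stride = 2, in = 8;
  int sub_pad = (k - pad - 1) / stride;             // deconv_calc_sub_pad -> 1
  int sub_filter = k / stride;                      // deconv_get_sub_filter_axis -> 2
  int sub_out = in + 2 * sub_pad - sub_filter + 1;  // deconv_get_sub_out_axis -> 9
  int omit = 1;                                     // deconv_get_omit(2, 4, 1) -> 1
  int real_out = sub_out * stride - 2 * omit;       // as in fill_deconv_arg -> 16
  // Matches the textbook deconv size: (in - 1) * stride - 2 * pad + k = 16.
  std::printf("sub_pad=%d sub_filter=%d sub_out=%d real_out=%d\n", sub_pad,
              sub_filter, sub_out, real_out);
  return 0;
}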
......@@ -43,6 +43,25 @@ void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
void fill_deconv_arg(struct DeconvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h,
int stride_w, int padding_h, int padding_w, float* bs_ptr);
void format_deconv_filter(framework::Tensor* filter_tensor, float max_value,
int group_num, int stride);
template <typename Dtype>
void savefile(std::string filename, void* buffer, int dataSize, Dtype tmp) {
float data;
std::ofstream out(filename.c_str());
for (int i = 0; i < dataSize; ++i) {
data = (((Dtype*)buffer)[i]);
out << data << std::endl;
}
out.close();
return;
}
} // namespace fpga
} // namespace paddle_mobile
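A usage sketch for the savefile helper declared above; the fourth argument is only a type tag that fixes Dtype, its value is ignored:

#include "fpga/V1/api.h"

int main() {
  float buf[4] = {0.5f, -1.25f, 2.0f, 3.5f};
  // Writes one value per line to filter_dump.txt; 0.f only fixes Dtype = float.
  paddle_mobile::fpga::savefile<float>("filter_dump.txt", buf, 4, 0.f);
  return 0;
}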
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V1/deconv_bias_scale.h"
// #include "deconv_bias_scale.h"
#include "fpga/V1/bias_scale.h"
// #include "bias_scale.h"
#include <memory.h>
#include "fpga/V1/api.h"
// #include "fpga_api.h"
namespace paddle_mobile {
namespace fpga {
namespace deconv_bias_scale {
void deconv_bias_scale_expand(float** bias_scale_array, int num,
int sub_conv_n) {
int sub_num = num * sub_conv_n;
float* ptr_tmp = *bias_scale_array;
float* ptr_bias_scale_expand =
(float*)fpga_malloc(sizeof(float) * sub_num * 2);
int scale_base_offset = sub_num;
for (int i = 0; i < sub_conv_n; ++i) {
int offset = num * i;
// copy bias
fpga_copy(ptr_bias_scale_expand + offset, ptr_tmp, num * sizeof(float));
// copy scale
fpga_copy(ptr_bias_scale_expand + scale_base_offset + offset, ptr_tmp + num,
num * sizeof(float));
}
*bias_scale_array = ptr_bias_scale_expand;
fpga_free(ptr_tmp);
}
} // namespace deconv_bias_scale
} // namespace fpga
} // namespace paddle_mobile
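deconv_bias_scale_expand replicates both halves of the packed [bias | scale] array once per sub-convolution. A standalone check of the resulting layout with num = 2 and sub_conv_n = 2 (plain memcpy standing in for fpga_copy):

#include <cstdio>
#include <cstring>

int main() {
  const int num = 2, sub_conv_n = 2, sub_num = num * sub_conv_n;
  float in[2 * num] = {10.f, 11.f, 0.5f, 0.6f};  // [bias(num) | scale(num)]
  float out[2 * sub_num];
  for (int i = 0; i < sub_conv_n; ++i) {
    std::memcpy(out + num * i, in, num * sizeof(float));                  // bias
    std::memcpy(out + sub_num + num * i, in + num, num * sizeof(float));  // scale
  }
  for (int i = 0; i < 2 * sub_num; ++i) std::printf("%g ", out[i]);
  std::printf("\n");  // 10 11 10 11 0.5 0.6 0.5 0.6
  return 0;
}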
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#define BS_NUM_ALIGNMENT 8
namespace paddle_mobile {
namespace fpga {
namespace deconv_bias_scale {
void deconv_bias_scale_expand(float** bias_scale_array, int num,
int sub_conv_n);
} // namespace deconv_bias_scale
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/V1/deconv_filter.h"
#include <memory.h>
#include <algorithm>
// #include "deconv_filter.h"
#include "fpga/V1/filter.h"
// #include "filter.h"
#include "fpga/V1/api.h"
// #include "fpga_api.h"
// just for test
//#include <string>
//#include "deconv.h"
//#include "deconv_api.h"
// using namespace std;
// using namespace paddle_mobile::fpga;
// using namespace baidu::fpga::deconv::api;
// namespace api = baidu::fpga::deconv::api;
namespace paddle_mobile {
namespace fpga {
namespace deconv_filter {
/*
inverse kernel weights of each channel for every filter
*/
void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height) {
float* tmp = *data_in;
// float fix_range = 127;// float scale = fix_range / max;
int data_size = num * channel * width * height;
int hw_len = height * width;
float* tmp_data = (float*)fpga_malloc(data_size * sizeof(float));
for (int i = 0; i < num; ++i) {
for (int j = 0; j < channel; ++j) {
for (int k = 0; k < hw_len; ++k) {
tmp_data[i * channel * hw_len + j * hw_len + k] =
(*data_in)[i * channel * hw_len + j * hw_len + hw_len - k - 1];
}
}
}
*data_in = (float*)tmp_data; //
fpga_free(tmp);
}
/*
calculate sub padding number
*/
int deconv_calc_sub_pad(int filter_axis, int pad, int stride) {
if (stride == 0 || ((filter_axis - pad - 1) < 0)) {
// error
return 0;
}
return (filter_axis - pad - 1) / stride;
}
int deconv_get_sub_filter_axis(int filter_axis, int stride) {
return (filter_axis / stride);
}
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
return ((image_axis + 2 * sub_pad - sub_filter_axis) + 1);
}
/*
(filter_width - pad, filter_width - pad) is the position of the first pixel
of the sub-pixel image, so the number of omitted rows or columns is
(stride - idx).
*/
int deconv_get_omit(int stride, int filter_width, int pad) {
if (((filter_width - pad) <= 0)) { // ((filter_width-pad) > stride) ||
// error
return 0;
}
int idx = 1;
bool flag = false;
for (idx = 1; idx <= stride; ++idx) {
int j = idx;
for (; j <= filter_width;) {
if (j == filter_width - pad) {
flag = true;
break;
}
j = j + stride;
}
if (flag) {
break;
}
}
return (stride - idx);
}
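A local copy of deconv_get_omit to trace the loop above: it finds the smallest idx whose stride-spaced grid hits column filter_width - pad, and stride - idx rows/columns are then omitted:

#include <cstdio>

// Re-implementation of deconv_get_omit, for tracing only.
int omit(int stride, int filter_width, int pad) {
  if ((filter_width - pad) <= 0) return 0;
  int idx = 1;
  bool flag = false;
  for (idx = 1; idx <= stride; ++idx) {
    for (int j = idx; j <= filter_width; j += stride) {
      if (j == filter_width - pad) {
        flag = true;
        break;
      }
    }
    if (flag) break;
  }
  return stride - idx;
}

int main() {
  std::printf("%d\n", omit(2, 3, 1));  // 0: the grid starting at idx = 2 hits column 2
  std::printf("%d\n", omit(2, 4, 1));  // 1: the grid starting at idx = 1 hits column 3
  return 0;
}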
int deconv_get_sub_filter_num(int filter_num, int stride) {
return filter_num * stride;
}
void deconv_get_sub_filter(char** data_in, int height, int width,
int sub_conv_n, int kernel_num, int channel) {
char* ptr_tmp = *data_in;
int sub_num = kernel_num * sub_conv_n;
int sub_h = height / sub_conv_n;
int sub_w = width / sub_conv_n;
int sub_filter_size =
kernel_num * sub_h * sub_w * channel * sub_conv_n * sub_conv_n;
char* ptr_sub_filter = (char*)fpga_malloc(sub_filter_size * sizeof(char));
for (int idx = 0; idx < sub_conv_n; ++idx) {
for (int nn = 0; nn < sub_num; ++nn) {
int ni = nn % kernel_num;
int woff = sub_conv_n - 1 - (nn / kernel_num); //
for (int hh = 0; hh < sub_h; ++hh) {
int hi = hh * sub_conv_n + idx % sub_conv_n;
for (int ww = 0; ww < sub_w; ++ww) {
int wi = ww * sub_conv_n + woff; // 1 0
int sidx = ((nn * sub_h + hh) * sub_w + ww) * channel; //
int kidx = ((ni * height + hi) * width + wi) * channel; //
fpga_copy(
ptr_sub_filter + idx * sub_h * sub_w * channel * sub_num + sidx,
(*data_in) + kidx, channel * sizeof(char));
// for (int cc =0; cc < channel; ++cc) {
// ptr_sub_filter[idx*sub_h*sub_w*channel*sub_num + sidx + cc] =
// (*data_in)[kidx + cc];
// }
}
}
}
}
*data_in = ptr_sub_filter;
fpga_free(ptr_tmp);
}
void deconv_NC_convert(float** filter_in, int kernel_num, int channels,
int hw) {
float* tmp = *filter_in;
float* ptr_filter = (float*)(paddle_mobile::fpga::fpga_malloc(
hw * kernel_num * channels * sizeof(float)));
for (int c = 0; c < channels; ++c) {
for (int n = 0; n < kernel_num; ++n) {
paddle_mobile::fpga::fpga_copy(ptr_filter + n * hw + kernel_num * hw * c,
tmp + n * channels * hw + c * hw,
hw * sizeof(float));
}
}
*filter_in = ptr_filter;
paddle_mobile::fpga::fpga_free(tmp);
}
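deconv_NC_convert swaps the N and C axes of the filter (NCHW -> CNHW) while keeping each hw block contiguous. A tiny index check with kernel_num = 2, channels = 3, hw = 1:

#include <cstdio>

int main() {
  const int kernel_num = 2, channels = 3, hw = 1;
  int in[6] = {0, 1, 2, 10, 11, 12};  // N-major: in[n * channels + c]
  int out[6];
  for (int c = 0; c < channels; ++c)
    for (int n = 0; n < kernel_num; ++n)
      out[n * hw + kernel_num * hw * c] = in[n * channels * hw + c * hw];
  for (int i = 0; i < 6; ++i) std::printf("%d ", out[i]);  // 0 10 1 11 2 12
  std::printf("\n");                                       // i.e. C-major
  return 0;
}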
void deconv_format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max, int stride) {
int data_size = channel * height * width * num;
/*{
float result2 = (float)0;
string filename = "origin_filter_data";
api::savefile<float>(filename, (void *)*data_in, data_size, result2);
}*/
deconv_inverse_filter(data_in, num, channel, width, height);
/* {
float result2 = (float)0;
string filename = "inverse_filter_data";
api::savefile<float>(filename, (void *)*data_in, data_size, result2);
}*/
filter::quantize(data_in, data_size, max);
/* {
char result2 = (char)0;
string filename = "quantize_filter_data";
api::savefile<char>(filename, (void *)*data_in, data_size, result2);
}*/
char** quantize_data = (char**)data_in; // NOLINT
filter::convert_to_hwc(quantize_data, num, channel, height, width);
/*{
char result2 = (char)0;
string filename = "convert_to_hwc_filter_data";
api::savefile<char>(filename, (void *)*quantize_data, data_size,
result2);
}*/
deconv_get_sub_filter(quantize_data, height, width, stride, num, channel);
/*{
char result2 = (char)0;
string filename = "sub_filter_filter_data";
api::savefile<char>(filename, (void *)*quantize_data, data_size, result2);
}*/
int sub_conv_n = stride;
int sub_h = height / sub_conv_n;
int sub_w = width / sub_conv_n;
int sub_chw = sub_h * sub_w * channel;
int sub_num = sub_conv_n * num;
int division_capacity = filter::calc_division_capacity(sub_chw);
int num_per_div_before_alignment =
filter::calc_num_per_div(sub_num, group_num, division_capacity);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num = (sub_num + num_per_div_before_alignment - 1) /
num_per_div_before_alignment;
int residual = (sub_num) % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
char** ptr_ptr_data = (char**)fpga_malloc(sub_conv_n * sizeof(char*));
int origin_offset = sub_chw * sub_num;
for (int i = 0; i < sub_conv_n; ++i) {
(ptr_ptr_data)[i] = (char*)fpga_malloc(origin_offset * sizeof(char));
fpga_copy((ptr_ptr_data)[i], (*quantize_data) + origin_offset * i,
origin_offset * sizeof(char));
/* char result2 = (char)0;
string filename = "ptr_ptr_data" + to_string(i);
api::savefile<char>(filename, (void *)(ptr_ptr_data[i]), origin_offset,
result2);
*/
}
// char result2 = (char)0;
// string filename = "interleave";
// api::savefile<char>(filename, (void *)*ptr_ptr_data, origin_offset,
// result2);
fpga_free(*quantize_data);
int align_offset =
align_to_x(sub_chw, FILTER_ELEMENT_ALIGNMENT) * num_after_alignment;
char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset *
sizeof(char)); // continuous space
for (int i = 0; i < sub_conv_n; ++i) {
int offset = i * origin_offset;
char* ptr_tmp = (ptr_ptr_data)[i];
filter::align_element(&ptr_tmp, sub_num, sub_chw);
filter::align_num(&ptr_tmp, num_per_div_before_alignment, sub_num, sub_chw);
filter::reorder(&ptr_tmp, num_after_alignment, sub_chw);
filter::interleave(&ptr_tmp, num_after_alignment, sub_chw);
/* char result2 = (char)0;
string filename = "interleave" + to_string(i);
api::savefile<char>(filename, (void *)ptr_tmp, align_offset, result2);
*/
fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset);
fpga_free(ptr_tmp);
}
*data_in = (float*)ptr_space;
/* {
char result2 = (char)0;
string filename = "ptr_space";
api::savefile<char>(filename, (void *)ptr_space, sub_conv_n *
align_offset, result2);
}*/
fpga_flush(ptr_space, sub_conv_n * align_offset * sizeof(char));
}
} // namespace deconv_filter
} // namespace fpga
} // namespace paddle_mobile
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
namespace paddle_mobile {
namespace fpga {
namespace deconv_filter {
void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height);
int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
int deconv_get_sub_filter_num(int filter_num, int stride);
int deconv_get_sub_filter_axis(int filter_axis, int stride);
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
int deconv_get_omit(int stride, int filter_width, int pad);
void deconv_get_sub_filter(char** data_in, int height, int width,
int sub_conv_n, int kernel_num, int channel);
void deconv_format_filter(float** data_in, int num, int channel, int height,
int width, int group_num, float max, int stride);
void deconv_NC_convert(float** filter_in, int kernel_num, int channels, int hw);
} // namespace deconv_filter
} // namespace fpga
} // namespace paddle_mobile
......@@ -146,12 +146,16 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num,
memset(data_tmp, 0, num_element * sizeof(char));
-  for (i = 0; i < div_num; i++) {
+  for (i = 0; i < div_num - 1; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
(num - (div_num - 1) * num_per_div_before_alignment) * align_chw);
*data_in = data_tmp;
fpga_free(tmp);
}
......
......@@ -29,11 +29,11 @@ void convert_to_hwc(char** data_in, int num, int channel, int height,
int width);
float find_max(float* data_in, int data_size);
void quantize(float** data_in, int data_size, float max);
-void align_element(float** data_in, int num, int chw);
+void align_element(char** data_in, int num, int chw);
void align_num(char** data_in, int num_per_div_before_alignment, int num,
int chw);
-void reorder(float** data_in, int num_after_alignment, int chw);
-void interleave(float** data_in, int num_after_alignment, int chw);
+void reorder(char** data_in, int num_after_alignment, int chw);
+void interleave(char** data_in, int num_after_alignment, int chw);
void format_filter(float** data_in, int num, int channel, int height, int width,
int group_num, float max);
......
......@@ -146,11 +146,11 @@ int format_conv_data(framework::Tensor *filter_tensor,
}
int format_fc_data(framework::Tensor *filter_tensor,
-                   framework::Tensor *ofm_tensor, float *bs_ptr) {
+                   framework::Tensor *ofm_tensor, float **bs_ptr) {
float max_value = fpga::filter_find_max(filter_tensor);
fpga::format_fc_filter(filter_tensor, max_value);
int aligned_num = get_aligned_filter_num(filter_tensor);
-  fpga::format_bias_scale_array(&bs_ptr,
+  fpga::format_bias_scale_array(bs_ptr,
(int)filter_tensor->dims()[0], // NOLINT
aligned_num);
int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
......@@ -214,7 +214,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->conv_arg[i].output.scale_address = out->scale;
int num_after_alignment = filter::calc_aligned_num(
-        (int)input->dims()[1], arg->filter_num);  // NOLINT
+        arg->filter_num, (int)input->dims()[1]);  // NOLINT
arg->conv_arg[i].free_space =
fpga_malloc(num_after_alignment * 2 * sizeof(half));
}
......
......@@ -41,7 +41,7 @@ void format_concat_output(framework::Tensor* out, int height, int width,
int format_conv_data(framework::Tensor* filter_tensor,
framework::Tensor* ofm_tensor, float** bs_ptr, int group);
int format_fc_data(framework::Tensor* filter_tensor,
-                   framework::Tensor* ofm_tensor, float* bs_ptr);
+                   framework::Tensor* ofm_tensor, float** bs_ptr);
void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
framework::Tensor* out, framework::Tensor* filter,
bool relu_enabled, int group_num, int stride_h,
......
......@@ -92,7 +92,8 @@ void fpga_free(void *ptr) {
}
void fpga_copy(void *dest, const void *src, size_t num) {
#ifdef PADDLE_MOBILE_ZU5
-  driver::fpga_copy_driver(dest, src, num);
+  // driver::fpga_copy_driver(dest, src, num);
+  memcpy(dest, src, num);
#else
memcpy(dest, src, num);
#endif
......
......@@ -26,6 +26,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
int ComputeFPGASplit(const struct SplitArgs& args);
int ComputeFpgaDeconv(const struct DeconvArgs& args);
} // namespace fpga
} // namespace paddle_mobile
......@@ -56,6 +56,7 @@ class CLImage {
tensor_dims_ = dim;
}
bool isInit() { return initialized_; }
/*
* need call SetTensorData first
*
......
......@@ -98,6 +98,24 @@ class OpRegistry {
}
};
#define REGISTER_OPERATOR_INT8(op_type, op_class, device_name, device_type) \
template class op_class<device_type, int8_t>; \
template <typename Dtype, typename T> \
class _OpClass_##op_type##_##device_name : public op_class<Dtype, T> { \
public: \
DEFINE_OP_CONSTRUCTOR(_OpClass_##op_type##_##device_name, op_class); \
}; \
static paddle_mobile::framework::OperatorRegistrar< \
device_type, _OpClass_##op_type##_##device_name<device_type, int8_t>> \
__op_registrar_##op_type##_##device_name(#op_type); \
int TouchOpRegistrar_##op_type##_##device_name() { \
__op_registrar_##op_type##_##device_name.Touch(); \
return 0; \
}
#define REGISTER_OPERATOR_CPU_INT8(op_type, op_class) \
REGISTER_OPERATOR_INT8(op_type, op_class, cpu, paddle_mobile::CPU);
#define REGISTER_OPERATOR(op_type, op_class, device_name, device_type) \
template class op_class<device_type, float>; \
template <typename Dtype, typename T> \
......
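For reference, here is roughly what REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op), used later in this commit, expands to; a hand-expanded sketch with formatting loosened:

template class ops::FusionFcInt8Op<paddle_mobile::CPU, int8_t>;
template <typename Dtype, typename T>
class _OpClass_fusion_fc_int8_cpu : public ops::FusionFcInt8Op<Dtype, T> {
 public:
  DEFINE_OP_CONSTRUCTOR(_OpClass_fusion_fc_int8_cpu, ops::FusionFcInt8Op);
};
static paddle_mobile::framework::OperatorRegistrar<
    paddle_mobile::CPU,
    _OpClass_fusion_fc_int8_cpu<paddle_mobile::CPU, int8_t>>
    __op_registrar_fusion_fc_int8_cpu("fusion_fc_int8");
int TouchOpRegistrar_fusion_fc_int8_cpu() {
  __op_registrar_fusion_fc_int8_cpu.Touch();
  return 0;
}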
......@@ -153,7 +153,8 @@ double PaddleMobile<CPU, Precision::FP32>::GetPredictTime() {
paddle_mobile::operators::math::Gemm gemm;
auto time1 = paddle_mobile::time();
gemm.Sgemm(m, n, k, static_cast<float>(1), a, lda, b, ldb,
-             static_cast<float>(0), c, ldc, false, nullptr);
+             static_cast<float>(0), c, ldc, false,
+             static_cast<float *>(nullptr));
auto time2 = paddle_mobile::time();
double cost = paddle_mobile::time_diff(time1, time2);
paddle_mobile::memory::Free(a);
......
......@@ -30,6 +30,9 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(dropout, ops::DropoutOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(dropout, ops::DropoutOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(dropout, ops::DropoutOp);
#endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#include "operators/fusion_conv_add_relu_int8_op.h"
#include <vector>
#include "operators/math/conv_func.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionConvAddReluInt8Op<Dtype, T>::InferShape() const {
auto in_dims = this->param_.Input()->dims();
auto filter_dims = this->param_.Filter()->dims();
const std::vector<int> &strides = this->param_.Strides();
std::vector<int> paddings = this->param_.Paddings();
int groups = this->param_.Groups();
std::vector<int> dilations = this->param_.Dilations();
PADDLE_MOBILE_ENFORCE((in_dims.size() == filter_dims.size() &&
dilations.size() == paddings.size() &&
paddings.size() == strides.size()),
"ConvParam is not suitable");
std::vector<int64_t> output_shape({in_dims[0], filter_dims[0]});
for (size_t i = 0; i < strides.size(); ++i) {
output_shape.push_back(
math::ConvOutputSize(in_dims[i + 2], filter_dims[i + 2], dilations[i],
paddings[i], strides[i]));
}
framework::DDim ddim = framework::make_ddim(output_shape);
this->param_.Output()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_conv_add_relu_int8,
ops::FusionConvAddReluInt8Op);
#endif
#endif // FUSION_CONVADDRELU_INT8_OP
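The InferShape above relies on math::ConvOutputSize per spatial axis. Assuming the usual definition, out = (in + 2*pad - (dilation*(k - 1) + 1)) / stride + 1; a quick check:

#include <cstdio>

int conv_output_size(int in, int k, int dilation, int pad, int stride) {
  int dkernel = dilation * (k - 1) + 1;
  return (in + 2 * pad - dkernel) / stride + 1;
}

int main() {
  // 224-wide input, 3x3 filter, dilation 1, pad 1, stride 2 -> 112.
  std::printf("%d\n", conv_output_size(224, 3, 1, 1, 2));
  return 0;
}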
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVADDRELU_INT8_OP
#pragma once
#include <string>
#include "framework/operator.h"
#include "operators/kernel/conv_add_relu_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionConvAddReluInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>> {
public:
FusionConvAddReluInt8Op(const std::string &type,
const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType,
FusionConvAddReluParam<DeviceType>,
ConvAddReluKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_CONVADDRELU_INT8_OP
......@@ -55,6 +55,9 @@ REGISTER_FUSION_MATCHER(fusion_conv_bn_add_relu,
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
#ifdef PADDLE_MOBILE_FPGA
REGISTER_OPERATOR_FPGA(fusion_conv_bn_add_relu, ops::FusionConvBNAddReluOp);
#endif
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
namespace paddle_mobile {
namespace operators {
template <typename Dtype, typename T>
void FusionFcInt8Op<Dtype, T>::InferShape() const {
auto x_dims = this->param_.InputX()->dims();
auto y_dims = this->param_.InputY()->dims();
int x_num_col_dims = this->param_.XNumColDims();
int y_num_col_dims = this->param_.YNumColDims();
assert(x_dims.size() > x_num_col_dims);
assert(y_dims.size() > y_num_col_dims);
/// (1,2,3,4) , x_num_col_dims = 2 -> (2,12)
auto x_mat_dims = framework::flatten_to_2d(x_dims, x_num_col_dims);
auto y_mat_dims = framework::flatten_to_2d(y_dims, y_num_col_dims);
assert(x_mat_dims[1] == y_mat_dims[0]);
std::vector<int64_t> output_dims;
output_dims.reserve(
static_cast<size_t>(x_num_col_dims + y_dims.size() - y_num_col_dims));
for (int i = 0; i < x_num_col_dims; ++i) {
output_dims.push_back(x_dims[i]);
}
for (int i = y_num_col_dims; i < y_dims.size(); ++i) {
output_dims.push_back(y_dims[i]);
}
framework::DDim ddim = framework::make_ddim(output_dims);
this->param_.Out()->Resize(ddim);
}
} // namespace operators
} // namespace paddle_mobile
namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU_INT8(fusion_fc_int8, ops::FusionFcInt8Op);
#endif
#endif // FUSION_FC_INT8_OP
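The flattening used by FusionFcInt8Op::InferShape, checked against the inline comment above ((1,2,3,4) with x_num_col_dims = 2 -> (2,12)); a minimal re-implementation of flatten_to_2d for illustration:

#include <cstdio>
#include <utility>
#include <vector>

std::pair<long, long> flatten_to_2d(const std::vector<long> &dims,
                                    int num_col_dims) {
  long rows = 1, cols = 1;
  for (int i = 0; i < static_cast<int>(dims.size()); ++i)
    (i < num_col_dims ? rows : cols) *= dims[i];
  return {rows, cols};
}

int main() {
  auto m = flatten_to_2d({1, 2, 3, 4}, 2);
  std::printf("(%ld, %ld)\n", m.first, m.second);  // (2, 12)
  return 0;
}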
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_INT8_OP
#pragma once
#include <string>
#include <vector>
#include "framework/operator.h"
#include "framework/program/program-optimize/fusion_op_register.h"
#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
template <typename DeviceType, typename T>
class FusionFcInt8Op
: public framework::OperatorWithKernel<DeviceType,
FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>> {
public:
FusionFcInt8Op(const std::string &type, const VariableNameMap &inputs,
const VariableNameMap &outputs,
const framework::AttributeMap &attrs,
std::shared_ptr<framework::Scope> scope)
: framework::OperatorWithKernel<DeviceType, FusionFcParam<DeviceType>,
FusionFcKernel<DeviceType, T>>(
type, inputs, outputs, attrs, scope) {}
void InferShape() const override;
};
} // namespace operators
} // namespace paddle_mobile
#endif // FUSION_FC_INT8_OP
......@@ -28,10 +28,24 @@ bool ConvAddReluKernel<CPU, float>::Init(FusionConvAddReluParam<CPU> *param) {
template <>
void ConvAddReluKernel<CPU, float>::Compute(
const FusionConvAddReluParam<CPU> &param) {
-  ConvAddReluCompute<float>(param);
+  ConvAddReluCompute<float, float>(param);
}
template class ConvAddReluKernel<CPU, float>;
#ifdef FUSION_CONVADDRELU_INT8_OP
template <>
bool ConvAddReluKernel<CPU, int8_t>::Init(FusionConvAddReluParam<CPU> *param) {
return true;
}
template <>
void ConvAddReluKernel<CPU, int8_t>::Compute(
const FusionConvAddReluParam<CPU> &param) {
ConvAddReluCompute<int8_t, int32_t>(param);
}
template class ConvAddReluKernel<CPU, int8_t>;
#endif
} // namespace operators
} // namespace paddle_mobile
......
......@@ -27,10 +27,27 @@ bool FusionFcKernel<CPU, float>::Init(FusionFcParam<CPU> *param) {
template <>
void FusionFcKernel<CPU, float>::Compute(const FusionFcParam<CPU> &param) {
-  FusionFcCompute<float>(param);
+  FusionFcCompute<float, float>(param);
param.Out()->set_lod(param.InputX()->lod());
}
template class FusionFcKernel<CPU, float>;
#ifdef FUSION_FC_INT8_OP
template <>
bool FusionFcKernel<CPU, int8_t>::Init(FusionFcParam<CPU> *param) {
return true;
}
template <>
void FusionFcKernel<CPU, int8_t>::Compute(const FusionFcParam<CPU> &param) {
FusionFcCompute<int8_t, int32_t>(param);
param.Out()->set_lod(param.InputX()->lod());
}
template class FusionFcKernel<CPU, int8_t>;
#endif
} // namespace operators
} // namespace paddle_mobile
......
......@@ -20,10 +20,12 @@ limitations under the License. */
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
#if defined(__ARM_NEON__) || defined(__ARM_NEON)
#ifndef __aarch64__
inline float32_t vmaxvq_f32(float32x4_t r) {
float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r));
......
......@@ -25,21 +25,30 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
-template <typename P>
+template <typename P, typename S>
void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
const Tensor *input = param.Input();
Tensor filter = *param.Filter();
Tensor bias = *param.Bias();
-  int axis = param.Axis();
+  int32_t axis = param.Axis();
+  S *bias_data = bias.data<S>();
  Tensor *output = param.Output();
-  float *biase_data = bias.data<float>();
output->mutable_data<P>();
-  int groups = param.Groups();
-  std::vector<int> strides = param.Strides();
-  std::vector<int> paddings = param.Paddings();
-  std::vector<int> dilations = param.Dilations();
-  const int batch_size = static_cast<int>(input->dims()[0]);
+  float alpha = 1.0f;
+  float beta = 1.0f;
+#ifdef FUSION_CONVADDRELU_INT8_OP
+  alpha = param.InputScale()->data<float>()[0];
+  beta = 0.0f;
+#endif
+  int32_t groups = param.Groups();
+  std::vector<int32_t> strides = param.Strides();
+  std::vector<int32_t> paddings = param.Paddings();
+  std::vector<int32_t> dilations = param.Dilations();
+  const int32_t batch_size = static_cast<int32_t>(input->dims()[0]);
std::vector<int64_t> filter_shape_vec(framework::vectorize(filter.dims()));
......@@ -61,13 +70,13 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
Tensor col;
Tensor col_matrix;
if (is_expand) {
-    col.mutable_data<float>(col_shape);
+    col.mutable_data<P>(col_shape);
col_matrix.ShareDataWith(col);
col_matrix.Resize(col_matrix_shape);
}
framework::DDim input_shape = framework::slice_ddim(
-      input->dims(), 1, static_cast<int>(input->dims().size()));
+      input->dims(), 1, static_cast<int32_t>(input->dims().size()));
framework::DDim filter_matrix_shape = {filter.dims()[0],
filter.numel() / filter.dims()[0]};
......@@ -77,17 +86,17 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
output->numel() / (output->dims()[0] * output->dims()[1])};
// convolution operator: im2col(or vol2col) + gemm
-  int in_step = static_cast<int>(input->dims()[1]) / groups;
-  int out_step = static_cast<int>(output->dims()[1]) / groups;
+  int32_t in_step = static_cast<int32_t>(input->dims()[1]) / groups;
+  int32_t out_step = static_cast<int32_t>(output->dims()[1]) / groups;
-  math::Vol2ColFunctor<CPU, float> vol2col;
-  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, float> im2col;
+  math::Vol2ColFunctor<CPU, P> vol2col;
+  math::Im2ColFunctor<math::ColFormat::kCFO, CPU, P> im2col;
-  for (int i = 0; i < batch_size; i++) {
+  for (int32_t i = 0; i < batch_size; i++) {
Tensor in_batch = input->Slice(i, i + 1).Resize(input_shape);
Tensor out_batch = output->Slice(i, i + 1).Resize(output_matrix_shape);
-    for (int g = 0; g < groups; g++) {
+    for (int32_t g = 0; g < groups; g++) {
Tensor in_slice = in_batch.Slice(g * in_step, (g + 1) * in_step);
if (!is_expand) {
......@@ -97,8 +106,8 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
} else if (data_dim == 2U) {
// im2col
im2col(in_slice, dilations, strides,
-               std::vector<int>{paddings[0], paddings[1], paddings[0],
-                                paddings[1]},
+               std::vector<int32_t>{paddings[0], paddings[1], paddings[0],
+                                    paddings[1]},
&col);
} else if (data_dim == 3U) {
// vol2col
......@@ -108,9 +117,9 @@ void ConvAddReluCompute(const FusionConvAddReluParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<float>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(1), true, biase_data);
+      math::matmul(filter_slice, false, col_matrix, false, alpha, &out_slice,
+                   beta, true, bias_data);
}
}
}
......
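The matmul call above follows the usual GEMM contract C = alpha*A*B + beta*C (plus the fused relu/bias arguments). In the int8 path alpha is the input scale and beta is 0, so the int32 accumulator is rescaled to float and the previous contents of C are discarded; a scalar-sized sketch (the 0.05f scale is a made-up value):

#include <cstdint>
#include <cstdio>

int main() {
  const float alpha = 0.05f;  // stand-in for param.InputScale()->data<float>()[0]
  const float beta = 0.0f;
  int32_t a = 57, b = 21;     // stand-ins for int8 operands widened by the GEMM
  int32_t acc = a * b;        // int32 accumulation of the int8 products
  float c_old = 3.0f;         // previous contents of C, discarded when beta = 0
  float c = alpha * static_cast<float>(acc) + beta * c_old;
  std::printf("c = %f\n", c);
  return 0;
}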
......@@ -106,10 +106,9 @@ inline void GemmConv(const ConvParam<CPU> &param) {
// gemm
Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
-      math::matmul<Itype>(filter_slice, false, col_matrix, false,
-                          static_cast<float>(1), &out_slice,
-                          static_cast<float>(0));
+      math::matmul(filter_slice, false, col_matrix, false,
+                   static_cast<float>(1), &out_slice, static_cast<float>(0),
+                   false, static_cast<Otype *>(nullptr));
}
}
}
......
......@@ -15,23 +15,29 @@ limitations under the License. */
#ifdef FUSION_FC_OP
#pragma once
#include <type_traits>
#include "operators/math/math_function.h"
#include "operators/op_param.h"
namespace paddle_mobile {
namespace operators {
-template <typename P>
+template <typename P, typename S>
void FusionFcCompute(const FusionFcParam<CPU> &param) {
const Tensor *input_x = param.InputX();
const Tensor *input_y = param.InputY();
-  const Tensor *input_z = param.InputZ();
-  auto *input_z_data = input_z->data<float>();
+  Tensor *input_z = param.InputZ();
+  S *input_z_data = input_z->data<S>();
int axis = param.Axis();
Tensor *out = param.Out();
// int m = out->dims()[0];
// int n = out->dims()[1];
-  auto *out_data = out->mutable_data<float>();
+  auto *out_data = out->mutable_data<P>();
float alpha = 1.0f;
float beta = 1.0f;
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
......@@ -51,21 +57,28 @@ void FusionFcCompute(const FusionFcParam<CPU> &param) {
axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
-  int64_t classes = input_z->numel();
-  for (int i = 0; i < out_dim[0]; i++) {
-    memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
-  }
if (std::is_same<P, int8_t>::value) {
#ifdef FUSION_FC_INT8_OP
alpha = param.InputScale()->data<float>()[0];
beta = 0.0f;
math::matmul(x_matrix, false, y_matrix, false, alpha, out, beta, false,
input_z_data, true);
#endif
} else {
    // bias_data has the same dimension as the second dimension of out
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data,
sizeof(float) * classes);
}
// for (int i = 0; i < out->numel(); i++) {
// DLOG << out_data[i];
// }
    // bias_data has the same dimension as out
-    math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                        out, static_cast<float>(1), false);
+    math::matmul<float>(x_matrix, false, y_matrix, false, alpha, out, beta,
+                        false);
}
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
  // if (out_dim.size() != 2) {
  //   out->Resize(out_dim);
  // }
}
} // namespace operators
......
......@@ -73,8 +73,9 @@ void MulCompute(const MulParam<CPU> &param) {
}
if (param.InputX()->type() == typeid(int8_t)) {
out->mutable_data<int32_t>();
-    math::matmul<int8_t>(x_matrix, false, y_matrix, false,
-                         static_cast<int8_t>(1), out, static_cast<int8_t>(0));
+    math::matmul<float, int32_t>(x_matrix, false, y_matrix, false,
+                                 static_cast<float>(1), out,
+                                 static_cast<float>(0));
} else {
out->mutable_data<float>();
......
......@@ -23,20 +23,22 @@ namespace paddle_mobile {
namespace operators {
using framework::Tensor;
-inline void PoolBasic(std::string pooling_type, std::vector<int> ksize,
-                      std::vector<int> strides, std::vector<int> paddings,
-                      const Tensor *in_x, Tensor *out) {
+template <typename T, typename S>
+void PoolBasic(std::string pooling_type, std::vector<int> ksize,
+               std::vector<int> strides, std::vector<int> paddings,
+               const Tensor *in_x, Tensor *out) {
if (pooling_type == "max") {
-    math::PoolFunctor<CPU, math::MaxPool<float>, float> pool2d_forward;
-    math::MaxPool<float> pool_process;
+    math::PoolFunctor<CPU, math::MaxPool<T>, T> pool2d_forward;
+    math::MaxPool<T> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
} else if (pooling_type == "avg") {
-    math::PoolFunctor<CPU, math::AvgPool<float>, float> pool2d_forward;
-    math::AvgPool<float> pool_process;
+    math::PoolFunctor<CPU, math::AvgPool<T, S>, T> pool2d_forward;
+    math::AvgPool<T, S> pool_process;
pool2d_forward(*in_x, ksize, strides, paddings, pool_process, out);
}
}
template <typename P>
void PoolCompute(const PoolParam<CPU> &param) {
const Tensor *in_x = param.Input();
......@@ -52,50 +54,67 @@ void PoolCompute(const PoolParam<CPU> &param) {
LOG(paddle_mobile::LogLevel::kLOG_ERROR)
<< "Pool op only supports 2D and 3D input.";
}
if (param.isGlobalPooling()) {
for (size_t i = 0; i < ksize.size(); ++i) {
paddings[i] = 0;
ksize[i] = static_cast<int>(in_x->dims()[i + 2]);
}
}
-  if (ksize[0] == 3 && ksize[0] == ksize[1]) {
-    if (pooling_type == "max") {
-      if (strides[0] == strides[1] && strides[0] == 1 &&
-          paddings[0] == paddings[1] && paddings[1] == 1) {
-        math::Pool3x3Maxs1p1(in_x, out);
-      } else {
-        math::Pool3x3Max(strides, paddings, in_x, out);
-      }
-    } else if (pooling_type == "avg") {
-      if (strides[0] == strides[1] && strides[0] == 1 &&
-          paddings[0] == paddings[1] && paddings[1] == 1) {
-        math::Pool3x3Avgs1p1(in_x, out);
-      } else {
-        math::Pool3x3Avg(strides, paddings, in_x, out);
-      }
-    }
-  } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
-             strides[0] == strides[1] && paddings[0] == paddings[1] &&
-             paddings[1] == 0) {
-#if __ARM_NEON
-#if __aarch64__
-    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-#else
-    /// todo: fix bug in Pool2x2
-    if (pooling_type == "max") {
-      math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
-    } else if (pooling_type == "avg") {
-      math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
-    }
-#endif
-#else
-    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
-#endif  // __ARM_NEON
-  } else {
-    PoolBasic(pooling_type, ksize, strides, paddings, in_x, out);
+  if (in_x->type() == typeid(int8_t)) {
+    if (pooling_type == "max" && ksize[0] == 3 && ksize[0] == ksize[1]) {
+      if (strides[0] == strides[1] && strides[0] == 1) {
+        math::Pool3x3Maxs1_int8(in_x, out, paddings[0], paddings[1]);
+      } else if (strides[0] == strides[1] && strides[0] == 2) {
+        math::Pool3x3Maxs2_int8(in_x, out, paddings[0], paddings[1]);
+      } else {
+        math::Pool3x3Max_int8(strides, paddings, in_x, out);
+      }
+    } else {
+      PoolBasic<int8_t, int32_t>(pooling_type, ksize, strides, paddings, in_x,
+                                 out);
+    }
+  } else {
+    if (ksize[0] == 3 && ksize[0] == ksize[1]) {
+      if (pooling_type == "max") {
+        if (strides[0] == strides[1] && strides[0] == 1 &&
+            paddings[0] == paddings[1] && paddings[1] == 1) {
+          math::Pool3x3Maxs1p1(in_x, out);
+        } else {
+          math::Pool3x3Max(strides, paddings, in_x, out);
+        }
+      } else if (pooling_type == "avg") {
+        if (strides[0] == strides[1] && strides[0] == 1 &&
+            paddings[0] == paddings[1] && paddings[1] == 1) {
+          math::Pool3x3Avgs1p1(in_x, out);
+        } else {
+          math::Pool3x3Avg(strides, paddings, in_x, out);
+        }
+      }
+    } else if (ksize[0] == 2 && ksize[0] == ksize[1] && strides[0] == 2 &&
+               strides[0] == strides[1] && paddings[0] == paddings[1] &&
+               paddings[1] == 0) {
+#if __ARM_NEON
+#if __aarch64__
+      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
+                              out);
+#else
+      /// todo: fix bug in Pool2x2
+      if (pooling_type == "max") {
+        math::Pool2x2Maxs2p0(strides, paddings, in_x, out);
+      } else if (pooling_type == "avg") {
+        math::Pool2x2Avgs2p0(strides, paddings, in_x, out);
+      }
+#endif
+#else
+      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
+                              out);
+#endif  // __ARM_NEON
+    } else {
+      PoolBasic<float, float>(pooling_type, ksize, strides, paddings, in_x,
+                              out);
+    }
}
}
......
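The int8 branch above instantiates PoolBasic<int8_t, int32_t>, i.e. AvgPool with a widened sum type S, presumably because summing a pooling window in an int8 accumulator overflows almost immediately; a minimal illustration:

#include <cstdint>
#include <cstdio>

int main() {
  int8_t window[9];
  for (int i = 0; i < 9; ++i) window[i] = 120;
  int32_t sum = 0;  // S = int32_t accumulator
  for (int i = 0; i < 9; ++i) sum += window[i];  // 1080: far outside int8 range
  std::printf("avg = %d\n", static_cast<int>(sum / 9));
  return 0;
}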
......@@ -77,15 +77,25 @@ void BatchNormKernel<GPU_CL, float>::Compute(
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
const int out_width = default_work_size[1];
-  clSetKernelArg(kernel, 1, sizeof(int), &out_width);
-  clSetKernelArg(kernel, 2, sizeof(cl_mem), &input);
-  clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_scale);
-  clSetKernelArg(kernel, 4, sizeof(cl_mem), &new_bias);
-  clSetKernelArg(kernel, 5, sizeof(cl_mem), &out);
  // cl_event out_event = param.OutputY()->GetClEvent();
  // cl_event wait_event = param.InputX()->GetClEvent();
+  DLOG << *param.InputX();
+  DLOG << *param.NewBias();
+  DLOG << *param.NewScale();
+  DLOG << default_work_size[0];
+  DLOG << default_work_size[1];
+  DLOG << default_work_size[2];
+  DLOG << out_width;
+  DLOG << *param.OutputY();
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_int), &out_width);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &input);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(cl_mem), &new_scale);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &new_bias);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &out);
+  CL_CHECK_ERRORS(status);
clEnqueueNDRangeKernel(this->cl_helper_.CLCommandQueue(), kernel, 3, NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
}
......
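The new_scale/new_bias images bound above are, assuming the standard batch-norm folding, precomputed per channel so the kernel only has to evaluate x * new_scale + new_bias:

#include <cmath>
#include <cstdio>

int main() {
  // Per-channel batch-norm parameters (made-up values).
  float scale = 1.5f, bias = 0.2f, mean = 0.4f, variance = 0.25f, eps = 1e-5f;
  float new_scale = scale / std::sqrt(variance + eps);
  float new_bias = bias - mean * new_scale;
  float x = 0.9f;  // one input activation
  std::printf("y = %f\n", x * new_scale + new_bias);
  return 0;
}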
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#define BATCH_NORM
#define BIASE
#define RELU
#include "conv_kernel.inc.cl"
......@@ -924,6 +924,387 @@ __kernel void conv_5x5(__private const int global_size_dim0,
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
}
__kernel void convBNAdd_3x3(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
    int2 output_pos_in_one_block;
    output_pos_in_one_block.x = out_w;
    output_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
    in_pos_in_one_block.x = output_pos_in_one_block.x * stride + offset;
    in_pos_in_one_block.y = output_pos_in_one_block.y * stride + offset;
half4 output = (half4)0.0f;
half4 input[9];
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
input[0] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[1] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[2] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y - dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y - dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y - dilation >= input_height) << 15));
input[3] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[4] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[5] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y >= input_height) << 15));
input[6] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x - dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x - dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x - dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[7] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
input[8] = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + dilation, pos_in.y + dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + dilation < 0 || in_pos_in_one_block.y + dilation < 0 || in_pos_in_one_block.x + dilation >= input_width || in_pos_in_one_block.y + dilation >= input_height) << 15));
/*
for (int j = 0; j < 9; ++j) {
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
*/
int j = 0;
int2 pos_of_weight;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
half4 weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
half4 weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
half4 weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
half4 weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 1;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 2;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 3;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 4;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 5;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 6;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 7;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
j = 8;
pos_of_weight.x = i * 3 + j % 3;
pos_of_weight.y = out_c * 4 * 3 + 0 * 3 + j / 3;
weight_x = read_imageh(filter, sampler, pos_of_weight);
output.x += dot(input[j], weight_x);
pos_of_weight.y = out_c * 4 * 3 + 1 * 3 + j / 3;
weight_y = read_imageh(filter, sampler, pos_of_weight);
output.y += dot(input[j], weight_y);
pos_of_weight.y = out_c * 4 * 3 + 2 * 3 + j / 3;
weight_z = read_imageh(filter, sampler, pos_of_weight);
output.z += dot(input[j], weight_z);
pos_of_weight.y = out_c * 4 * 3 + 3 * 3 + j / 3;
weight_w = read_imageh(filter, sampler, pos_of_weight);
output.w += dot(input[j], weight_w);
}
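// Epilogue: fold batch norm as output * new_scale + new_bias, then apply the
// optional bias add and ReLU activation.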
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef BIASE
output += read_imageh(bias, sampler, (int2)(out_c * global_size_dim1 + out_w, out_nh));
#endif
#ifdef RELU
output = activation(output);
#endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
}
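// Weight-image addressing used by the unrolled 3x3 loop above: for input
// channel block i, filter tap j (0..8) and output channel block out_c, the
// half4 row r (0..3) of the 4x4 weight sub-matrix is read from
//   pos.x = i * 3 + j % 3
//   pos.y = out_c * 4 * 3 + r * 3 + j / 3
// i.e. taps are laid out 3 per image column, 3 image rows per weight row.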
__kernel void convBNAdd_1x1(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const uint kernelHXW = 1;
int2 stride_xy = (int2)(stride, stride);
int2 output_pos_in_one_block = (int2)(out_w, out_nh);
int2 in_pos_in_one_block = output_pos_in_one_block * stride_xy + (int2)(offset, offset);
half4 output = 0.0f;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
half4 input = read_imageh(input_image, sampler, pos_in);
half4 weight0 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 0));
half4 weight1 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 1));
half4 weight2 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 2));
half4 weight3 = read_imageh(filter, sampler, (int2)(out_c, i * 4 + 3));
/*
output.x = dot(input, weight0);
output.y = dot(input, weight1);
output.z = dot(input, weight2);
output.w = dot(input, weight3);
*/
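// input packs 4 consecutive input channels; weight0..weight3 hold their
// filter coefficients for the 4 packed output channels, so each mad adds one
// input channel's contribution to all 4 outputs.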
output = mad(input.x, weight0, output);
output = mad(input.y, weight1, output);
output = mad(input.z, weight2, output);
output = mad(input.w, weight3, output);
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef BIASE
output += read_imageh(bias, sampler, (int2)(out_c * global_size_dim1 + out_w, out_nh));
#endif
#ifdef RELU
output = activation(output);
#endif
int2 output_pos = (int2)(out_c * global_size_dim1 + out_w, out_nh);
write_imageh(output_image, output_pos, output);
}
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void dropout(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_W,
__private const float dropoutPro) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input;
half4 output;
input = read_imageh(input_image, sampler, output_pos);
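// At inference time dropout just rescales activations by (1 - p), matching
// the expected value of train-time dropout.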
half4 dropout = (half4)(1 - dropoutPro);
output = dropout * input;
write_imageh(output_image, output_pos, output);
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_CONVBNADDRELU_OP
#include "operators/kernel/conv_bn_add_relu_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool ConvBNAddReluKernel<GPU_CL, float>::Init(
FusionConvBNAddReluParam<GPU_CL> *param) {
PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
const framework::CLImage *mean = param->InputMean();
const framework::CLImage *variance = param->InputVariance();
const framework::CLImage *scale = param->InputScale();
const framework::CLImage *bias = param->InputBias();
const float epsilon = param->Epsilon();
const int C = mean->numel();
auto mean_ptr = mean->data<float>();
auto variance_ptr = variance->data<float>();
auto scale_ptr = scale->data<float>();
auto bias_ptr = bias->data<float>();
float inv_std_ptr[C];
for (int i = 0; i < C; i++) {
inv_std_ptr[i] =
1 / static_cast<float>(pow((variance_ptr[i] + epsilon), 0.5));
}
float *new_scale_ptr = new float[C];
float *new_bias_ptr = new float[C];
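  // Fold BN into a per-channel affine transform:
  //   y = scale * (x - mean) / sqrt(var + eps)  + bias
  //     = new_scale * x + new_bias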
for (int i = 0; i < C; i++) {
new_scale_ptr[i] = inv_std_ptr[i] * scale_ptr[i];
new_bias_ptr[i] = bias_ptr[i] - mean_ptr[i] * inv_std_ptr[i] * scale_ptr[i];
}
framework::CLImage *new_scale = new framework::CLImage();
// for (int j = 0; j < C; ++j) {
// DLOG << " new scale - " << j << new_scale_ptr[j];
// }
//
// for (int j = 0; j < C; ++j) {
// DLOG << " new bias - " << j << new_bias_ptr[j];
// }
new_scale->SetTensorData(new_scale_ptr, variance->dims());
new_scale->InitCLImage(this->cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
// DLOG << " climage - y bias: " << *(param->Bias());
//
// DLOG << " climage - new scale: " << *new_scale;
framework::CLImage *new_bias = new framework::CLImage();
new_bias->SetTensorData(new_bias_ptr, variance->dims());
new_bias->InitCLImage(this->cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
// DLOG << " climage - new bias: " << *new_bias;
//
// DLOG << " climage - filter: " << *(param->Filter());
param->SetNewScale(new_scale);
param->SetNewBias(new_bias);
delete[](new_scale_ptr);
delete[](new_bias_ptr);
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
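  // Choose the kernel variant from the filter shape: 1x1 pointwise, 3x3
  // depthwise (filter has one input channel and C_in == C_out), or the
  // generic 3x3 convolution.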
if (param->Filter()->dims()[2] == 1 && param->Filter()->dims()[3] == 1) {
param->Filter()->InitNImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("convBNAdd_1x1", "conv_bn_add_relu_kernel.cl");
DLOG << " conv bn add relu conv 1x1";
} else if (param->Filter()->dims()[1] == 1 &&
param->Input()->dims()[1] == param->Output()->dims()[1] &&
param->Filter()->dims()[2] == 3) {
param->Filter()->InitDWImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("depth_convBNAdd_3x3",
"conv_bn_add_relu_kernel.cl");
DLOG << " conv bn add relu depth_conv_3x3";
} else if (param->Filter()->dims()[2] == 3 &&
param->Filter()->dims()[3] == 3) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("convBNAdd_3x3", "conv_bn_add_relu_kernel.cl");
DLOG << " conv bn add relu conv_3x3";
} else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
}
return true;
}
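// A standalone sketch checking the folding algebra above; the constants and
// `main` are illustrative only and not part of paddle-mobile.
#include <cassert>
#include <cmath>

int main() {
  const float x = 0.5f, mean = 0.1f, var = 0.04f, scale = 2.0f, bias = 0.3f,
              eps = 1e-5f;
  const float inv_std = 1.0f / std::sqrt(var + eps);
  const float new_scale = inv_std * scale;
  const float new_bias = bias - mean * inv_std * scale;
  // Original batch norm vs. the folded per-channel affine form.
  const float bn = scale * (x - mean) * inv_std + bias;
  const float folded = new_scale * x + new_bias;
  assert(std::fabs(bn - folded) < 1e-6f);
  return 0;
}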
template <>
void ConvBNAddReluKernel<GPU_CL, float>::Compute(
const FusionConvBNAddReluParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto biase = param.Bias()->GetCLImage();
auto new_scale = param.NewScale()->GetCLImage();
auto new_bias = param.NewBias()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
// DLOG << " c block " << c_block;
// DLOG << " w " << w;
// DLOG << " nh " << nh;
// DLOG << " stride " << stride;
// DLOG << " offset " << offset;
// DLOG << " input_c " << input_c;
// DLOG << " dilation " << dilation;
// DLOG << " input width " << input_width;
// DLOG << " input height " << input_height;
// DLOG << " output width " << output_width;
// DLOG << " output height " << output_height;
// DLOG << " input dim " << *param.Input();
// DLOG << " output dim " <<* param.Output();
// DLOG << " filter dim " << *param.Filter();
// DLOG<<*param.Bias();
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &biase);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(cl_mem), &new_scale);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 7, sizeof(cl_mem), &new_bias);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 8, sizeof(cl_mem), &output);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 9, sizeof(int), &stride);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 10, sizeof(int), &offset);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_c);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 12, sizeof(int), &dilation);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 13, sizeof(int), &input_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 14, sizeof(int), &input_height);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 15, sizeof(int), &output_width);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 16, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class ConvBNAddReluKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DEPTHWISECONV_OP
#include "operators/kernel/depthwise_conv_kernel.h"
#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DepthwiseConvKernel<GPU_CL, float>::Init(ConvParam<GPU_CL> *param) {
DLOG << " depthwise conv kernel init begin ";
PADDLE_MOBILE_ENFORCE(
param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
param->Paddings()[0] == param->Paddings()[1],
"need equal");
param->Filter()->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
static_cast<int>(param->Paddings()[1]);
param->SetOffset(offset);
this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
DLOG << " depthwise conv kernel init end ";
return true;
}
template <>
void DepthwiseConvKernel<GPU_CL, float>::Compute(
const ConvParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
int c_block = default_work_size[0];
int w = default_work_size[1];
int nh = default_work_size[2];
auto input = param.Input()->GetCLImage();
auto filter = param.Filter()->GetCLImage();
auto output = param.Output()->GetCLImage();
int stride = param.Strides()[0];
int offset = param.Offset();
int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
param.Input()->Converter())
->GetCBlock();
int dilation = param.Dilations()[0];
int input_width = param.Input()->dims()[3];
int input_height = param.Input()->dims()[2];
int output_width = param.Output()->dims()[3];
int output_height = param.Output()->dims()[2];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
status = clSetKernelArg(kernel, 1, sizeof(int), &w);
status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
CL_CHECK_ERRORS(status);
// cl_event out_event = param.Output()->GetClEvent();
// cl_event wait_event = param.Input()->GetClEvent();
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
template class DepthwiseConvKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
///* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
// http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License. */
//
//#ifdef DEQUANT_OP
//
//#include "operators/kernel/dequantize_kernel.h"
//
// namespace paddle_mobile {
// namespace operators {
//
// template <>
// bool DequantizeKernel<GPU_CL, float>::Init(DequantizeParam<GPU_CL> *param) {
// DLOG << " depthwise conv kernel init begin ";
// PADDLE_MOBILE_ENFORCE(
// param->Filter()->dims()[2] == param->Filter()->dims()[3] &&
// param->Paddings()[0] == param->Paddings()[1],
// "need equal");
// param->Filter()->InitCLImage(cl_helper_.CLContext(),
// this->cl_helper_.CLCommandQueue());
// int offset = static_cast<int>(param->Filter()->dims()[2]) / 2 -
// static_cast<int>(param->Paddings()[1]);
// param->SetOffset(offset);
// this->cl_helper_.AddKernel("depth_conv_3x3", "conv_add_bn_relu_kernel.cl");
// DLOG << " depthwise conv kernel init end ";
// return true;
//}
//
// template <>
// void DequantizeKernel<GPU_CL, float>::Compute(
// const DequantizeParam<GPU_CL> &param) {
// auto kernel = this->cl_helper_.KernelAt(0);
// auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Output());
// int c_block = default_work_size[0];
// int w = default_work_size[1];
// int nh = default_work_size[2];
// auto input = param.Input()->GetCLImage();
// auto filter = param.Filter()->GetCLImage();
// auto output = param.Output()->GetCLImage();
// int stride = param.Strides()[0];
// int offset = param.Offset();
// int input_c = reinterpret_cast<framework::CLImageConverterFolder *>(
// param.Input()->Converter())
// ->GetCBlock();
// int dilation = param.Dilations()[0];
//
// int input_width = param.Input()->dims()[3];
// int input_height = param.Input()->dims()[2];
// int output_width = param.Output()->dims()[3];
// int output_height = param.Output()->dims()[2];
//
// cl_int status;
//
// status = clSetKernelArg(kernel, 0, sizeof(int), &c_block);
// status = clSetKernelArg(kernel, 1, sizeof(int), &w);
// status = clSetKernelArg(kernel, 2, sizeof(int), &nh);
// status = clSetKernelArg(kernel, 3, sizeof(cl_mem), &input);
// status = clSetKernelArg(kernel, 4, sizeof(cl_mem), &filter);
// status = clSetKernelArg(kernel, 5, sizeof(cl_mem), &output);
// status = clSetKernelArg(kernel, 6, sizeof(int), &stride);
// status = clSetKernelArg(kernel, 7, sizeof(int), &offset);
// status = clSetKernelArg(kernel, 8, sizeof(int), &input_c);
// status = clSetKernelArg(kernel, 9, sizeof(int), &dilation);
// status = clSetKernelArg(kernel, 10, sizeof(int), &input_width);
// status = clSetKernelArg(kernel, 11, sizeof(int), &input_height);
// status = clSetKernelArg(kernel, 12, sizeof(int), &output_width);
// status = clSetKernelArg(kernel, 13, sizeof(int), &output_height);
//
// CL_CHECK_ERRORS(status);
//
// // cl_event out_event = param.Output()->GetClEvent();
// // cl_event wait_event = param.Input()->GetClEvent();
//
// status = clEnqueueNDRangeKernel(
// this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(),
// NULL, default_work_size.data(), NULL, 0, NULL, NULL);
//
// CL_CHECK_ERRORS(status);
//}
//
// template class DepthwiseConvKernel<GPU_CL, float>;
//
//} // namespace operators
//} // namespace paddle_mobile
//
//#endif
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef DROPOUT_OP
#include "operators/kernel/dropout_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool DropoutKernel<GPU_CL, float>::Init(DropoutParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("dropout", "dropout_kernel.cl");
return true;
}
template <>
void DropoutKernel<GPU_CL, float>::Compute(const DropoutParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*(param.Out()));
auto *input_image = param.InputX()->GetCLImage();
auto *output_image = param.Out()->GetCLImage();
const float dropoutProb = param.DropoutProb();
const auto &inputDim = param.InputX()->dims();
int input_dims[4] = {1, 1, 1, 1};
// e.g. a (1, 1000, 1, 1) input gives input_dims = {1, 1000, 1, 1}
for (int i = 0; i < inputDim.size(); i++) {
input_dims[4 - inputDim.size() + i] = inputDim[i];
}
int out_W = input_dims[1];
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(float), &dropoutProb);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -24,7 +24,11 @@ bool ElementwiseAddKernel<GPU_CL, float>::Init(
ElementwiseAddParam<GPU_CL> *param) {
DLOG << "-----init add-----";
CLImage *bias = (CLImage *)(param->InputY());
bias->InitCLImage(cl_helper_.CLContext(), this->cl_helper_.CLCommandQueue());
if (!bias->isInit()) {
bias->InitCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
}
DLOG << " bias: " << *bias;
if (bias->dims().size() == 4) {
this->cl_helper_.AddKernel("elementwise_add", "elementwise_add_kernel.cl");
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef MUL_OP
#include "operators/kernel/mul_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool MulKernel<GPU_CL, float>::Init(MulParam<GPU_CL> *param) {
return true;
}
template <>
void MulKernel<GPU_CL, float>::Compute(const MulParam<GPU_CL> &param) {}
template class MulKernel<GPU_CL, float>;
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -23,12 +23,61 @@ namespace operators {
template <>
bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
bool relu_enabled = false;
auto input = const_cast<Tensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
sizeof(float)); // NOLINT
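  // bs_ptr holds 2 * channel * sub_conv_n floats: the first half is the bias
  // replicated once per sub-convolution, the second half the scale factors
  // (all 1 here).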
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
float max_value = fpga::filter_find_max(filter);
fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]);
// int element_num_per_div =
// fpga::get_filter_num_per_div(filter, param->Groups());
// deconv only supports group = 1 and no split
fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n,
channel * sub_conv_n);
fpga::format_fp16_ofm(out);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
return true;
}
template <>
void DeconvAddKernel<FPGA, float>::Compute(
const FusionDeconvAddParam<FPGA> &param) {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -24,12 +24,60 @@ namespace operators {
template <>
bool DeconvAddReluKernel<FPGA, float>::Init(
FusionDeconvAddReluParam<FPGA> *param) {
bool relu_enabled = true;
auto input = const_cast<Tensor *>(param->Input());
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n *
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
"filter width should be equal to filter height ");
PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0),
"filter axis should be the multiple of stride axis ");
float max_value = fpga::filter_find_max(filter);
fpga::format_deconv_filter(filter, max_value, param->Groups(),
param->Strides()[0]);
// int element_num_per_div =
// fpga::get_filter_num_per_div(filter, param->Groups());
// deconv only supports group = 1 and no split
fpga::format_bias_scale_array(&bs_ptr, channel * sub_conv_n,
channel * sub_conv_n);
fpga::format_fp16_ofm(out);
fpga::DeconvArgs deconv_arg = {0};
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled,
param->Groups(), param->Strides()[0],
param->Strides()[1], param->Paddings()[0],
param->Paddings()[1], bs_ptr);
param->SetFpgaArgs(deconv_arg);
return true;
}
template <>
void DeconvAddReluKernel<FPGA, float>::Compute(
const FusionDeconvAddReluParam<FPGA> &param) {
fpga::ComputeFpgaDeconv(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -21,7 +21,7 @@ namespace operators {
template <>
bool ElementwiseAddReluKernel<FPGA, float>::Init(
ElementwiseAddReluParam<FPGA> *param) {
bool relu_enabled = true;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
......
......@@ -47,7 +47,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
fpga::format_fc_data(filter, out, &bs_ptr);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1,
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#if defined(__ARM_NEON__) && defined(__aarch64__)
#include "operators/math/depthwise_conv3x3.h"
#ifdef __ARM_NEON__
#include <arm_neon.h>
#endif
namespace paddle_mobile {
namespace operators {
namespace math {
// template<>
// void DepthwiseConv3x3<int8_t, int32_t>(
// const framework::Tensor *input, const framework::Tensor *filter,
// const std::vector<int> &strides, framework::Tensor *output) {
// PADDLE_MOBILE_THROW_EXCEPTION(
// "Depthwise conv with generic strides has not been implemented.");
// }
template <>
void DepthwiseConv3x3S1<int8_t, int32_t>(const framework::Tensor &input,
const framework::Tensor &filter,
const std::vector<int> &paddings,
framework::Tensor *output) {
PADDLE_MOBILE_THROW_EXCEPTION(
"Depthwise conv3x3 with stride 1 for arm v8 has not been implemented.");
}
template <>
void DepthwiseConv3x3S2<int8_t, int32_t>(const framework::Tensor &input,
const framework::Tensor &filter,
const std::vector<int> &paddings,
framework::Tensor *output) {
PADDLE_MOBILE_THROW_EXCEPTION(
"Depthwise conv3x3 with stride 2 for arm v8 has not been implemented.");
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -15,6 +15,10 @@ limitations under the License. */
#pragma once
#include <string>
#include "common/log.h"
#include "memory/t_malloc.h"
#ifdef _OPENMP
#include <omp.h>
#endif
// Element-access macros for matrices, assuming row-major storage
#define A(i, j) A[(i)*lda + (j)]
......@@ -23,10 +27,12 @@ limitations under the License. */
#if __aarch64__
#define MR_INT8 4
#define NR_INT8 2
#define MR 6
#define NR 16
#else
#define MR_INT8 4
#define NR_INT8 2
#define MR 6
#define NR 8
#endif
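// MR_INT8 x NR_INT8 (4x2) is the int32 block produced by the int8
// micro-kernel AddDot4x2; MR x NR is the float micro-kernel tile.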
......@@ -170,6 +176,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
void SgemmWithBn(int m, int n, int k, float alpha, const float *A, int lda,
const float *B, int ldb, float beta, float *C, int ldc,
bool relu, float *new_scale, float *new_bias, float *bias);
void SgemmWithPRelu(int m, int n, int k, const float *A, int lda,
const float *B, int ldb, float *C, int ldc, float *p,
std::string mode, float *bias, float *bias1);
......@@ -193,52 +200,72 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 8 bits int small block inner product
void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
void AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
// 8 bits int inner product
template <typename Otype>
void InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
const int8_t *b, float beta, int32_t *c, Otype *C,
int32_t ldc, bool relu);
template <typename Otype>
void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a,
const int8_t *b, float beta, int32_t *c, Otype *C,
int32_t ldc, bool relu, int32_t *bias,
bool addOnRow = false);
// 8 bits int pack function
void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
void PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer);
void PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer);
// 8 bits int matrix product
template <typename Itype, typename Btype, typename Otype>
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, Btype *bias, bool addOnRow = false);
template <typename Otype>
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
Otype *C, int32_t ldc, bool relu, int32_t *bias,
bool addOnRow = false);
template <typename Itype, typename Btype, typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const Itype *A,
int32_t lda, const Itype *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, Btype *bias, bool addOnRow = false);
template <typename Otype>
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta, Otype *C,
int32_t ldc, bool relu, int32_t *bias, bool addOnRow = false);
// 8 bits int write back
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
// C = A * B + C
void WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias
void WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias, relu(C)
void WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
// C = A * B + bias, scale * relu(C)
void WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale);
// C = A * B + bias, scale * C, bias is added on column
void WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale);
// C = A * B + bias, scale * C, bias is added on row
void WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale);
private:
int MC = 0;
......@@ -254,10 +281,218 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 8 bits int
int8_t *packedA_int8;
int8_t *packedB_int8;
int32_t *packedC_int32;
int8_t *zero_int8;
};
// 8 bits int matrix product (m*k x k*n)
template <typename Otype>
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
Otype *C, int32_t ldc, bool relu, int32_t *bias,
bool addOnRow) {
// L1 data cache is 32 KiB (per core on Cortex-A57, Cortex-A72, Cortex-A73)
// L2 cache is 0.5~4 MiB (per Cortex-A72 cluster)
int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024;
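  // Round k up to the next multiple of 16 so the packed panels match the
  // 16-deep int8 micro-kernel.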
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8
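  // Size MC from L1, even out the m blocks, then round up to MR_INT8.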
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
NC = NR_INT8;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
}
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) {
InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu);
} else {
if (addOnRow) {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + j,
addOnRow);
} else {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + i,
addOnRow);
}
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32);
paddle_mobile::memory::Free(zero_int8);
}
// 8 bits int matrix product (m*k x k*n), omp version
template <typename Otype>
void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
float beta, Otype *C, int32_t ldc, bool relu,
int32_t *bias, bool addOnRow) {
#ifdef _OPENMP
int32_t max_threads = omp_get_max_threads();
#else
int32_t max_threads = 1;
#endif
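  // Split the L1 budget across threads, since each thread packs and consumes
  // its own panel.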
int32_t L1 = 64 / max_threads * 1024;
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
if (m > n) {
// partition A into blocks
MC = L1 / (KC * sizeof(int8_t));
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// pad B up to a multiple of NR_INT8
NC = (n + NR_INT8 - 1) / NR_INT8 * NR_INT8;
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__
// TODO()
#else
PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8);
#endif
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
} else {
// partition B into blocks
NC = L1 / (KC * sizeof(int8_t));
if (NC == 0) {
NC = NR_INT8;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
}
// pad A up to a multiple of MR_INT8
MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__
// TODO()
#else
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8);
#endif
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
}
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
if (m > n) {
#pragma omp parallel for
for (int32_t i = 0; i < m; i += MC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t mc;
mc = s_min(m - i, MC);
int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
#else
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif
if (bias == nullptr) {
InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu);
} else {
if (addOnRow) {
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta,
local_C, &C(i, 0), ldc, relu, bias, addOnRow);
} else {
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta,
local_C, &C(i, 0), ldc, relu, bias + i, addOnRow);
}
}
}
} else {
#pragma omp parallel for
for (int32_t j = 0; j < n; j += NC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t nc;
nc = s_min(n - j, NC);
int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads;
#if __aarch64__
// TODO()
#else
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B);
#endif
if (bias == nullptr) {
InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu);
} else {
if (addOnRow) {
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta,
local_C, &C(0, j), ldc, relu, bias + j, addOnRow);
} else {
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta,
local_C, &C(0, j), ldc, relu, bias, addOnRow);
}
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32);
paddle_mobile::memory::Free(zero_int8);
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -14,10 +14,11 @@ limitations under the License. */
#include <string.h>
#include "common/log.h"
#include "memory/t_malloc.h"
#include "operators/math/gemm.h"
#if __ARM_NEON
#include <arm_neon.h>
#include <iostream>
#endif
#ifdef _OPENMP
#include <omp.h>
......@@ -30,7 +31,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO()
#else
const int8_t *a_ptr, *b_ptr;
a_ptr = a;
......@@ -62,7 +63,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"pld [%[b_ptr], #128] \n\t"
"vld1.s8 {d0-d3}, [%[a_ptr]]! \n\t" // load A 8 cols
"vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t" // load B first 4 rows
"vmovl.s8 q2, d0 \n\t" // process B first 4
"vmovl.s8 q2, d0 \n\t" // process B first
// rows
"vmovl.s8 q3, d8 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t"
......@@ -241,12 +242,141 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
#endif // __ARM_NEON
}
// The core idea of the AddDot4x2 function is borrowed from Google's gemmlowp
// open-source library: https://github.com/google/gemmlowp.
void Gemm::AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
#define PADDLE_LABEL_LOOP "1"
#define PADDLE_LABEL_AFTER_LOOP "2"
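  // 16-deep int8 dot products: vmull.s8 widens to 16-bit products and
  // vpadal.s16 pairwise-accumulates them into 32-bit lanes, yielding a 4x2
  // int32 block after the final vpadd reductions.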
asm volatile(
"lsl %[ldc], %[ldc], #2 \n\t" // sizeof(int32) == 4
"vldr d0, [%[b], #0] \n\t"
"vmov.s32 q8, #0 \n\t"
"vldr d4, [%[a], #0] \n\t"
"vmov.s32 q9, q8 \n\t"
"vldr d2, [%[b], #16] \n\t"
"vmov.s32 q10, q8 \n\t"
"vldr d6, [%[a], #16] \n\t"
"vmov.s32 q11, q8 \n\t"
"vldr d1, [%[b], #8]\n\t"
"vmov.s32 q12, q8 \n\t"
"vldr d5, [%[a], #8]\n"
"vmov.s32 q13, q8 \n\t"
"vldr d3, [%[b], #24]\n\t"
"vmov.s32 q14, q8 \n\t"
"vldr d7, [%[a], #24]\n"
"vmov.s32 q15, q8 \n\t"
PADDLE_LABEL_LOOP
": \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"add %[b], %[b], #32 \n\t"
"vmull.s8 q5, d2, d4 \n\t"
"vldr d4, [%[a], #32] \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vldr d6, [%[a], #48] \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vmlal.s8 q5, d3, d5 \n\t"
"vldr d5, [%[a], #40] \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vldr d7, [%[a], #56] \n\t"
"vpadal.s16 q8, q4 \n\t" // pairwise-add
"add %[a], %[a], #64 \n\t"
"vpadal.s16 q9, q5 \n\t"
"subs %[k], %[k], #16 \n\t"
"vpadal.s16 q10, q6 \n\t"
"vpadal.s16 q11, q7 \n\t"
"beq " PADDLE_LABEL_AFTER_LOOP
"f \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"vmull.s8 q5, d2, d4 \n\t"
"vldr d4, [%[a], #0] \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vldr d0, [%[b], #0] \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vldr d2, [%[b], #16] \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vldr d6, [%[a], #16] \n\t"
"vmlal.s8 q5, d3, d5 \n\t"
"vldr d5, [%[a], #8] \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vldr d1, [%[b], #8] \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vldr d3, [%[b], #24] \n\t"
"vpadal.s16 q12, q4 \n\t" // pairwise-add
"vldr d7, [%[a], #24] \n\t"
"vpadal.s16 q13, q5 \n\t"
"vpadal.s16 q14, q6 \n\t"
"vpadal.s16 q15, q7 \n\t"
"b " PADDLE_LABEL_LOOP "b \n\t"
PADDLE_LABEL_AFTER_LOOP
": \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"vmull.s8 q5, d2, d4 \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vmlal.s8 q5, d3, d5 \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vpadal.s16 q12, q4 \n\t" // pairwise-add
"vpadal.s16 q13, q5 \n\t"
"vpadal.s16 q14, q6 \n\t"
"vpadal.s16 q15, q7 \n\t"
"vpadd.s32 d0, d16, d17 \n\t" // reduce to int32
"vpadd.s32 d1, d18, d19 \n\t"
"vpadd.s32 d2, d20, d21 \n\t"
"vpadd.s32 d3, d22, d23 \n\t"
"vpadd.s32 d4, d24, d25 \n\t"
"vpadd.s32 d5, d26, d27 \n\t"
"vpadd.s32 d6, d28, d29 \n\t"
"vpadd.s32 d7, d30, d31 \n\t"
"vpadd.s32 d8, d0, d1 \n\t" // reduce to int32 again
"vpadd.s32 d9, d2, d3 \n\t"
"vpadd.s32 d10, d4, d5 \n\t"
"vpadd.s32 d11, d6, d7 \n\t"
"vst1.32 {d8}, [%[c]], %[ldc] \n\t"
"vst1.32 {d9}, [%[c]], %[ldc] \n\t"
"vst1.32 {d10}, [%[c]], %[ldc] \n\t"
"vst1.32 {d11}, [%[c]] \n\t"
: [k] "+r"(k), [a] "+r"(a), [b] "+r"(b), [c] "+r"(c)
: [ldc] "r"(ldc)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10", "q11", "q12", "q13", "q14", "q15");
#undef PADDLE_LABEL_AFTER_LOOP
#undef PADDLE_LABEL_LOOP
#endif // __aarch64__
#endif // __ARM_NEON
}
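// A scalar sketch of what AddDot4x2 computes, assuming the 16-deep packed
// layouts written by PackMatrixA_4r_16 and PackMatrixB_2c_16 below;
// AddDot4x2_reference is a hypothetical name, not part of this file.
#include <cstdint>

void AddDot4x2_reference(int32_t k, const int8_t *a, const int8_t *b,
                         int32_t *c, int32_t ldc) {
  // k is already padded to a multiple of 16 by the packing routines.
  for (int32_t i = 0; i < 4; ++i) {    // 4 rows of A
    for (int32_t j = 0; j < 2; ++j) {  // 2 columns of B
      int32_t acc = 0;
      for (int32_t p = 0; p < k; p += 16) {
        const int8_t *pa = a + (p / 16) * 64 + i * 16;  // 4 rows * 16 deep
        const int8_t *pb = b + (p / 16) * 32 + j * 16;  // 2 cols * 16 deep
        for (int32_t q = 0; q < 16; ++q) {
          acc += static_cast<int32_t>(pa[q]) * static_cast<int32_t>(pb[q]);
        }
      }
      c[i * ldc + j] = acc;
    }
  }
}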
// 8 bits int small block inner product
void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
const int8_t *a_ptr, *b_ptr;
a_ptr = a;
......@@ -539,51 +669,229 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
}
// 8 bits int inner product
template <>
void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
                       const int8_t *b, float beta, int32_t *c, int8_t *C,
                       int32_t ldc, bool relu) {}
template <>
void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a,
                       const int8_t *b, float beta, int32_t *c, int32_t *C,
                       int32_t ldc, bool relu) {
#pragma omp parallel for
  for (int32_t j = 0; j < nc; j += NR_INT8) {
    for (int32_t i = 0; i < mc; i += MR_INT8) {
#if __aarch64__
      // TODO
#else
      // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif  // __aarch64__
    }
  }
  if (!relu) {
    WriteBasic(mc, nc, c, C, ldc);
    return;
  }
}
template <>
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
                               const int8_t *a, const int8_t *b, float beta,
                               int32_t *c, int8_t *C, int32_t ldc, bool relu,
                               int32_t *bias, bool addOnRow) {
#pragma omp parallel for
  for (int32_t j = 0; j < nc; j += NR_INT8) {
    for (int32_t i = 0; i < mc; i += MR_INT8) {
#if __aarch64__
      // TODO
#else
      // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
      AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif  // __aarch64__
    }
  }
  // Dispatch the quantized write-back: ReLU fuses with the scale, and the
  // bias can be added per-row or per-column.
  if (relu) {
    WriteWithAddReluScale(mc, nc, c, C, ldc, bias, alpha);
    return;
  } else {
    if (addOnRow) {
      WriteWithAddScaleT(mc, nc, c, C, ldc, bias, alpha);
    } else {
      WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha);
    }
    return;
  }
}
template <>
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
const int8_t *a, const int8_t *b, float beta,
int32_t *c, int32_t *C, int32_t ldc, bool relu,
int32_t *bias, bool addOnRow) {}
// 8 bits int PackMatrixA_4r_16
void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
const int8_t *a3 = A + (i + 3) * lda;
int8_t *local_buffer = buffer + i * KC;
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (m_tail != 0) {
const int8_t *a0 = &A(i_length, 0);
const int8_t *a1 = a0 + lda;
const int8_t *a2 = a0 + 2 * lda;
const int8_t *a3 = a0 + 3 * lda;
int8_t *local_buffer = buffer + i_length * KC;
switch (m_tail) {
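      // Intentional fall-through: every row past m_tail reads from zero_int8
      // so the packed block is zero-padded.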
case 1:
a1 = zero_int8;
case 2:
a2 = zero_int8;
case 3:
a3 = zero_int8;
break;
default:
break;
}
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
// 8 bits int PackMatrixA_4r
void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer) {
const int8_t *a0, *a1, *a2, *a3;
for (int32_t i = 0; i < m - m_tail; i += 4) {
a0 = A + i * lda;
a1 = A + (i + 1) * lda;
a2 = A + (i + 2) * lda;
......@@ -625,7 +933,7 @@ void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail;
for (int32_t i = 0; i < i_length; i += 6) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
......@@ -676,17 +984,85 @@ void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
}
}
// 8 bits int PackMatrixB
void Gemm::PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
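  // B is packed two columns at a time, 16 rows deep, matching the panel
  // order AddDot4x2 streams through; tails are zero-padded up to KC.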
for (int32_t j = 0; j < j_length; j += 2) {
int8_t *local_buffer = buffer + j * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j);
const int8_t *b1 = &B((i << 4), j + 1);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b1;
b1 += ldb;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j);
const int8_t *b1 = &B((k_count << 4), j + 1);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b1;
b1 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (n_tail != 0) {
int8_t *local_buffer = buffer + j_length * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j_length);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = 0;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j_length);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
// 8 bits int PackMatrixB
void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
for (int32_t j = 0; j < j_length; j += 8) {
int8_t *local_buffer = buffer + j * k;
for (int32_t i = 0; i < k; ++i) {
const int8_t *b0 = &B(i, j);
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
// "pld [%[b0]] \n\t"
......@@ -715,94 +1091,27 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
for (int32_t j = j_length; j < n; ++j) {
*local_buffer++ = *b0++;
}
for (int32_t j = n; j < j_length + 8; ++j) {
*local_buffer++ = 0;
}
}
}
}
// 8 bits int matrix product (m*k x k*n)
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, int8_t beta,
int32_t *C, int32_t ldc, bool relu, int8_t *bias) {
// L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73)
// L2 cache is 0.5~4 Mib (Contex-A72 cluster)
int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024;
KC = k;
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
NC = NR;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR - 1) / NR * NR;
}
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int8 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
// PackMatrixA_6r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int8, &C(i, j), ldc, relu, nullptr);
} else {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int8, &C(i, j), ldc, relu, bias + i);
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int8);
paddle_mobile::memory::Free(zero_int8);
}
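// The block sizes above follow the usual cache-blocking rule: an MC x KC
// panel of A should fit in L1 and a KC x NC panel of B in L2, evened out
// across the matrix and rounded up to the register tile. A self-contained
// sketch of that computation (the function name is illustrative):
static inline int32_t compute_block_size(int32_t total, int32_t cache_bytes,
                                         int32_t kc, int32_t tile) {
  int32_t block = cache_bytes / (kc * static_cast<int32_t>(sizeof(int8_t)));
  if (block == 0) return tile;                    // cache smaller than a panel
  int32_t nblocks = (total + block - 1) / block;  // blocks needed to cover
  block = (total + nblocks - 1) / nblocks;        // spread the work evenly
  return (block + tile - 1) / tile * tile;        // round up to the tile
}
// e.g. MC = compute_block_size(m, 32 * 1024, KC, MR_INT8);
//      NC = compute_block_size(n, 512 * 1024, KC, NR);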
// 8 bits int write back
// C = alpha * A * B + beta * C
void Gemm::WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) {}
// C = A * B, 8-bit int (int32_t output)
// C = A * B
void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
// TODO
#else
int32_t nc1 = nc >> 4;
int32_t _nc1 = nc & 15;
int32_t step = sizeof(int32_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 4));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile C_ptr;
int32_t *C0, *c0;
c_ptr = c;
......@@ -836,7 +1145,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3");
}
......@@ -854,20 +1163,372 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
#endif // __ARM_NEON
}
// C = A * B + C
void Gemm::WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) {}
// C = A * B + bias, scale * C, bias is added on column
void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int8_t narrow = -128;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"loop_mc_%=: \n\t"
"vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t"
"vdup.32 q13, d26[0] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q13 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vceq.s8 d15, d14, d24 \n\t"
"vsub.s8 d14, d14, d15 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
// C = A * B + bias
void Gemm::WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias) {}
// C = A * B + C, relu(C)
void Gemm::WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) {}
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q7", "q12", "q13", "q15");
}
// C = A * B + bias, relu(C)
void Gemm::WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias) {}
int32_t nc_left;
int32_t *c0;
int8_t *C0;
int32_t bias_v;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
bias_v = *(bias_ptr + i);
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"vdup.32 q13, %[bias_v] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]] \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0),
[bias_v] "r"(bias_v), [scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
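// A scalar reference for the NEON write-back above (a sketch, not library
// API). It mirrors the instruction sequence: saturating add of the per-row
// bias (vqadd.s32), float scaling, conversion back to integer
// (vcvt.s32.f32 rounds toward zero), saturating narrow to int8 (vqmovn),
// then the vceq/vsub pair, which bumps -128 up to -127 so the output range
// stays symmetric.
static inline int8_t write_with_add_scale_ref(int32_t acc, int32_t bias,
                                              float scale) {
  int64_t sum = static_cast<int64_t>(acc) + static_cast<int64_t>(bias);
  if (sum > INT32_MAX) sum = INT32_MAX;
  if (sum < INT32_MIN) sum = INT32_MIN;
  int32_t q = static_cast<int32_t>(static_cast<float>(sum) * scale);
  if (q > 127) q = 127;
  if (q < -128) q = -128;
  return static_cast<int8_t>(q == -128 ? -127 : q);
}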
// C = A * B + bias, scale * C, bias is added on row
void Gemm::WriteWithAddScaleT(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int8_t narrow = -128;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"loop_mc_%=: \n\t"
"mov r4, %[bias_ptr] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q13, q14}, [r4]! \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q14 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vceq.s8 d15, d14, d24 \n\t"
"vsub.s8 d14, d14, d15 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "r4", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5",
"q6", "q7", "q12", "q13", "q15");
}
int32_t nc_left;
int32_t *c0;
int8_t *C0;
int32_t *volatile bias0 = bias_ptr + nc1 * 8;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vld1.32 {q13}, [%[bias0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]] \n\t"
"vld1.32 {q13}, [%[bias0]] \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), [bias0] "r"(bias0),
[scale] "r"(scale), [narrow] "r"(narrow)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
// C = A * B + bias, scale * relu(C), bias is added on column
void Gemm::WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
int32_t zero = 0;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"loop_mc_%=: \n\t"
"vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t"
"vdup.32 q13, d26[0] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vmax.s32 q1, q1, q14 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [zero] "r"(zero)
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q7", "q13", "q14", "q15");
}
int32_t nc_left;
int32_t *c0;
int8_t *C0;
int32_t bias_v;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
bias_v = *(bias_ptr + i);
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"vdup.32 q13, %[bias_v] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0),
[bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q13", "q14", "q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
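// The relu variant above differs from WriteWithAddScale only in clamping the
// int32 accumulator at zero before scaling (vmax.s32 against q14 == 0); the
// result is then non-negative, so the -128 -> -127 fixup is not needed.
// A scalar sketch:
static inline int8_t write_with_add_relu_scale_ref(int32_t acc, int32_t bias,
                                                   float scale) {
  int64_t sum = static_cast<int64_t>(acc) + static_cast<int64_t>(bias);
  if (sum > INT32_MAX) sum = INT32_MAX;  // vqadd.s32
  if (sum < 0) sum = 0;                  // vmax.s32 with zero: the relu
  int32_t q = static_cast<int32_t>(static_cast<float>(sum) * scale);
  return static_cast<int8_t>(q > 127 ? 127 : q);  // vqmovn saturation
}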
} // namespace math
} // namespace operators
......
......@@ -27,130 +27,17 @@ namespace paddle_mobile {
namespace operators {
namespace math {
// 8 bits int matrix product (m*k x k*n)
void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha,
const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
int8_t beta, int32_t *C, int32_t ldc, bool relu,
int8_t *bias) {
#ifdef _OPENMP
int32_t max_threads = omp_get_max_threads();
#else
int32_t max_threads = 1;
#endif
int32_t L1 = 64 / max_threads * 1024;
KC = k;
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
if (m > n) {
// block matrix A
MC = L1 / (KC * sizeof(int8_t));
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// pad B to a multiple of NR
NC = (n + NR - 1) / NR * NR;
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixB_omp_8c(KC, n, n % NR, B, ldb, packedB_int8);
#endif
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
} else {
// block matrix B
NC = L1 / (KC * sizeof(int8_t));
if (NC == 0) {
NC = NR;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR - 1) / NR * NR;
}
// pad A to a multiple of MR_INT8
MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixA_omp_4r(m, KC, m % MR_INT8, A, lda, packedA_int8);
#endif
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
}
packedC_int8 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
if (m > n) {
#pragma omp parallel for
for (int32_t i = 0; i < m; i += MC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t mc;
mc = s_min(m - i, MC);
int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int8 + MC * NC * local_threads;
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif
InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu, bias + i);
}
} else {
#pragma omp parallel for
for (int32_t j = 0; j < n; j += NC) {
#ifdef _OPENMP
int32_t local_threads = omp_get_thread_num();
#else
int32_t local_threads = 0;
#endif
int32_t nc;
nc = s_min(n - j, NC);
int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int8 + MC * NC * local_threads;
#if __aarch64__
// TODO(wzzju)
#else
PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, local_B);
#endif
InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu, bias);
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int8);
paddle_mobile::memory::Free(zero_int8);
}
void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
#pragma omp parallel for
for (int32_t j = 0; j < j_length; j += NR) {
for (int32_t j = 0; j < j_length; j += 8) {
int8_t *local_buffer = buffer + j * k;
for (int32_t i = 0; i < k; ++i) {
const int8_t *b0 = &B(i, j);
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
// TODO
#else
asm volatile(
// "pld [%[b0]] \n\t"
......@@ -179,7 +66,7 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
for (int32_t j = j_length; j < n; ++j) {
*local_buffer++ = *b0++;
}
for (int32_t j = n; j < j_length + NR; ++j) {
for (int32_t j = n; j < j_length + 8; ++j) {
*local_buffer++ = 0;
}
}
......@@ -188,9 +75,9 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) {
const int i_length = m - m_tail;
const int32_t i_length = m - m_tail;
#pragma omp parallel for
for (int32_t i = 0; i < i_length; i += MR_INT8) {
for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
......@@ -221,7 +108,7 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
default:
break;
}
for (int j = 0; j < k; ++j) {
for (int32_t j = 0; j < k; ++j) {
*local_buffer++ = *a0++;
*local_buffer++ = *a1++;
*local_buffer++ = *a2++;
......@@ -230,6 +117,232 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
}
}
// 8 bits int PackMatrixA_omp_4r_16
void Gemm::PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
#pragma omp parallel for
for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
const int8_t *a3 = A + (i + 3) * lda;
int8_t *local_buffer = buffer + i * KC;
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (m_tail != 0) {
const int8_t *a0 = &A(i_length, 0);
const int8_t *a1 = a0 + lda;
const int8_t *a2 = a0 + 2 * lda;
const int8_t *a3 = a0 + 3 * lda;
int8_t *local_buffer = buffer + i_length * KC;
switch (m_tail) {
case 1:
a1 = zero_int8;
case 2:
a2 = zero_int8;
case 3:
a3 = zero_int8;
break;
default:
break;
}
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
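// The packed-A layout mirrors the B packing: rows in groups of four, k in
// 16-column chunks, tails zero-padded up to KC. Inside a full chunk the
// packed offset of A(i, j) is (i/4) * 4 * KC + (j/16) * 64 +
// (i%4) * 16 + (j%16); a sketch with a hypothetical helper name:
static inline int32_t packed_a_4r_16_offset(int32_t i, int32_t j, int32_t kc) {
  return (i >> 2) * 4 * kc + (j >> 4) * 64 + (i & 3) * 16 + (j & 15);
}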
// 8 bits int PackMatrixB
void Gemm::PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
#pragma omp parallel for
for (int32_t j = 0; j < j_length; j += 2) {
int8_t *local_buffer = buffer + j * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j);
const int8_t *b1 = &B((i << 4), j + 1);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b1;
b1 += ldb;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j);
const int8_t *b1 = &B((k_count << 4), j + 1);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b1;
b1 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (n_tail != 0) {
int8_t *local_buffer = buffer + j_length * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j_length);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = 0;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j_length);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......@@ -34,12 +34,12 @@ struct GRUUnitFunctor<CPU, T> {
gemm.Sgemm_omp(batch_size, frame_size * 2, frame_size, 1,
value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1, value.gate_value, frame_size * 3, false,
nullptr);
static_cast<float *>(nullptr));
#else
gemm.Sgemm(batch_size, frame_size * 2, frame_size, 1,
value.prev_out_value, frame_size, value.gate_weight,
frame_size * 2, 1, value.gate_value, frame_size * 3, false,
nullptr);
static_cast<float *>(nullptr));
#endif
}
......@@ -51,12 +51,12 @@ struct GRUUnitFunctor<CPU, T> {
gemm.Sgemm_omp(batch_size, frame_size, frame_size, 1,
value.reset_output_value, frame_size, value.state_weight,
frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, nullptr);
frame_size * 3, false, static_cast<float *>(nullptr));
#else
gemm.Sgemm(batch_size, frame_size, frame_size, 1,
value.reset_output_value, frame_size, value.state_weight,
frame_size, 1, value.gate_value + frame_size * 2,
frame_size * 3, false, nullptr);
frame_size * 3, false, static_cast<float *>(nullptr));
#endif
}
......
......@@ -28,7 +28,13 @@ template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
T *bias = nullptr);
float *bias = nullptr);
template <typename T, typename S>
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false,
S *bias = nullptr, bool addOnRow = false);
template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
......@@ -20,11 +20,12 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
namespace math {
template <>
void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b,
int8_t alpha, framework::Tensor *matrix_out, int8_t beta,
bool relu, int8_t *bias) {
void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu, int32_t *bias,
bool addOnRow) {
auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims();
......@@ -52,21 +53,43 @@ void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
}
#ifdef _OPENMP
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
if (bias != nullptr) {
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
} else {
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
}
#else
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
if (bias != nullptr) {
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
} else {
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
}
#endif
} else {
#ifdef _OPENMP
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
if (bias != nullptr) {
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias, addOnRow);
} else {
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias, addOnRow);
}
#else
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
relu, bias);
if (bias != nullptr) {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int8_t>(),
N, relu, bias, addOnRow);
} else {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(),
N, relu, bias, addOnRow);
}
#endif
}
}
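// A hedged usage sketch of the dispatch above (illustrative, not part of the
// commit): a non-null int32 bias selects the requantized int8 output path,
// with alpha acting as the output scale; a null bias yields raw int32
// accumulators and, as in the GRU code above, must be cast so the template
// argument can be deduced.
static void matmul_int8_example(const framework::Tensor &a,
                                const framework::Tensor &b,
                                framework::Tensor *out, float scale,
                                int32_t *bias, bool add_on_row) {
  if (bias != nullptr) {
    out->mutable_data<int8_t>();  // requantized output, scaled by 'scale'
    matmul(a, false, b, false, scale, out, 0.f, false, bias, add_on_row);
  } else {
    out->mutable_data<int32_t>();  // raw accumulators
    matmul(a, false, b, false, 1.f, out, 0.f, false,
           static_cast<int32_t *>(nullptr), false);
  }
}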
......
......@@ -38,6 +38,7 @@ void Pool3x3Avgs1p1(const Tensor *input, Tensor *output) {
const int input_width = static_cast<int>(input->dims()[3]);
const int output_height = static_cast<int>(output->dims()[2]);
const int output_width = static_cast<int>(output->dims()[3]);
output->mutable_data<float>();
const int hxw = input_height * input_width;
......@@ -472,7 +473,7 @@ void Pool3x3Maxs1p1(const Tensor *input, Tensor *output) {
const int inputdata_channel_stride = h_in * w_in;
const int input_batch_stride = output_channels * inputdata_channel_stride;
const int output_batch_stride = output_channels * outputdata_channel_stride;
float *out_data = output->data<float>();
float *out_data = output->mutable_data<float>();
const float *input_data = input->data<float>();
for (int k = 0; k < batch_size; ++k) {
#pragma omp parallel for
......
......@@ -28,15 +28,21 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::vector;
void Pool3x3Avgs1p1(const Tensor *input, Tensor *output);
void Pool3x3Maxs1p1(const Tensor *input, Tensor *output);
void Pool3x3Max(vector<int> strides, vector<int> paddings, const Tensor *input,
Tensor *output);
void Pool3x3Avg(vector<int> strides, vector<int> paddings, const Tensor *in_x,
Tensor *out);
void Pool3x3Avgs1p1(const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Maxs1p1(const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Max(std::vector<int> strides, std::vector<int> paddings,
const framework::Tensor *input, framework::Tensor *output);
void Pool3x3Avg(std::vector<int> strides, std::vector<int> paddings,
const framework::Tensor *in_x, framework::Tensor *out);
void Pool3x3Maxs1_int8(const framework::Tensor *input,
framework::Tensor *output, int32_t pad_h, int32_t pad_w);
void Pool3x3Maxs2_int8(const framework::Tensor *input,
framework::Tensor *output, int32_t pad_h, int32_t pad_w);
void Pool3x3Max_int8(const std::vector<int> &strides,
const std::vector<int> &paddings,
const framework::Tensor *input, framework::Tensor *output);
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef POOL_OP
#ifdef _OPENMP
#include <omp.h>
#endif
#include "framework/tensor.h"
#include "operators/math/pool_3x3.h"
#if __ARM_NEON
#include <arm_neon.h>
#endif // __ARM_NEON
#include <climits>
#include <iostream>
namespace paddle_mobile {
namespace operators {
namespace math {
using framework::Tensor;
using std::max;
using std::min;
using std::vector;
template <typename T>
static void make_paddings(const Tensor *input, Tensor *padded_input,
int32_t top, int32_t bottom, int32_t left,
int32_t right, T value) {
const int32_t batch_size = input->dims()[0];
const int32_t c_in = input->dims()[1];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int32_t h_padded = h_in + top + bottom;
const int32_t w_padded = w_in + left + right;
padded_input->Resize({batch_size, c_in, h_padded, w_padded});
T *padded_input_data = padded_input->mutable_data<T>();
const T *input_data = input->data<T>();
const int32_t input_channel_stride = h_in * w_in;
const int32_t input_batch_stride = c_in * input_channel_stride;
const int32_t padded_channel_stride = h_padded * w_padded;
const int32_t padded_batch_stride = c_in * padded_channel_stride;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < c_in; ++j) {
const T *img_in = input_data + j * input_channel_stride;
T *img_padded = padded_input_data + j * padded_channel_stride;
int k = 0;
for (; k < top; ++k) {
for (int l = 0; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
for (; k < top + h_in; ++k) {
int l = 0;
for (; l < left; ++l) {
img_padded[l] = value;
}
memcpy(img_padded + left, img_in, w_in * sizeof(T));
l += w_in;
img_in += w_in;
for (; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
for (; k < h_padded; ++k) {
for (int l = 0; l < w_padded; ++l) {
img_padded[l] = value;
}
img_padded += w_padded;
}
}
input_data += input_batch_stride;
padded_input_data += padded_batch_stride;
}
// input_data = input->data<T>();
// std::cout << "+++++++++++++++++++Origin begin++++++++++++++++++++"
// << std::endl;
// for (int i = 0; i < 1; ++i) {
// for (int j = 0; j < 1; ++j) {
// const T *img_in = input_data + j * input_channel_stride;
// for (int k = 0; k < h_in; ++k) {
// for (int l = 0; l < w_in; ++l) {
// std::cout << (int32_t)*img_in << "\t";
// img_in++;
// }
// std::cout << std::endl;
// }
// }
// input_data += input_batch_stride;
// }
// std::cout << "+++++++++++++++++++Origin end++++++++++++++++++++" <<
// std::endl;
//
// padded_input_data = padded_input->data<T>();
// std::cout << "******************Padding begin**********************"
// << std::endl;
// for (int i = 0; i < 1; ++i) {
// for (int j = 0; j < 1; ++j) {
// T *img_padded = padded_input_data + j * padded_channel_stride;
// for (int k = 0; k < h_padded; ++k) {
// for (int l = 0; l < w_padded; ++l) {
// std::cout << (int32_t)*img_padded << "\t";
// img_padded++;
// }
// std::cout << std::endl;
// }
// }
// padded_input_data += padded_batch_stride;
// }
// std::cout << "******************Padding end**********************"
// << std::endl;
}
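// A minimal shape sketch for make_paddings above (illustrative, not in the
// commit): a 1x1x3x3 int8 input padded by one on every side with -SCHAR_MAX
// becomes 1x1x5x5 with the original block centered.
static void make_paddings_example() {
  Tensor in, padded;
  in.Resize({1, 1, 3, 3});
  int8_t *p = in.mutable_data<int8_t>();
  for (int i = 0; i < 9; ++i) p[i] = static_cast<int8_t>(i + 1);
  make_paddings<int8_t>(&in, &padded, 1, 1, 1, 1, int8_t(-SCHAR_MAX));
  // padded.dims() == {1, 1, 5, 5}; padded row 1, cols 1..3 hold {1, 2, 3}.
}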
void Pool3x3Maxs1_int8(const Tensor *input, Tensor *output, int32_t pad_h,
int32_t pad_w) {
Tensor padded_input;
if (pad_h != 0 && pad_w != 0) {
int8_t value = -SCHAR_MAX;
make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
input = &padded_input;
}
const int32_t batch_size = input->dims()[0];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int8_t *input_data = input->data<int8_t>();
const int32_t output_channels = output->dims()[1];
const int32_t h_out = output->dims()[2];
const int32_t w_out = output->dims()[3];
int8_t *output_data = output->mutable_data<int8_t>();
const int32_t outputdata_channel_stride = h_out * w_out;
const int32_t inputdata_channel_stride = h_in * w_in;
const int32_t input_batch_stride = output_channels * inputdata_channel_stride;
const int32_t output_batch_stride =
output_channels * outputdata_channel_stride;
// std::cout << "h_out = " << h_out << ", w_out=" << w_out << std::endl;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < output_channels; ++j) {
const int8_t *img_in = input_data + j * inputdata_channel_stride;
int8_t *img_out = output_data + j * outputdata_channel_stride;
for (int k = 0; k < h_out; ++k) {
const int8_t *row0 = img_in + k * w_in;
const int8_t *row1 = img_in + (k + 1) * w_in;
const int8_t *row2 = img_in + (k + 2) * w_in;
#if __ARM_NEON
int32_t nw = w_out >> 4;
int32_t left_w = w_out & 0xf;
int32_t nw1 = left_w >> 3;
int32_t left_w1 = left_w & 0x7;
#if __aarch64__
// TODO
#else
if (nw > 0) {
#define LOOP_LABEL "1"
// result: q15
asm volatile(
"vld1.8 {q0}, [%[row0]]! \n\t" // q0=0-15
"vld1.8 {q2}, [%[row1]]! \n\t"
"vld1.8 {q4}, [%[row2]]! \n\t"
LOOP_LABEL
": \n\t"
"vld1.8 {q1}, [%[row0]]! \n\t" // q1=16-31
"vext.8 q6, q0, q1, #1 \n\t"
"vext.8 q7, q0, q1, #2 \n\t"
"vld1.8 {q3}, [%[row1]]! \n\t"
"vmax.s8 q15, q0, q6 \n\t"
"vmax.s8 q15, q15, q7 \n\t"
"vext.8 q6, q2, q3, #1 \n\t"
"vext.8 q7, q2, q3, #2 \n\t"
"vld1.8 {q5}, [%[row2]]! \n\t"
"vmax.s8 q14, q2, q6 \n\t"
"vmax.s8 q14, q14, q7 \n\t"
"vext.8 q6, q4, q5, #1 \n\t"
"vext.8 q7, q4, q5, #2 \n\t"
"vmax.s8 q13, q4, q6 \n\t"
"vmax.s8 q13, q13, q7 \n\t"
"vmax.s8 q15, q15, q14 \n\t"
"vmax.s8 q15, q15, q13 \n\t"
"vmov.s8 q0, q1 \n\t"
"vmov.s8 q2, q3 \n\t"
"vmov.s8 q4, q5 \n\t"
"vst1.8 {q15}, [%[img_out]]! \n\t"
"subs %[nw], #1 \n\t"
"bne " LOOP_LABEL
"b \n\t"
"sub %[row0], #16 \n\t"
"sub %[row1], #16 \n\t"
"sub %[row2], #16 \n\t"
: [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
[row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q13", "q14", "q15");
#undef LOOP_LABEL
}
if (nw1 > 0 || left_w1 > 0) {
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm volatile(
"vld1.8 {d0}, [%[row0]]! \n\t" // d0=0-8
"vld1.8 {d2}, [%[row1]]! \n\t"
"vld1.8 {d4}, [%[row2]]! \n\t"
"mov r0, #1 \n\t"
"cmp %[nw1], #0 \n\t"
"beq " PADDLE_LABEL_LESS8
"f\n\t"
"vld1.8 {d1}, [%[row0]]! \n\t" // d1=9-15
"vext.8 d6, d0, d1, #1 \n\t"
"vext.8 d7, d0, d1, #2 \n\t"
"vld1.8 {d3}, [%[row1]]! \n\t"
"vmax.s8 d15, d0, d6 \n\t"
"vmax.s8 d15, d15, d7 \n\t"
"vext.8 d6, d2, d3, #1 \n\t"
"vext.8 d7, d2, d3, #2 \n\t"
"vld1.8 {d5}, [%[row2]]! \n\t"
"vmax.s8 d14, d2, d6 \n\t"
"vmax.s8 d14, d14, d7 \n\t"
"vext.8 d6, d4, d5, #1 \n\t"
"vext.8 d7, d4, d5, #2 \n\t"
"vmax.s8 d13, d4, d6 \n\t"
"vmax.s8 d13, d13, d7 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
"vmov.s8 d0, d1 \n\t"
"vmov.s8 d2, d3 \n\t"
"vmov.s8 d4, d5 \n\t"
"vst1.8 {d15}, [%[img_out]]! \n\t"
PADDLE_LABEL_LESS8
": \n\t"
"cmp %[left_w1], #0 \n\t"
"beq " PADDLE_LABEL_OVER
"f\n\t"
"vld1.8 {d1}, [%[row0]] \n\t" // d1=9-15
"vext.8 d6, d0, d1, #1 \n\t"
"vext.8 d7, d0, d1, #2 \n\t"
"vld1.8 {d3}, [%[row1]] \n\t"
"vmax.s8 d15, d0, d6 \n\t"
"vmax.s8 d15, d15, d7 \n\t"
"vext.8 d6, d2, d3, #1 \n\t"
"vext.8 d7, d2, d3, #2 \n\t"
"vld1.8 {d5}, [%[row2]] \n\t"
"vmax.s8 d14, d2, d6 \n\t"
"vmax.s8 d14, d14, d7 \n\t"
"vext.8 d6, d4, d5, #1 \n\t"
"vext.8 d7, d4, d5, #2 \n\t"
"vmax.s8 d13, d4, d6 \n\t"
"vmax.s8 d13, d13, d7 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
PADDLE_LABEL_LESS8_SAVE
": \n\t"
"vst1.8 {d15[0]}, [%[img_out]], r0\n\t"
"add %[row0], %[row0], #1 \n\t"
"add %[row1], %[row1], #1 \n\t"
"add %[row2], %[row2], #1 \n\t"
"vext.8 d15, d15, d15, #1 \n\t"
"subs %[left_w1], #1 \n\t"
"bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"
PADDLE_LABEL_OVER ": \n\t"
: [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
[row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7", "d13", "d14", "d15");
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t left = w_out;
while (left > 0) {
const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
*img_out = std::max(std::max(max0, max1), max2);
row0 += 1;
row1 += 1;
row2 += 1;
img_out++;
left--;
}
#endif // __ARM_NEON
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
void Pool3x3Maxs2_int8(const Tensor *input, Tensor *output, int32_t pad_h,
int32_t pad_w) {
Tensor padded_input;
if (pad_h != 0 && pad_w != 0) {
int8_t value = -SCHAR_MAX;
make_paddings(input, &padded_input, pad_h, pad_h, pad_w, pad_w, value);
input = &padded_input;
}
const int32_t batch_size = input->dims()[0];
const int32_t h_in = input->dims()[2];
const int32_t w_in = input->dims()[3];
const int32_t output_channels = output->dims()[1];
const int32_t h_out = output->dims()[2];
const int32_t w_out = output->dims()[3];
const int32_t outputdata_channel_stride = h_out * w_out;
const int32_t inputdata_channel_stride = h_in * w_in;
const int32_t output_batch_stride =
output_channels * outputdata_channel_stride;
const int32_t input_batch_stride = output_channels * inputdata_channel_stride;
const int8_t *input_data = input->data<int8_t>();
int8_t *output_data = output->mutable_data<int8_t>();
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int j = 0; j < output_channels; ++j) {
const int8_t *img_in = input_data + j * inputdata_channel_stride;
int8_t *img_out = output_data + j * outputdata_channel_stride;
for (int k = 0; k < h_out; ++k) {
const int8_t *row0 = img_in + 2 * k * w_in;
const int8_t *row1 = img_in + (2 * k + 1) * w_in;
const int8_t *row2 = img_in + (2 * k + 2) * w_in;
#if __ARM_NEON
int32_t nw = w_out >> 4;
int32_t left_w = w_out & 0xf;
int32_t nw1 = left_w >> 3;
int32_t left_w1 = left_w & 0x7;
#if __aarch64__
// TODO
#else
if (nw > 0) {
#define LOOP_LABEL "1"
// result: q15
asm volatile(
"vld2.8 {q0, q1}, [%[row0]]! \n\t" // q0=0-30, q1=1-31
"vld2.8 {q2, q3}, [%[row1]]! \n\t"
"vld2.8 {q4, q5}, [%[row2]]! \n\t"
LOOP_LABEL
": \n\t"
"vmax.s8 q15, q0, q1 \n\t"
"vld2.8 {q6, q7}, [%[row0]]! \n\t" // q0=32-62, q1=33-63
"vmax.s8 q14, q2, q3 \n\t"
"vmax.s8 q13, q4, q5 \n\t"
"vld2.8 {q8, q9}, [%[row1]]! \n\t"
"vext.8 q0, q0, q6, #1 \n\t"
"vmax.s8 q15, q15, q0 \n\t"
"vld2.8 {q10, q11}, [%[row2]]! \n\t"
"vext.8 q2, q2, q8, #1 \n\t"
"vmax.s8 q14, q14, q2 \n\t"
"vext.8 q4, q4, q10, #1 \n\t"
"vmax.s8 q13, q13, q4 \n\t"
"vmax.s8 q15, q15, q14 \n\t"
"vmax.s8 q15, q15, q13 \n\t"
"vmov.s8 q0, q6 \n\t"
"vmov.s8 q1, q7 \n\t"
"vmov.s8 q2, q8 \n\t"
"vmov.s8 q3, q9 \n\t"
"vmov.s8 q4, q10 \n\t"
"vmov.s8 q5, q11 \n\t"
"vst1.8 {q15}, [%[img_out]]! \n\t"
"subs %[nw], #1 \n\t"
"bne " LOOP_LABEL
"b \n\t"
"sub %[row0], #32 \n\t"
"sub %[row1], #32 \n\t"
"sub %[row2], #32 \n\t"
: [nw] "+r"(nw), [row0] "+r"(row0), [row1] "+r"(row1),
[row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7",
"q8", "q9", "q10", "q11", "q13", "q14", "q15");
#undef LOOP_LABEL
}
if (nw1 > 0 || left_w1 > 0) {
#define PADDLE_LABEL_LESS8 "1"
#define PADDLE_LABEL_LESS8_SAVE "2"
#define PADDLE_LABEL_OVER "3"
// result: d15
asm volatile(
"vld2.8 {d0, d1}, [%[row0]]! \n\t" // d0=0-14, d1=1-15
"vld2.8 {d2, d3}, [%[row1]]! \n\t"
"vld2.8 {d4, d5}, [%[row2]]! \n\t"
"mov r0, #1 \n\t"
"cmp %[nw1], #0 \n\t"
"beq " PADDLE_LABEL_LESS8
"f\n\t"
"vmax.s8 d15, d0, d1 \n\t"
"vld2.8 {d6, d7}, [%[row0]]! \n\t" // d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3 \n\t"
"vmax.s8 d13, d4, d5 \n\t"
"vld2.8 {d8, d9}, [%[row1]]! \n\t"
"vext.8 d0, d0, d6, #1 \n\t"
"vmax.s8 d15, d15, d0 \n\t"
"vld2.8 {d10, d11}, [%[row2]]! \n\t"
"vext.8 d2, d2, d8, #1 \n\t"
"vmax.s8 d14, d14, d2 \n\t"
"vext.8 d4, d4, d10, #1 \n\t"
"vmax.s8 d13, d13, d4 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
"vmov.s8 d0, d6 \n\t"
"vmov.s8 d1, d7 \n\t"
"vmov.s8 d2, d8 \n\t"
"vmov.s8 d3, d9 \n\t"
"vmov.s8 d4, d10 \n\t"
"vmov.s8 d5, d11 \n\t"
"vst1.8 {d15}, [%[img_out]]! \n\t"
PADDLE_LABEL_LESS8
": \n\t"
"cmp %[left_w1], #0 \n\t"
"beq " PADDLE_LABEL_OVER
"f\n\t"
"vmax.s8 d15, d0, d1 \n\t"
"vld2.8 {d6, d7}, [%[row0]] \n\t" // d0=32-62, d1=33-63
"vmax.s8 d14, d2, d3 \n\t"
"vmax.s8 d13, d4, d5 \n\t"
"vld2.8 {d8, d9}, [%[row1]] \n\t"
"vext.8 d0, d0, d6, #1 \n\t"
"vmax.s8 d15, d15, d0 \n\t"
"vld2.8 {d10, d11}, [%[row2]] \n\t"
"vext.8 d2, d2, d8, #1 \n\t"
"vmax.s8 d14, d14, d2 \n\t"
"vext.8 d4, d4, d10, #1 \n\t"
"vmax.s8 d13, d13, d4 \n\t"
"vmax.s8 d15, d15, d14 \n\t"
"vmax.s8 d15, d15, d13 \n\t"
PADDLE_LABEL_LESS8_SAVE
": \n\t"
"vst1.8 {d15[0]}, [%[img_out]], r0\n\t"
"add %[row0], %[row0], #2 \n\t"
"add %[row1], %[row1], #2 \n\t"
"add %[row2], %[row2], #2 \n\t"
"vext.8 d15, d15, d15, #1 \n\t"
"subs %[left_w1], #1 \n\t"
"bgt " PADDLE_LABEL_LESS8_SAVE "b \n\t"
PADDLE_LABEL_OVER ": \n\t"
: [nw1] "+r"(nw1), [left_w1] "+r"(left_w1), [row0] "+r"(row0),
[row1] "+r"(row1), [row2] "+r"(row2), [img_out] "+r"(img_out)
:
: "cc", "memory", "r0", "d0", "d1", "d2", "d3", "d4", "d5", "d6",
"d7", "d8", "d9", "d10", "d11", "d13", "d14", "d15");
#undef PADDLE_LABEL_OVER
#undef PADDLE_LABEL_LESS8_SAVE
#undef PADDLE_LABEL_LESS8
}
#endif // __aarch64__
#else
int32_t left = w_out;
while (left > 0) {
const int8_t max0 = std::max(std::max(row0[0], row0[1]), row0[2]);
const int8_t max1 = std::max(std::max(row1[0], row1[1]), row1[2]);
const int8_t max2 = std::max(std::max(row2[0], row2[1]), row2[2]);
*img_out = std::max(std::max(max0, max1), max2);
row0 += 2;
row1 += 2;
row2 += 2;
img_out++;
left--;
}
#endif // __ARM_NEON
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
void Pool3x3Max_int8(const vector<int> &strides, const vector<int> &paddings,
const Tensor *input, Tensor *output) {
const int batch_size = input->dims()[0];
const int input_height = input->dims()[2];
const int input_width = input->dims()[3];
const int output_channels = output->dims()[1];
const int output_height = output->dims()[2];
const int output_width = output->dims()[3];
// const int _kernel_size = 3;
const int stride = strides[0];
// const int stride_width = strides[1];
const int padding = paddings[0];
// const int padding_width = paddings[1];
const int8_t negative_max = -SCHAR_MAX;
const int input_channel_stride = input_height * input_width;
const int output_channel_stride = output_height * output_width;
const int8_t *input_data = input->data<int8_t>();
int8_t *output_data = output->mutable_data<int8_t>();
const int input_batch_stride = output_channels * input_channel_stride;
const int output_batch_stride = output_channels * output_channel_stride;
for (int i = 0; i < batch_size; ++i) {
#pragma omp parallel for
for (int c = 0; c < output_channels; ++c) {
const int8_t *input_seg = input_data + c * input_channel_stride;
int8_t *output_seg = output_data + c * output_channel_stride;
for (int ph = 0; ph < output_height; ph++) {
int hstart = ph * stride - padding;
int hend = min(hstart + 3, input_height);
hstart = max(hstart, 0);
for (int pw = 0; pw < output_width; pw++) {
int wstart = pw * stride - padding;
int wend = min(wstart + 3, input_width);
wstart = max(wstart, 0);
const int8_t *pos1 = input_seg + hstart * input_width + wstart;
const int8_t *pos2 = input_seg + (hstart + 1) * input_width + wstart;
const int8_t *pos3 = input_seg + (hstart + 2) * input_width + wstart;
int8_t *output_ptr = output_seg + ph * output_width + pw;
if (hend - hstart != 3 || wend - wstart != 3) {
int8_t max_value = -SCHAR_MAX;
for (int h = hstart; h < hend; h++) {
for (int w = wstart; w < wend; w++) {
int8_t value = input_seg[h * input_width + w];
if (value > max_value) {
max_value = value;
}
}
}
output_seg[ph * output_width + pw] = max_value;
} else {
#if __ARM_NEON
#if __aarch64__
// TODO
#else
asm volatile(
"vld1.8 {d0}, [%[pos1]] \n\t"
"vld1.8 {d1}, [%[pos2]] \n\t"
"vld1.8 {d2}, [%[pos3]] \n\t"
"vmax.s8 d3, d0, d1 \n\t"
"vmax.s8 d4, d2, d3 \n\t"
"vmov.s8 d4[3], %[negative_max] \n\t"
"vpmax.s8 d5, d4, d4 \n\t"
"vpmax.s8 d6, d5, d5 \n\t"
"vst1.8 {d6[0]},[%[output_ptr]] \n\t"
:
: [pos1] "r"(pos1), [pos2] "r"(pos2), [pos3] "r"(pos3),
[output_ptr] "r"(output_ptr), [negative_max] "r"(negative_max)
: "memory", "q0", "q1", "q2", "q3");
#endif
#else
const int8_t max0 = std::max(std::max(pos1[0], pos1[1]), pos1[2]);
const int8_t max1 = std::max(std::max(pos2[0], pos2[1]), pos2[2]);
const int8_t max2 = std::max(std::max(pos3[0], pos3[1]), pos3[2]);
*output_ptr = std::max(std::max(max0, max1), max2);
#endif // __ARM_NEON
}
}
}
}
input_data += input_batch_stride;
output_data += output_batch_stride;
}
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif
......@@ -70,15 +70,15 @@ class PoolFunctor<CPU, PoolProcess, T> {
int wend = std::min(wstart + ksize_width, input_width);
wstart = std::max(wstart, 0);
T ele = pool_process.initial();
auto ele = pool_process.initial();
for (int h = hstart; h < hend; ++h) {
for (int w = wstart; w < wend; ++w) {
pool_process.compute(input_data[h * input_width + w], &ele);
}
}
int pool_size = (hend - hstart) * (wend - wstart);
pool_process.finalize(static_cast<T>(pool_size), &ele);
output_data[ph * output_width + pw] = ele;
pool_process.finalize(static_cast<float>(pool_size), &ele);
output_data[ph * output_width + pw] = static_cast<T>(ele);
}
}
input_data += input_stride;
......@@ -88,8 +88,10 @@ class PoolFunctor<CPU, PoolProcess, T> {
}
};
template class PoolFunctor<CPU, math::AvgPool<float>, float>;
template class PoolFunctor<CPU, math::AvgPool<float, float>, float>;
template class PoolFunctor<CPU, math::MaxPool<float>, float>;
template class PoolFunctor<CPU, math::AvgPool<int8_t, int32_t>, int8_t>;
template class PoolFunctor<CPU, math::MaxPool<int8_t>, int8_t>;
} // namespace math
} // namespace operators
} // namespace paddle_mobile
......
......@@ -16,6 +16,8 @@ limitations under the License. */
#pragma once
#include <climits>
#include <cmath>
#include "common/log.h"
#include "framework/tensor.h"
#include "pool_2x2.h"
......@@ -37,24 +39,42 @@ namespace math {
* in the pooling window, and finally takes the average.
* MaxPoolGrad and AvgPoolGrad are gradient operations respectively.
*/
template <class T>
template <typename T>
class MaxPool {
public:
inline T initial() { return static_cast<T>(-FLT_MAX); }
inline T initial() {
if (typeid(T) == typeid(int8_t)) {
return static_cast<T>(-SCHAR_MAX);
}
return static_cast<T>(-FLT_MAX);
}
inline void compute(const T &x, T *y) { *y = *y > x ? *y : x; }
inline void finalize(const T &pool_field, T *y) {}
};
template <class T>
template <typename Itype, typename Otype>
class AvgPool {
public:
inline T initial() { return static_cast<T>(0); }
inline void compute(const T &x, T *y) { *y += x; }
inline void finalize(const T &pool_field, T *y) { *y /= pool_field; }
inline Otype initial() { return static_cast<Otype>(0); }
inline void compute(const Itype &x, Otype *y) { *y += x; }
inline void finalize(const float &pool_field, Otype *y) {
if (typeid(Itype) == typeid(int8_t)) {
float tmp = *y / pool_field;
if (tmp > SCHAR_MAX) {
*y = SCHAR_MAX;
} else if (tmp < -SCHAR_MAX) {
*y = -SCHAR_MAX;
} else {
*y = static_cast<Otype>(std::round(tmp));
}
} else {
*y /= pool_field;
}
}
};
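// A small worked example of the int8 average path above (values are
// illustrative): the accumulator is int32, the mean is rounded to nearest,
// and the result is clamped to [-SCHAR_MAX, SCHAR_MAX].
inline int8_t avg_pool_int8_example() {
  AvgPool<int8_t, int32_t> pool;
  int32_t acc = pool.initial();   // 0
  pool.compute(int8_t(3), &acc);  // acc = 3
  pool.compute(int8_t(4), &acc);  // acc = 7
  pool.finalize(2.0f, &acc);      // acc = round(7 / 2.0f) = 4
  return static_cast<int8_t>(acc);
}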
template <typename DeviceType, typename PoolProcess, typename T>
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
// Inspired by https://arxiv.org/abs/1509.09308 and adapted from the nnpack
// and ncnn projects.
#ifdef CONV_OP
#ifdef __aarch64__
#include "operators/math/pad.h"
#include "operators/math/winograd/winograd_transform.h"
namespace paddle_mobile {
namespace operators {
namespace math {
template <>
void winograd_transform_weight<8, 3>(const framework::Tensor &weight,
framework::Tensor *output) {
/*
* w0 = g0
* w1 = ((g0 + g2) + g1) * (-2.0 / 9)
* w2 = ((g0 + g2) - g1) * (-2.0 / 9)
* w3 = ((g0 + 4 * g2) + 2 * g1) * (1.0 / 90)
* w4 = ((g0 + 4 * g2) - 2 * g1) * (1.0 / 90)
* w5 = ((g2 + 4 * g0) + 2 * g1) * (1.0 / 180)
* w6 = ((g2 + 4 * g0) - 2 * g1) * (1.0 / 180)
* w7 = g2
*/
// TODO(hjchen2)
PADDLE_MOBILE_THROW_EXCEPTION(
"Winograd for arm v8 has not been implemented.");
}
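// A scalar transcription of the 1-D weight transform in the comment above
// (the 2-D transform applies it along rows, then columns; illustrative only):
static inline void winograd_f6k3_weight_transform_1d(const float g[3],
                                                     float w[8]) {
  w[0] = g[0];
  w[1] = ((g[0] + g[2]) + g[1]) * (-2.f / 9);
  w[2] = ((g[0] + g[2]) - g[1]) * (-2.f / 9);
  w[3] = ((g[0] + 4 * g[2]) + 2 * g[1]) * (1.f / 90);
  w[4] = ((g[0] + 4 * g[2]) - 2 * g[1]) * (1.f / 90);
  w[5] = ((g[2] + 4 * g[0]) + 2 * g[1]) * (1.f / 180);
  w[6] = ((g[2] + 4 * g[0]) - 2 * g[1]) * (1.f / 180);
  w[7] = g[2];
}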
template <>
void winograd_transform_input<8, 3>(const framework::Tensor &input,
framework::Tensor *output) {
/*
* x0 = (d0 - d6) + (d4 - d2) * 5.25
* x1 = (d2 + d6) - 4.25 * (d4 + d3) + (d1 + d5)
* x2 = (d2 + d6) - 4.25 * (d4 - d3) - (d1 + d5)
* x3 = (0.25 * d2 - 1.25 * d4 + d6) + (0.5 * d1 - 2.5 * d3 + 2 * d5)
* x4 = (0.25 * d2 - 1.25 * d4 + d6) - (0.5 * d1 - 2.5 * d3 + 2 * d5)
* x5 = (4 * d2 - 5 * d4 + d6) + (2 * d1 - 2.5 * d3 + 0.5 * d5)
* x6 = (4 * d2 - 5 * d4 + d6) - (2 * d1 - 2.5 * d3 + 0.5 * d5)
* x7 = (d7 - d1) + (d3 - d5) * 5.25
*/
// TODO(hjchen2)
PADDLE_MOBILE_THROW_EXCEPTION(
"Winograd for arm v8 has not been implemented.");
}
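// Likewise, a scalar transcription of the 1-D input transform from the
// comment above (illustrative only):
static inline void winograd_f6k3_input_transform_1d(const float d[8],
                                                    float x[8]) {
  x[0] = (d[0] - d[6]) + (d[4] - d[2]) * 5.25f;
  x[1] = (d[2] + d[6]) - 4.25f * (d[4] + d[3]) + (d[1] + d[5]);
  x[2] = (d[2] + d[6]) - 4.25f * (d[4] - d[3]) - (d[1] + d[5]);
  x[3] = (0.25f * d[2] - 1.25f * d[4] + d[6]) +
         (0.5f * d[1] - 2.5f * d[3] + 2 * d[5]);
  x[4] = (0.25f * d[2] - 1.25f * d[4] + d[6]) -
         (0.5f * d[1] - 2.5f * d[3] + 2 * d[5]);
  x[5] = (4 * d[2] - 5 * d[4] + d[6]) + (2 * d[1] - 2.5f * d[3] + 0.5f * d[5]);
  x[6] = (4 * d[2] - 5 * d[4] + d[6]) - (2 * d[1] - 2.5f * d[3] + 0.5f * d[5]);
  x[7] = (d[7] - d[1]) + (d[3] - d[5]) * 5.25f;
}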
template <>
void winograd_transform_output<8, 3>(const framework::Tensor &input,
const framework::Tensor &weight,
framework::Tensor *output) {
// TODO(hjchen2)
PADDLE_MOBILE_THROW_EXCEPTION(
"Winograd for arm v8 has not been implemented.");
}
} // namespace math
} // namespace operators
} // namespace paddle_mobile
#endif // __aarch64__
#endif // CONV_OP
......@@ -58,6 +58,9 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(mul, ops::MulOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp);
#endif
......
......@@ -439,7 +439,7 @@ class ConvParam : public OpParam {
#endif
private:
protected:
RType *input_;
RType *output_;
RType *filter_;
......@@ -1632,6 +1632,10 @@ class FusionFcParam : public OpParam {
x_num_col_dims_ = GetAttr<int>("x_num_col_dims", attrs);
y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
axis_ = GetAttr<int>("axis", attrs);
#ifdef FUSION_FC_INT8_OP
scale_ = InputScaleFrom<GType>(inputs, scope);
#endif
}
GType *InputX() const { return input_x_; }
......@@ -1655,8 +1659,16 @@ class FusionFcParam : public OpParam {
int x_num_col_dims_;
int y_num_col_dims_;
int axis_;
#ifdef PADDLE_MOBILE_FPGA
#ifdef FUSION_FC_INT8_OP
public:
const RType *InputScale() const { return scale_; }
private:
RType *scale_;
#endif
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitConvArgs fpga_conv_args;
......@@ -1707,7 +1719,19 @@ class FusionConvAddReluParam : public FusionConvAddParam<DeviceType> {
FusionConvAddReluParam(const VariableNameMap &inputs,
const VariableNameMap &outputs,
const AttributeMap &attrs, const Scope &scope)
: FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {}
: FusionConvAddParam<DeviceType>(inputs, outputs, attrs, scope) {
#ifdef FUSION_CONVADDRELU_INT8_OP
scale_ = OpParam::InputScaleFrom<GType>(inputs, scope);
#endif
}
#ifdef FUSION_CONVADDRELU_INT8_OP
typedef typename DtypeTensorTrait<DeviceType>::gtype GType;
typedef typename DtypeTensorTrait<DeviceType>::rtype RType;
const RType *InputScale() const { return scale_; }
private:
RType *scale_;
#endif
};
#endif
......
......@@ -269,8 +269,8 @@ if (NOT FOUND_MATCH)
#gen test
ADD_EXECUTABLE(test-pool operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-pool paddle-mobile)
ADD_EXECUTABLE(test-pool-op operators/test_pool_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-pool-op paddle-mobile)
#gen test
ADD_EXECUTABLE(test-softmax operators/test_softmax_op.cpp test_helper.h test_include.h executor_for_test.h)
......@@ -324,6 +324,10 @@ if (NOT FOUND_MATCH)
ADD_EXECUTABLE(test-conv-add-relu-op operators/test_conv_add_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-relu-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-relu-int8-op operators/test_fusion_conv_add_relu_int8_op.cpp test_helper.h test_include.h)
target_link_libraries(test-conv-add-relu-int8-op paddle-mobile)
# gen test
ADD_EXECUTABLE(test-conv-add-bn-relu-op operators/test_fusion_conv_add_bn_relu_op.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-conv-add-bn-relu-op paddle-mobile)
......
......@@ -25,7 +25,7 @@ limitations under the License. */
#define c(i, j) c[(i)*ldc + (j)]
#define c1(i, j) c1[(i)*ldc + (j)]
void print_matirx(int m, int n, int ldc, float *c) {
void print_matrix(int m, int n, int ldc, float *c) {
for (int i = 0; i < m; ++i) {
std::cout << c(i, 0);
for (int j = 1; j < n; ++j) {
......@@ -98,18 +98,20 @@ int do_sgemm(int m, int n, int k, bool relu, int t1, int t2, int pr) {
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
print_matrix(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
print_matrix(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
print_matrix(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
print_matrix(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm is failed!");
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
......
......@@ -15,7 +15,9 @@ limitations under the License. */
#include <cstdlib>
#include <ctime>
#include <iostream>
#include <limits>
#include <random>
#include <type_traits>
#include "../test_helper.h"
#include "common/log.h"
#include "memory/t_malloc.h"
......@@ -32,26 +34,65 @@ limitations under the License. */
using std::default_random_engine;
using std::uniform_int_distribution;
void print_matirx(int m, int n, int ldc, int32_t *c) {
template <typename T>
void print_matrix(int m, int n, int ldc, T *c) {
for (int i = 0; i < m; ++i) {
std::cout << c(i, 0);
if (std::is_same<T, int8_t>::value) {
std::cout.setf(std::ios::left);
std::cout.width(4);
std::cout << static_cast<int32_t>(c(i, 0));
} else {
std::cout.setf(std::ios::left);
std::cout.width(6);
std::cout << c(i, 0);
}
for (int j = 1; j < n; ++j) {
std::cout << " | " << c(i, j);
if (std::is_same<T, int8_t>::value) {
std::cout << " | ";
std::cout.setf(std::ios::left);
std::cout.width(4);
std::cout << static_cast<int32_t>(c(i, j));
} else {
std::cout << " | ";
std::cout.setf(std::ios::left);
std::cout.width(6);
std::cout << c(i, j);
}
}
std::cout << std::endl;
std::cout << "\n";
}
std::cout << std::endl;
}
void print_matirx(int m, int n, int ldc, int8_t *c) {
for (int i = 0; i < m; ++i) {
std::cout << static_cast<int32_t>(c(i, 0));
for (int j = 1; j < n; ++j) {
std::cout << " | " << static_cast<int32_t>(c(i, j));
}
std::cout << std::endl;
}
std::cout << std::endl;
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
// round to zero
float round2zero(float v) {
float res = 0.f;  // v == 0 takes neither branch below
if (v > 0)
res = std::floor(v);
else if (v < 0)
res = std::ceil(v);
return res;
}
int8_t qscale_int32(int32_t v, float scale) {
float res = static_cast<float>(v) * scale;
res = round2zero(res);
if (res > 127)
return static_cast<int8_t>(127);
else if (res < -127)
return static_cast<int8_t>(-127);
else
return static_cast<int8_t>(res);
}
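// Worked examples for the helpers above (illustrative values, not part of the
// original test):
//   qadd_int32(INT32_MAX, 1)     -> INT32_MAX (saturates instead of wrapping)
//   round2zero(2.7f) -> 2.0f, round2zero(-2.7f) -> -2.0f (truncation toward 0)
//   qscale_int32(1000, 0.0628f)  ->  62  (62.8 rounded toward zero)
//   qscale_int32(30000, 0.0628f) -> 127  (1884 clamped to the int8 range)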
int do_sgemm(int m, int n, int k, bool relu, int pr) {
......@@ -106,30 +147,152 @@ int do_sgemm(int m, int n, int k, bool relu, int pr) {
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matirx(m, k, lda, a);
print_matrix(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matrix(k, n, ldb, b);
std::cout << "C:" << std::endl;
print_matrix(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matrix(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
PADDLE_MOBILE_ENFORCE(neq == 0, "The execution of do_sgemm failed!");
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
paddle_mobile::memory::Free(c1);
return 0;
}
int do_sgemm_with_bias(int m, int n, int k, bool relu, int pr,
bool addOnRow = false) {
int lda = k;
int ldb = n;
int ldc = n;
float scale = 0.00628f;
default_random_engine e;
uniform_int_distribution<int> pixel(-127, 127);  // int8_t is not a valid IntType for uniform_int_distribution
int8_t *a = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * k));
int8_t *b = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * k * n));
int8_t *c = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n));
int8_t *c1 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * m * n));
int32_t *bias = nullptr;
if (addOnRow) {
bias = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * n));
} else {
bias = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * m));
}
for (int i = 0; i < m * k; ++i) {
a[i] = pixel(e);
}
for (int i = 0; i < k * n; ++i) {
b[i] = pixel(e);
}
if (addOnRow) {
for (int i = 0; i < n; ++i) {
bias[i] = static_cast<int32_t>(pixel(e));
}
for (int i = 0; i < m; ++i) {
for (int j = 0; j < n; ++j) {
int32_t bias_v = bias[j];
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
r = qadd_int32(r, bias_v);
if (relu) r = std::max(0, r);
c1(i, j) = qscale_int32(r, scale);
}
}
} else {
for (int i = 0; i < m; ++i) {
bias[i] = static_cast<int32_t>(pixel(e));
}
for (int i = 0; i < m; ++i) {
int32_t bias_v = bias[i];
for (int j = 0; j < n; ++j) {
int32_t r = 0;
for (int p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
r = qadd_int32(r, bias_v);
if (relu) r = std::max(0, r);
c1(i, j) = qscale_int32(r, scale);
}
}
}
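// Reference result per element: r = sum_p a(i, p) * b(p, j), then a saturating
// bias add (qadd_int32), an optional relu, and a round-to-zero rescale back to
// int8 (qscale_int32). addOnRow selects whether bias is indexed by the output
// column j (size n) or by the output row i (size m).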
paddle_mobile::operators::math::Gemm gemm;
#ifdef _OPENMP
gemm.Sgemm_omp(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias, addOnRow);
#else
gemm.Sgemm(m, n, k, scale, a, lda, b, ldb, static_cast<float>(0), c, ldc,
relu, bias, addOnRow);
#endif
int eq = 0;
int neq = 0;
for (int i = 0; i < m * n; ++i) {
if (c[i] == c1[i]) {
++eq;
} else {
++neq;
}
}
if (pr > 0) {
std::cout << "A:" << std::endl;
print_matrix(m, k, lda, a);
std::cout << "B:" << std::endl;
print_matirx(k, n, ldb, b);
print_matrix(k, n, ldb, b);
std::cout << "Bias:" << std::endl;
if (addOnRow) {
print_matrix(1, n, n, bias);
} else {
print_matrix(m, 1, 1, bias);
}
std::cout << "C:" << std::endl;
print_matirx(m, n, ldc, c);
print_matrix(m, n, ldc, c);
std::cout << "C1:" << std::endl;
print_matirx(m, n, ldc, c1);
print_matrix(m, n, ldc, c1);
}
std::cout << "mnk=" << m << " " << n << " " << k << " relu=" << relu
<< " eq=" << eq << " neq=" << neq << std::endl;
PADDLE_MOBILE_ENFORCE(neq == 0,
"The execution of do_sgemm_with_bias is failed!");
paddle_mobile::memory::Free(a);
paddle_mobile::memory::Free(b);
paddle_mobile::memory::Free(c);
paddle_mobile::memory::Free(c1);
paddle_mobile::memory::Free(bias);
return 0;
}
int main() {
#ifdef _OPENMP
omp_set_num_threads(8);
omp_set_num_threads(4);
#endif
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm without bias:" << std::endl;
do_sgemm(9, 9, 9, false, 1);
do_sgemm(10, 6, 12, false, 0);
do_sgemm(512, 256, 384, false, 0);
......@@ -140,5 +303,44 @@ int main() {
do_sgemm(333, 797, 939, false, 0);
do_sgemm(1024, 1024, 1024, false, 0);
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with bias(bias is added on column):" << std::endl;
do_sgemm_with_bias(9, 9, 9, false, 1);
do_sgemm_with_bias(10, 6, 12, false, 0);
do_sgemm_with_bias(512, 256, 384, false, 0);
do_sgemm_with_bias(1366, 768, 256, false, 0);
do_sgemm_with_bias(1255, 755, 333, false, 0);
do_sgemm_with_bias(599, 1133, 393, false, 0);
do_sgemm_with_bias(777, 555, 999, false, 0);
do_sgemm_with_bias(333, 797, 939, false, 0);
do_sgemm_with_bias(1024, 1024, 1024, false, 0);
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with bias(bias is added on row):" << std::endl;
do_sgemm_with_bias(9, 9, 9, false, 1, true);
do_sgemm_with_bias(10, 6, 12, false, 0, true);
do_sgemm_with_bias(512, 256, 384, false, 0, true);
do_sgemm_with_bias(1366, 768, 256, false, 0, true);
do_sgemm_with_bias(1255, 755, 333, false, 0, true);
do_sgemm_with_bias(599, 1133, 393, false, 0, true);
do_sgemm_with_bias(777, 555, 999, false, 0, true);
do_sgemm_with_bias(333, 797, 939, false, 0, true);
do_sgemm_with_bias(1024, 1024, 1024, false, 0, true);
std::cout << "\n\n******************************************************\n\n"
<< std::endl;
std::cout << "Test gemm with relu and bias:" << std::endl;
do_sgemm_with_bias(9, 9, 9, true, 1);
do_sgemm_with_bias(10, 6, 12, true, 0);
do_sgemm_with_bias(512, 256, 384, true, 0);
do_sgemm_with_bias(1366, 768, 256, true, 0);
do_sgemm_with_bias(1255, 755, 333, true, 0);
do_sgemm_with_bias(599, 1133, 393, true, 0);
do_sgemm_with_bias(777, 555, 999, true, 0);
do_sgemm_with_bias(333, 797, 939, true, 0);
do_sgemm_with_bias(1024, 1024, 1024, true, 0);
return 0;
}
......@@ -28,7 +28,7 @@ limitations under the License. */
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(8);
paddle_mobile.SetThreadNum(4);
Tensor aa, bb, cc;
auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n});
......@@ -44,10 +44,13 @@ int main() {
ccptr[i] = 2;
}
Tensor aa_int8, bb_int8, cc_int8;
Tensor aa_int8, bb_int8, cc_int32, cc_int8;
auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
auto ccptr_int8 = cc_int8.mutable_data<int32_t>({m, n});
auto ccptr_int32 = cc_int32.mutable_data<int32_t>({m, n});
auto ccptr_int8 = cc_int8.mutable_data<int8_t>({m, n});
int32_t* bias_data_col = new int32_t[m];
int32_t* bias_data_row = new int32_t[n];
for (int i = 0; i < m * k; ++i) {
aaptr_int8[i] = static_cast<int8_t>(2);
......@@ -56,7 +59,15 @@ int main() {
bbptr_int8[i] = static_cast<int8_t>(2);
}
for (int i = 0; i < m * n; ++i) {
ccptr_int8[i] = static_cast<int32_t>(2);
ccptr_int32[i] = static_cast<int32_t>(2);
}
for (int i = 0; i < m; ++i) {
bias_data_col[i] = 2;
}
for (int i = 0; i < n; ++i) {
bias_data_row[i] = 2;
}
// float
......@@ -67,31 +78,87 @@ int main() {
false, nullptr);
}
auto time1 = time();
auto time_start0 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float>(
aa, false, bb, false, static_cast<float>(1), &cc, static_cast<float>(0),
false, nullptr);
}
auto time2 = time();
std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
auto time_end0 = time();
std::cout << "float gemm cost :" << time_diff(time_start0, time_end0) / 10
<< "ms\n";
// int8_t
// int8_t without bias
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
paddle_mobile::operators::math::matmul<float, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
auto time_start1 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<float, int32_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
static_cast<float>(0));
}
auto time_end1 = time();
std::cout << "int8_t gemm cost :" << time_diff(time_start1, time_end1) / 10
<< "ms\n";
auto time3 = time();
// int8_t with bias, column element wise add
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false);
}
auto time_start2 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<int8_t>(0), false, nullptr);
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_col, false);
}
auto time4 = time();
std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
auto time_end2 = time();
std::cout << "int8_t gemm_with_bias(column add) cost :"
<< time_diff(time_start2, time_end2) / 10 << "ms\n";
// int8_t with bias, row element wise add
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true);
}
auto time_start3 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), false, bias_data_row, true);
}
auto time_end3 = time();
std::cout << "int8_t gemm_with_bias(row add) cost :"
<< time_diff(time_start3, time_end3) / 10 << "ms\n";
// int8_t with bias&relu
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data_col, false);
}
auto time_start4 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul(
aa_int8, false, bb_int8, false, static_cast<float>(0.618), &cc_int8,
static_cast<float>(0), true, bias_data_col, false);
}
auto time_end4 = time();
std::cout << "int8_t gemm_with_bias_relu cost :"
<< time_diff(time_start4, time_end4) / 10 << "ms\n";
delete[] bias_data_row;
delete[] bias_data_col;
return 0;
}
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#ifdef FUSION_CONVADDRELU_INT8_OP
#include <limits>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/fusion_conv_add_relu_int8_op.h"
namespace paddle_mobile {
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
// round to zero
float round2zero(float v) {
float res = 0;  // v == 0 takes neither branch; avoid returning an uninitialized value
if (v > 0)
res = std::floor(v);
else if (v < 0)
res = std::ceil(v);
return res;
}
int8_t qscale_int32(int32_t v, float scale) {
float res = static_cast<float>(v) * scale;
res = round2zero(res);
if (res > 127)
return static_cast<int8_t>(127);
else if (res < -127)
return static_cast<int8_t>(-127);
else
return static_cast<int8_t>(res);
}
// Reference convolution from Caffe for checking results.
// Accumulates through explicit loops over input, output, and filters.
template <typename T>
void conv2d(const framework::Tensor *input, const framework::Tensor *filter,
const framework::Tensor *bias, const framework::AttributeMap &attrs,
framework::Tensor *output, float scale) {
framework::AttrReader attr_reader(attrs);
std::vector<int> paddings = attr_reader.Get<std::vector<int>>("paddings");
std::vector<int> strides = attr_reader.Get<std::vector<int>>("strides");
std::vector<int> dilations = attr_reader.Get<std::vector<int>>("dilations");
int groups = attr_reader.Get<int>("groups");
int kernel_h = filter->dims()[2];
int kernel_w = filter->dims()[3];
int pad_h = paddings[0];
int pad_w = paddings[1];
int stride_h = strides[0];
int stride_w = strides[1];
int dilation_h = dilations[0];
int dilation_w = dilations[1];
auto in_shape = input->dims();
auto out_shape = output->dims();
const bool has_depth = false;
int kernel_d, pad_d, stride_d, dilation_d;
if (has_depth) {
kernel_d = kernel_h;
stride_d = stride_h;
pad_d = pad_h;
dilation_d = dilation_h;
} else {
kernel_d = stride_d = dilation_d = 1;
pad_d = 0;
}
// Groups
int o_g = out_shape[1] / groups;
int k_g = in_shape[1] / groups;
int o_head, k_head;
// Convolution
vector<int> weight_offset(4 + has_depth);
vector<int> in_offset(4 + has_depth);
vector<int> out_offset(4 + has_depth);
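// offset() linearizes a (possibly 5-D) NCHW(D) index in row-major order:
// for shape (d0, d1, d2, ...) and indices (i0, i1, i2, ...) it computes
// ((i0 * d1 + i1) * d2 + i2) * d3 + ...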
auto offset = [](const framework::Tensor *input, const vector<int> &indices) {
framework::DDim shape = input->dims();
size_t count = 0;
for (size_t i = 0; i < indices.size(); ++i) {
count *= shape[i];
count += indices[i];
}
return count;
};
const T *in_data = input->data<T>();
const T *w_data = filter->data<T>();
framework::Tensor output_32;
int32_t *out_data_32 = output_32.mutable_data<int32_t>(out_shape);
memset(out_data_32, 0, output_32.numel() * sizeof(int32_t));
for (int n = 0; n < out_shape[0]; n++) {
for (int g = 0; g < groups; g++) {
o_head = o_g * g;
k_head = k_g * g;
for (int o = 0; o < o_g; o++) {
for (int k = 0; k < k_g; k++) {
for (int z = 0; z < (has_depth ? out_shape[2] : 1); z++) {
for (int y = 0; y < out_shape[2 + has_depth]; y++) {
for (int x = 0; x < out_shape[3 + has_depth]; x++) {
for (int r = 0; r < kernel_d; r++) {
for (int p = 0; p < kernel_h; p++) {
for (int q = 0; q < kernel_w; q++) {
int in_z = z * stride_d - pad_d + r * dilation_d;
int in_y = y * stride_h - pad_h + p * dilation_h;
int in_x = x * stride_w - pad_w + q * dilation_w;
if (in_z >= 0 && in_z < (has_depth ? in_shape[2] : 1) &&
in_y >= 0 && in_y < in_shape[2 + has_depth] &&
in_x >= 0 && in_x < in_shape[3 + has_depth]) {
weight_offset[0] = o + o_head;
weight_offset[1] = k;
if (has_depth) {
weight_offset[2] = r;
}
weight_offset[2 + has_depth] = p;
weight_offset[3 + has_depth] = q;
in_offset[0] = n;
in_offset[1] = k + k_head;
if (has_depth) {
in_offset[2] = in_z;
}
in_offset[2 + has_depth] = in_y;
in_offset[3 + has_depth] = in_x;
out_offset[0] = n;
out_offset[1] = o + o_head;
if (has_depth) {
out_offset[2] = z;
}
out_offset[2 + has_depth] = y;
out_offset[3 + has_depth] = x;
out_data_32[offset(output, out_offset)] +=
in_data[offset(input, in_offset)] *
w_data[offset(filter, weight_offset)];
}
}
}
}
}
}
}
}
}
}
}
T *out_data = output->mutable_data<T>();
int32_t n = out_shape[0];
int32_t c = out_shape[1];
int32_t h = out_shape[2];
int32_t w = out_shape[3];
const int32_t *bias_data = bias->data<int32_t>();
for (int i = 0; i < n; ++i) {
for (int j = 0; j < c; ++j) {
int32_t bias_v = bias_data[j];
for (int k = 0; k < h; ++k) {
for (int l = 0; l < w; ++l) {
int32_t tmp = out_data_32[i * c * h * w + j * h * w + k * w + l];
tmp = qadd_int32(tmp, bias_v);
tmp = std::max(0, tmp);
out_data[i * c * h * w + j * h * w + k * w + l] =
qscale_int32(tmp, scale);
}
}
}
}
}
template <typename T, int Kernel, int Pad, int Stride>
int TestConvOp(int in_channels, int in_height, int in_width, int out_channels) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
int pad_w = Pad;
int stride_h = Stride;
int stride_w = Stride;
int dilation_h = 1;
int dilation_w = 1;
int batch_size = 1;
int input_c = in_channels;
int input_h = in_height;
int input_w = in_width;
int output_c = out_channels;
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
framework::DDim filter_shape =
framework::make_ddim({output_c, input_c, kernel_h, kernel_w});
int kernel_extent_h = dilation_h * (kernel_h - 1) + 1;
int kernel_extent_w = dilation_w * (kernel_w - 1) + 1;
int output_h = (input_h + 2 * pad_h - kernel_extent_h) / stride_h + 1;
int output_w = (input_w + 2 * pad_w - kernel_extent_w) / stride_w + 1;
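// e.g. input_h = 14, kernel = 7, pad = 3, stride = 2, dilation = 1:
// kernel_extent_h = 7 and output_h = (14 + 6 - 7) / 2 + 1 = 7.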
framework::DDim output_shape = framework::make_ddim(
std::vector<int>({batch_size, output_c, output_h, output_w}));
framework::DDim bias_shape = framework::make_ddim({output_c});
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["Input"] = std::vector<std::string>({"input"});
inputs["Filter"] = std::vector<std::string>({"filter"});
inputs["Scale"] = std::vector<std::string>({"scale"});
inputs["Y"] = std::vector<std::string>({"bias"});
outputs["Out"] = std::vector<std::string>({"output"});
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(input, input_shape, -127, 127);
auto filter_var = scope.get()->Var("filter");
auto filter = filter_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(filter, filter_shape, -127, 127);
auto scale_var = scope.get()->Var("scale");
auto scale = scale_var->template GetMutable<framework::LoDTensor>();
scale->Resize(framework::make_ddim({1}));
float scale_v = 0.000828f;
scale->mutable_data<float>()[0] = scale_v;
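// "Scale" is a single-element tensor carrying the requantization factor that
// maps the int32 accumulator back to the int8 output range (see conv2d above).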
auto bias_var = scope.get()->Var("bias");
auto bias = bias_var->template GetMutable<framework::LoDTensor>();
SetupTensor<int32_t>(bias, bias_shape, -127, 127);
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["dilations"].Set<vector<int>>(
std::vector<int>({dilation_h, dilation_w}));
attrs["groups"].Set<int>(1);
attrs["axis"].Set<int>(0);
auto *op = new operators::FusionConvAddReluInt8Op<CPU, T>(
"fusion_conv_add_relu_int8", inputs, outputs, attrs, scope);
op->InferShape();
op->Init();
op->Run();
framework::Tensor output_cmp;
output_cmp.mutable_data<T>(output_shape);
conv2d<T>(input, filter, bias, attrs, &output_cmp, scale_v);
// compare results
int eq = 0;
int neq = 0;
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
T *output_cmp_data = output_cmp.data<T>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(
output_data[i] == output_cmp_data[i],
"The execution of test_fusion_conv_add_relu_int8_op is failed!");
if (output_data[i] == output_cmp_data[i]) {
++eq;
} else {
++neq;
}
}
std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main(int argc, char *argv[]) {
if (argc < 5) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-conv-add-relu-int8-op in_channels in_height in_width "
"out_channels\n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
<< " -in_width: int, input image's width\n"
<< " -out_channels: int, conv output channels\n";
return 1;
}
int in_channels = atoi(argv[1]);
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
int out_channels = atoi(argv[4]);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8_t, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 0, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 1, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 2
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=2";
paddle_mobile::TestConvOp<int8_t, 7, 3, 2>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=1";
paddle_mobile::TestConvOp<int8_t, 7, 3, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 5, stride = 3
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=5, stride=3";
paddle_mobile::TestConvOp<int8_t, 7, 5, 3>(in_channels, in_height, in_width,
out_channels);
// kernel = 7, pad = 3, stride = 4
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=7, pad=3, stride=4";
paddle_mobile::TestConvOp<int8_t, 7, 3, 4>(in_channels, in_height, in_width,
out_channels);
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=3, pad=1, stride=1";
paddle_mobile::TestConvOp<int8_t, 3, 1, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=0, stride=1";
paddle_mobile::TestConvOp<int8_t, 5, 0, 1>(in_channels, in_height, in_width,
out_channels);
// kernel = 5, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO) << "int8, kernel=5, pad=2, stride=1";
paddle_mobile::TestConvOp<int8_t, 5, 2, 1>(in_channels, in_height, in_width,
out_channels);
}
#else
int main() {
std::cout << "FUSION_CONVADDRELU_INT8_OP is not defined!" << std::endl;
return 0;
}
#endif
......@@ -12,147 +12,163 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <framework/program/program-optimize/program_optimize.h>
#include <iostream>
#include <type_traits>
#include "../test_helper.h"
#include "../test_include.h"
#include "framework/operator.h"
#include "operators/fusion_fc_int8_op.h"
#include "operators/fusion_fc_op.h"
#define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)]
#define c(i, j) c[(i)*ldc + (j)]
namespace paddle_mobile {
namespace framework {
using framework::AttributeMap;
using framework::DDim;
using framework::Scope;
using framework::make_ddim;
int32_t qadd_int32(int32_t l, int32_t r) {
int64_t res = static_cast<int64_t>(l) + static_cast<int64_t>(r);
if (res > std::numeric_limits<int32_t>::max())
return std::numeric_limits<int32_t>::max();
else if (res < std::numeric_limits<int32_t>::min())
return std::numeric_limits<int32_t>::min();
else
return static_cast<int32_t>(res);
}
template <typename Dtype>
class TestFcOp {
public:
explicit TestFcOp(const Program<Dtype> p) : program_(p) {
use_optimize_ = true;
if (use_optimize_) {
to_predict_program_ = program_.optimizeProgram;
} else {
to_predict_program_ = program_.originProgram;
}
// round to zero
float round2zero(float v) {
float res = 0;  // v == 0 takes neither branch; avoid returning an uninitialized value
if (v > 0)
res = std::floor(v);
else if (v < 0)
res = std::ceil(v);
return res;
}
const std::vector<std::shared_ptr<BlockDesc>> blocks =
to_predict_program_->Blocks();
// DLOG << " **block size " << blocks.size();
for (int i = 0; i < blocks.size(); ++i) {
std::shared_ptr<BlockDesc> block_desc = blocks[i];
std::vector<std::shared_ptr<OpDesc>> ops = block_desc->Ops();
// DLOG << " ops " << ops.size();
for (int j = 0; j < ops.size(); ++j) {
std::shared_ptr<OpDesc> op = ops[j];
if (op->Type() == "fc" && op->Input("X")[0] == "pool2d_13.tmp_0") {
DLOG << " fc attr size: " << op->GetAttrMap().size();
DLOG << " inputs size: " << op->GetInputs().size();
DLOG << " outputs size: " << op->GetOutputs().size();
DLOG << " Input X is : " << op->Input("X")[0];
DLOG << " Input Y is : " << op->Input("Y")[0];
DLOG << " Input Y is : " << op->Input("Z")[0];
DLOG << " Output Out is : " << op->Output("Out")[0];
std::shared_ptr<operators::FusionFcOp<Dtype, float>> testOp =
std::make_shared<operators::FusionFcOp<Dtype, float>>(
op->Type(), op->GetInputs(), op->GetOutputs(),
op->GetAttrMap(), program_.scope);
ops_of_block_[*block_desc.get()].push_back(testOp);
int8_t qscale_int32(int32_t v, float scale) {
float res = static_cast<float>(v) * scale;
res = round2zero(res);
if (res > 127)
return static_cast<int8_t>(127);
else if (res < -127)
return static_cast<int8_t>(-127);
else
return static_cast<int8_t>(res);
}
template <typename T, typename S>
int TestFcOP() {
int32_t m = 377;
int32_t n = 1363;
int32_t k = 577;
int32_t lda = k;
int32_t ldb = n;
int32_t ldc = n;
DDim inputA_shape = make_ddim({m, k});
DDim inputB_shape = make_ddim({k, n});
DDim bias_shape = make_ddim({n});
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<Scope>();
inputs["X"] = std::vector<std::string>({"inputA"});
inputs["Y"] = std::vector<std::string>({"inputB"});
inputs["Z"] = std::vector<std::string>({"bias"});
inputs["Scale"] = std::vector<std::string>({"scale"});
outputs["Out"] = std::vector<std::string>({"output"});
auto inputA_var = scope.get()->Var("inputA");
auto inputA = inputA_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputA, inputA_shape, -127, 127);
auto inputB_var = scope.get()->Var("inputB");
auto inputB = inputB_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputB, inputB_shape, -127, 127);
auto bias_var = scope.get()->Var("bias");
auto bias = bias_var->template GetMutable<framework::LoDTensor>();
SetupTensor<S>(bias, bias_shape, -127, 127);
auto scale_var = scope.get()->Var("scale");
auto scale = scale_var->template GetMutable<framework::LoDTensor>();
scale->Resize(framework::make_ddim({1}));
float scale_v = 0.000828f;
scale->mutable_data<float>()[0] = scale_v;
auto output_var = scope.get()->Var("output");
AttributeMap attrs;
attrs["x_num_col_dims"].Set<int>(1);
attrs["y_num_col_dims"].Set<int>(1);
attrs["axis"].Set<int>(1);
operators::OperatorBase<CPU> *op = nullptr;
#ifdef FUSION_FC_INT8_OP
if (std::is_same<T, int8_t>::value) {
op = new operators::FusionFcInt8Op<CPU, T>("fusion_fc_int8", inputs,
outputs, attrs, scope);
} else {
op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
scope);
}
#else
op = new operators::FusionFcOp<CPU, T>("fusion_fc", inputs, outputs, attrs,
scope);
#endif
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
// compare
T *c = static_cast<T *>(memory::Alloc(sizeof(T) * m * n));
T *a = inputA->data<T>();
T *b = inputB->data<T>();
S *bias_data = bias->data<S>();
for (int32_t i = 0; i < m; ++i) {
for (int32_t j = 0; j < n; ++j) {
S bias_v = bias_data[j];
if (std::is_same<T, int8_t>::value) {
int32_t r = 0;
for (int32_t p = 0; p < k; p++) {
r += static_cast<int32_t>(a(i, p)) * static_cast<int32_t>(b(p, j));
}
r = qadd_int32(r, bias_v);
c(i, j) = qscale_int32(r, scale_v);
} else {
T r = 0;
for (int32_t p = 0; p < k; p++) {
r += a(i, p) * b(p, j);
}
r += bias_v;
c(i, j) = r;
}
}
}
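// For int8 inputs the reference mirrors the quantized pipeline: int32
// accumulation, a saturating bias add, then a round-to-zero rescale to int8.
// For float it is a plain GEMM with a bias vector added to every output row.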
std::shared_ptr<Tensor> predict(const Tensor &t1, const Tensor &t2,
const Tensor &t3) {
// feed
auto scope = program_.scope;
Variable *x_feed_value = scope->Var("pool2d_13.tmp_0");
auto tensor_x = x_feed_value->GetMutable<LoDTensor>();
tensor_x->ShareDataWith(t1);
Variable *y_feed_value = scope->Var("loss3_classifier-loc_weights");
auto tensor_y = y_feed_value->GetMutable<LoDTensor>();
tensor_y->ShareDataWith(t2);
Variable *z_feed_value = scope->Var("loss3_classifier-loc_biases");
auto tensor_z = z_feed_value->GetMutable<LoDTensor>();
tensor_z->ShareDataWith(t3);
Variable *con_output = scope->Var("loss3_classifier-loc.tmp_1");
auto *output_tensor = con_output->GetMutable<LoDTensor>();
output_tensor->mutable_data<float>({3, 10});
// DLOG << typeid(output_tensor).name();
// DLOG << "output_tensor dims: " << output_tensor->dims();
std::shared_ptr<LoDTensor> out_tensor = std::make_shared<LoDTensor>();
out_tensor.reset(output_tensor);
predict(t1, t2, t3, 0);
return out_tensor;
}
private:
const framework::Program<Dtype> program_;
std::shared_ptr<ProgramDesc> to_predict_program_;
std::map<framework::BlockDesc,
std::vector<std::shared_ptr<OperatorBase<Dtype>>>>
ops_of_block_;
bool use_optimize_ = false;
void predict(const Tensor &t1, const Tensor &t2, const Tensor &t3,
int block_id) {
std::shared_ptr<BlockDesc> to_predict_block =
to_predict_program_->Block(block_id);
for (int j = 0; j < ops_of_block_[*to_predict_block.get()].size(); ++j) {
auto op = ops_of_block_[*to_predict_block.get()][j];
DLOG << "op -> run()";
op->Run();
int32_t eq = 0;
int32_t neq = 0;
for (int32_t i = 0; i < m * n; ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == c[i],
"The execution of test_fusion_fc_op is failed!");
if (output_data[i] == c[i]) {
++eq;
} else {
++neq;
}
}
};
template class TestFcOp<CPU>;
} // namespace framework
std::cout << "mnk=" << m << " " << n << " " << k << " eq=" << eq
<< " neq=" << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main() {
DLOG << "----------**********----------";
DLOG << "begin to run Fc Test";
paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
// "../../../test/models/googlenet"
auto program = loader.Load(g_googlenet);
paddle_mobile::framework::ProgramOptimize optimize;
// program.originProgram->Description("origin");
auto optimize_program = optimize.FusionOptimize(program.originProgram);
program.optimizeProgram = optimize_program;
if (optimize_program != nullptr) {
optimize_program->Description("optimize");
} else {
LOG(paddle_mobile::kLOG_ERROR) << "optimize_program is null";
}
/// input x (1,3,224,224)
paddle_mobile::framework::LoDTensor inputx;
SetupTensor<float>(&inputx, {3, 64, 1, 1}, static_cast<float>(1),
static_cast<float>(1));
auto *inputx_ptr = inputx.data<float>();
/// input y (224,)
paddle_mobile::framework::LoDTensor inputy;
SetupTensor<float>(&inputy, {64, 10}, static_cast<float>(1.5),
static_cast<float>(1.5));
auto *inputy_ptr = inputy.data<float>();
paddle_mobile::framework::LoDTensor inputz;
SetupTensor<float>(&inputz, {10}, static_cast<float>(0),
static_cast<float>(1));
auto *inputz_ptr = inputz.data<float>();
paddle_mobile::framework::TestFcOp<paddle_mobile::CPU> testFcOp(program);
auto output = testFcOp.predict(inputx, inputy, inputz);
auto *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << "value of output: " << output_ptr[j];
}
DLOG << "1 (3,64) * 2 (64,10) = 96(3,10)";
DLOG << "output : 96(3,10) + bias(10)";
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4);
#ifdef FUSION_FC_INT8_OP
paddle_mobile::TestFcOP<int8_t, int32_t>();
#endif
paddle_mobile::TestFcOP<float, float>();
return 0;
}
......@@ -12,6 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_helper.h"
#include "../test_include.h"
#include "operators/mul_op.h"
......@@ -79,14 +80,14 @@ int TestMulOP() {
PADDLE_MOBILE_ENFORCE(
output_data[i] == c[i], "output[%d] = %d, output_cmp[%d] = %d", i,
static_cast<int32_t>(output_data[i]), i, static_cast<int32_t>(c[i]));
if (static_cast<int>(output_data[i] == c[i])) {
if (output_data[i] == c[i]) {
++eq;
} else {
++neq;
}
}
DLOG << "mnk=" << m << " " << n << " " << k << " eq=" << eq
<< " neq=" << neq;
std::cout << "mnk=" << m << " " << n << " " << k << " eq=" << eq
<< " neq=" << neq << std::endl;
delete op;
return 0;
}
......@@ -94,7 +95,7 @@ int TestMulOP() {
int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(8);
paddle_mobile.SetThreadNum(4);
paddle_mobile::TestMulOP<int8_t, int32_t>();
paddle_mobile::TestMulOP<float, float>();
return 0;
......
......@@ -12,30 +12,281 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <iostream>
#include "../test_include.h"
#include "operators/kernel/central-arm-func/pool_arm_func.h"
#include "operators/pool_op.h"
int main() {
paddle_mobile::framework::Loader<paddle_mobile::CPU> loader;
auto program = loader.Load(std::string(g_googlenet));
if (program.originProgram == nullptr) {
DLOG << "program read file";
namespace paddle_mobile {
static int PoolOutputSize(int input_size, int filter_size, int padding,
int stride, bool ceil_mode) {
int output_size;
if (!ceil_mode) {
output_size = (input_size - filter_size + 2 * padding) / stride + 1;
} else {
output_size =
(input_size - filter_size + 2 * padding + stride - 1) / stride + 1;
}
return output_size;
}
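// Example: input_size = 8, filter_size = 3, padding = 0, stride = 2 gives
// (8 - 3) / 2 + 1 = 3 with ceil_mode = false, and (8 - 3 + 1) / 2 + 1 = 4
// with ceil_mode = true (the extra window covers the right/bottom edge).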
template <typename T>
static void PoolAvgPad0(std::vector<int> ksize, std::vector<int> strides,
const framework::Tensor *input,
framework::Tensor *out) {
const int32_t batch_size = input->dims()[0];
const int32_t input_c = input->dims()[1];
const int32_t input_h = input->dims()[2];
const int32_t input_w = input->dims()[3];
const int32_t out_c = out->dims()[1];
const int32_t out_h = out->dims()[2];
const int32_t out_w = out->dims()[3];
const int32_t kernel_h = ksize[0];
const int32_t kernel_w = ksize[1];
const int32_t stride_h = strides[0];
const int32_t stride_w = strides[1];
const int32_t inputdata_channel_stride = input_h * input_w;
const int32_t input_batch_stride = input_c * inputdata_channel_stride;
const int32_t outputdata_channel_stride = out_h * out_w;
const int32_t output_batch_stride = out_c * outputdata_channel_stride;
T *out_data = out->mutable_data<T>();
const T *input_data = input->data<T>();
const T **rows = new const T *[kernel_h];
for (int i = 0; i < batch_size; ++i) {
for (int j = 0; j < out_c; ++j) {
const T *img_in = input_data + j * inputdata_channel_stride;
T *img_out = out_data + j * outputdata_channel_stride;
for (int k = 0; k < out_h; ++k) {
for (int m = 0; m < kernel_h; ++m) {
rows[m] = img_in + (stride_h * k + m) * input_w;
}
int32_t left = out_w;
while (left > 0) {
float tmp = 0;
for (int m = 0; m < kernel_h; ++m) {
for (int l = 0; l < kernel_w; ++l) {
tmp += rows[m][l];
}
}
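// The int8 path below rounds the average to the nearest integer and clamps it
// to [-127, 127]; for other element types the quotient is stored as computed.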
if (typeid(T) == typeid(int8_t)) {
tmp = tmp / (kernel_h * kernel_w);
if (tmp < -127) {
*img_out = -127;
} else if (tmp > 127) {
*img_out = 127;
} else {
*img_out = static_cast<T>(std::round(tmp));
}
} else {
*img_out = static_cast<T>(tmp / (kernel_h * kernel_w));
}
for (int m = 0; m < kernel_h; ++m) {
rows[m] += stride_w;
}
img_out++;
left--;
}
}
}
input_data += input_batch_stride;
out_data += output_batch_stride;
}
delete[] rows;
}
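// Minimal usage sketch (illustrative only, shapes assumed): average-pool a
// 1x1x4x4 int8 tensor with a 2x2 kernel and stride 2 in NCHW layout:
//   framework::Tensor in, out;
//   in.mutable_data<int8_t>(framework::make_ddim({1, 1, 4, 4}));
//   out.mutable_data<int8_t>(framework::make_ddim({1, 1, 2, 2}));
//   PoolAvgPad0<int8_t>({2, 2}, {2, 2}, &in, &out);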
template <typename T, int CeilMode, int PoolType, int Kernel, int Pad,
int Stride>
int TestPoolOp(int in_channels, int in_height, int in_width) {
int kernel_h = Kernel;
int kernel_w = Kernel;
int pad_h = Pad;
int pad_w = Pad;
int stride_h = Stride;
int stride_w = Stride;
bool ceil_mode = CeilMode != 0;
std::string pooling_type = (PoolType == 0 ? "max" : "avg");
int batch_size = 1;
int input_c = in_channels;
int input_h = in_height;
int input_w = in_width;
framework::DDim input_shape =
framework::make_ddim({batch_size, input_c, input_h, input_w});
std::vector<int64_t> output_shape_v({batch_size, input_c});
output_shape_v.push_back(
PoolOutputSize(input_h, kernel_h, pad_h, stride_h, ceil_mode));
output_shape_v.push_back(
PoolOutputSize(input_w, kernel_w, pad_w, stride_w, ceil_mode));
framework::DDim output_shape = framework::make_ddim(output_shape_v);
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::PoolOp<paddle_mobile::CPU, float>>
executor(program, "pool2d");
VariableNameMap inputs;
VariableNameMap outputs;
auto scope = std::make_shared<framework::Scope>();
inputs["X"] = std::vector<std::string>({"input"});
outputs["Out"] = std::vector<std::string>({"output"});
paddle_mobile::framework::Tensor input;
SetupTensor<float>(&input, {1, 64, 112, 112}, static_cast<float>(0),
static_cast<float>(1));
auto out_ddim = paddle_mobile::framework::make_ddim({1, 64, 56, 56});
auto output =
executor.Predict(input, "conv2d_0.tmp_1", "pool2d_0.tmp_0", out_ddim);
auto input_var = scope.get()->Var("input");
auto input = input_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(input, input_shape, -127, 127);
float *output_ptr = output->data<float>();
for (int j = 0; j < output->numel(); ++j) {
DLOG << " value of output: " << output_ptr[j];
auto output_var = scope.get()->Var("output");
framework::AttributeMap attrs;
attrs["pooling_type"].SetString(pooling_type);
attrs["ksize"].Set<vector<int>>(std::vector<int>({kernel_h, kernel_w}));
attrs["strides"].Set<vector<int>>(std::vector<int>({stride_h, stride_w}));
attrs["paddings"].Set<vector<int>>(std::vector<int>({pad_h, pad_w}));
attrs["ceil_mode"].Set<bool>(false);
attrs["global_pooling"].Set<bool>(false);
auto *op = new operators::PoolOp<CPU, float>("pool2d", inputs, outputs, attrs,
scope);
op->InferShape();
op->Init();
op->Run();
framework::Tensor output_cmp;
output_cmp.mutable_data<T>(output_shape);
if (pooling_type == "avg" && pad_h == 0 && pad_h == pad_w) {
PoolAvgPad0<T>(std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, input, &output_cmp);
} else {
if (typeid(T) == typeid(int8_t)) {
operators::PoolBasic<int8_t, int32_t>(
pooling_type, std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
input, &output_cmp);
} else {
operators::PoolBasic<float, float>(
pooling_type, std::vector<int>{kernel_h, kernel_w},
std::vector<int>{stride_h, stride_w}, std::vector<int>{pad_h, pad_w},
input, &output_cmp);
}
}
// compare results
int eq = 0;
int neq = 0;
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
T *output_cmp_data = output_cmp.data<T>();
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
"The execution of test_pool_op is failed!");
if (output_data[i] == output_cmp_data[i]) {
++eq;
} else {
++neq;
}
}
std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main(int argc, char *argv[]) {
if (argc < 4) {
LOG(paddle_mobile::kLOG_INFO)
<< "Usage:\n"
<< " ./test-pool-op in_channels in_height in_width \n"
<< " params:\n"
<< " -in_channels: int, input image's channels\n"
<< " -in_height: int, input image's height\n"
<< " -in_width: int, input image's width\n";
return 1;
}
int in_channels = atoi(argv[1]);
int in_height = atoi(argv[2]);
int in_width = atoi(argv[3]);
#if __ARM_NEON
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
paddle_mobile::TestPoolOp<float, 0, 0, 3, 1, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
paddle_mobile::TestPoolOp<float, 0, 0, 3, 0, 2>(in_channels, in_height,
in_width);
#endif
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 1, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 2, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=0, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 0, 2>(in_channels, in_height,
in_width);
// kernel = 3, pad = 1, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=1, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 1, 2>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=2, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 2, 2>(in_channels, in_height,
in_width);
// kernel = 3, pad = 3, stride = 3
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=max, kernel=3, pad=3, stride=3";
paddle_mobile::TestPoolOp<int8_t, 0, 0, 3, 3, 3>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 1>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 2
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=2";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 2>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 3
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=3";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 7, 0, 3>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=1";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 1>(in_channels, in_height,
in_width);
// kernel = 3, pad = 0, stride = 3
LOG(paddle_mobile::kLOG_INFO)
<< "int8_t, ceil_mode=false, pooling_type=avg, kernel=3, pad=0, stride=3";
paddle_mobile::TestPoolOp<int8_t, 0, 1, 3, 0, 3>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=1";
paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 1>(in_channels, in_height,
in_width);
// kernel = 7, pad = 0, stride = 4
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=avg, kernel=7, pad=0, stride=4";
paddle_mobile::TestPoolOp<float, 0, 1, 7, 0, 4>(in_channels, in_height,
in_width);
// kernel = 5, pad = 0, stride = 1
LOG(paddle_mobile::kLOG_INFO)
<< "float, ceil_mode=false, pooling_type=avg, kernel=5, pad=0, stride=1";
paddle_mobile::TestPoolOp<float, 0, 1, 5, 0, 1>(in_channels, in_height,
in_width);
}
......@@ -213,6 +213,8 @@ if(NOT FOUND_MATCH)
set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADDPRELU_OP ON)
set(FUSION_CONVADDRELU_OP ON)
set(FUSION_CONVADDRELU_INT8_OP ON)
set(FUSION_FC_INT8_OP ON)
set(FUSION_FC_OP ON)
set(LRN_OP ON)
set(MUL_OP ON)
......@@ -312,6 +314,9 @@ endif()
if (FUSION_CONVADDRELU_OP)
add_definitions(-DFUSION_CONVADDRELU_OP)
endif()
if (FUSION_CONVADDRELU_INT8_OP)
add_definitions(-DFUSION_CONVADDRELU_INT8_OP)
endif()
if (FUSION_CONVADDPRELU_OP)
add_definitions(-DFUSION_CONVADDPRELU_OP)
endif()
......@@ -321,6 +326,9 @@ endif()
if (FUSION_FC_OP)
add_definitions(-DFUSION_FC_OP)
endif()
if(FUSION_FC_INT8_OP)
add_definitions(-DFUSION_FC_INT8_OP)
endif()
if (LRN_OP)
add_definitions(-DLRN_OP)
endif()
......