Commit c569382c authored by H hjchen2

Merge branch 'dev-latest' of https://github.com/hjchen2/paddle-mobile into dev-latest

...@@ -196,19 +196,35 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
    arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
    arg->conv_arg[i].filter_scale_address = filter->scale;
    // arg->conv_arg[i].filter_address = &(
    //     (int8_t *)filter_ptr)[i * element_num * filter_num_per_div];  // NOLINT
    // arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
    arg->conv_arg[i].filter_num = (uint32_t)(
        i == n - 1 ? channel - (n - 1) * filter_num_per_div  // NOLINT
                   : filter_num_per_div);
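    // Each split now copies its own slice of the filter data and of the
    // bias/scale pairs into freshly allocated FPGA memory and flushes it,
    // instead of pointing into the shared host-side buffers (those buffers
    // are detached/freed at the end of this function).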
size_t filter_size =
element_num * arg->conv_arg[i].filter_num * sizeof(int8_t);
auto filter_head =
&((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size);
size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float);
auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
fpga_flush(arg->conv_arg[i].sb_address, bs_size);
    if (n > 1) {
      arg->conv_arg[i].output.scale_address =
          (float *)fpga_malloc(2 * sizeof(float));  // NOLINT
      arg->conv_arg[i].output.address =
          fpga_malloc(out->dims()[2] *
                      align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num,
                                 IMAGE_ALIGNMENT) *
                      sizeof(half));
    } else {
...@@ -221,6 +237,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
    arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
    arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
  }

  filter->reset_data_ptr(nullptr);
  fpga_free(bs_ptr);
}
}  // namespace fpga
......
...@@ -137,24 +137,23 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num,
  int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
  int num_per_div_after_alignment =
      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);

  char *tmp = *data_in;
  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
  int num_element = div_num * num_per_div_after_alignment * align_chw;
  char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char));  // NOLINT

  memset(data_tmp, 0, num_element * sizeof(char));

  for (i = 0; i < div_num; i++) {
    memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
           *data_in + num_per_div_before_alignment * align_chw * i,
           num_per_div_before_alignment * align_chw);
  }

  *data_in = data_tmp;
  fpga_free(tmp);
}

void reorder(char **data_in, int num_after_alignment, int chw) {
...@@ -223,7 +222,10 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
  char **quantize_data = (char **)data_in;  // NOLINT
  convert_to_hwc(quantize_data, num, channel, height, width);
  align_element(quantize_data, num, chw);
  if (num_after_alignment != num) {
    align_num(quantize_data, num_per_div_before_alignment, num, chw);
  }
  reorder(quantize_data, num_after_alignment, chw);
  interleave(quantize_data, num_after_alignment, chw);
  fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
...@@ -254,15 +256,18 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
      align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
  int div_num =
      (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
  int residual = num % num_per_div_before_alignment;
  int num_after_alignment = num_per_div_after_alignment *
                                ((residual == 0) ? div_num : (div_num - 1)) +
                            align_to_x(residual, FILTER_NUM_ALIGNMENT);
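  // Every full partition is padded up to num_per_div_after_alignment filters;
  // only the final, partial partition pads its residual up to FILTER_NUM_ALIGNMENT.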
  quantize(data_in, data_size, max);
  char **quantize_data = (char **)data_in;  // NOLINT
  convert_fc_filter(quantize_data, num, chw);
  align_element(quantize_data, num, chw);
  if (num_after_alignment != num) {
    align_num(quantize_data, num_per_div_before_alignment, num, chw);
  }
  reorder(quantize_data, num_after_alignment, chw);
  interleave(quantize_data, num_after_alignment, chw);
  fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
......
...@@ -13,16 +13,172 @@ See the License for the specific language governing permissions and
limitations under the License. */

#include "fpga/common/pe.h"
#include <unistd.h>
#include <iomanip>
#include <iostream>
#include "common/types.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#include "fpga/common/config.h"
#include "fpga/common/driver.h"

using namespace std;
using namespace paddle_mobile::fpga::driver;  // NOLINT

namespace paddle_mobile {
namespace fpga {
#define IMAGE_ALIGN 16
#define FILTER_ALIGN 16
#define FILTER_NUM_ALIGN 32
#define USE_RELU 1
#define USE_BIAS 2
// bypass cmd
#define CMD_FP16_TO_FP16 0
#define CMD_FP16_TO_FP32 1
#define CMD_FP32_TO_FP16 2
#define CMD_FP32_TO_FP32 3
// bypass macro
#define SIZE_FP16 2
#define SIZE_FP32 4
#define PE_IRQ_TIMEOUT 1000000
/* Interrupt bit-set offset*/
#define INTERRUPT_RSVD 0x0001
#define INTERRUPT_BYPASS 0x0002
#define INTERRUPT_CONV 0x0004
#define INTERRUPT_POOLING 0x0008
#define INTERRUPT_EW 0x0010
//#define INTERRUPT_RESIZE 0x0020
/* Register offset */
#define REG_INTERRUPT 0x000
#define REG_VERSION 0x008
#define REG_TEMPERATURE 0x010
#define REG_FPGA_RESET 0x018
#define REG_TEST_REGISTER 0x048
#define REG_HARDWARE_STATUS 0x050
#define REG_TIMER_COUNTER 0x070
#define REG_SCALE_PARAMETER 0x080
#define REG_FLASH_CMD 0x200
#define REG_FLASH_DATA 0x208
#define REG_FLASH_CONFIG 0x210
#define REG_FLASH_STATUS 0x218
#define REG_SN 0x220
//#define REG_READ_SCALE
//#define REG_WRITE_SCALE
/*bypass*/
#define REG_CONVERT_CMD 0x400
#define REG_CONVERT_SRC_ADDR 0x408
#define REG_CONVERT_DST_ADDR 0x410
#define REG_CONVERT_LENGTH 0x418
/*resize*/
#define REG_RESIZE_CMD 0x600
#define REG_RESIZE_CHANNEL_NUMBER 0x608
#define REG_RESIZE_INPUT_IMAGE_PIXEL 0x610
#define REG_RESIZE_OUTPUT_IMAGE_PIXEL 0x618
#define REG_RESIZE_INPUT_BASE_ADDR 0x620
#define REG_RESIZE_WEIGHT_BASE_ADDR 0x628
#define REG_RESIZE_SRC_POS_BASE_ADDR 0x630
#define REG_RESIZE_OUTPUT_BASE_ADDR 0x638
/*pooling*/
#define REG_POOLING_CMD 0x800
#define REG_POOLING_IMAGE_BASE_ADDR 0x808
#define REG_POOLING_RESULT_BASE_ADDR 0x810
#define REG_POOLING_IMAGE_PIXEL 0x818
#define REG_POOLING_WINDOW_SIZE 0x820
#define REG_POOLING_RESULT_PIXEL 0x828
#define REG_POOLING_PAD_PIXEL 0x830
#define REG_POOLING_STEP_PIXEL 0x838
#define REG_POOLING_CHANNEL_NUMBER 0x840
#define REG_POOLING_IMAGE_AMOUNT_PER_ROW 0x848
#define REG_POOLING_IMAGE_ONE_PAD_PER_ROW 0x850
#define REG_POOLING_IMAGE_TWO_PAD_PER_ROW 0x858
#define REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT 0x860
#define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868
#define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870
#define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878
#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880
#define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888
#define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898
#define REG_POOLING_MODE_RECIPROCAL 0x890
/*conv*/
#define REG_CONV_CMD 0xC00
#define REG_CONV_IMAGE_BASE_ADDR 0xC08
#define REG_CONV_FILTER_BASE_ADDR 0xC10
#define REG_CONV_SB_BASE_ADDR 0xC18
#define REG_CONV_RESULT_BASE_ADDR 0xC20
#define REG_CONV_IMAGE_PIXEL 0xC28
#define REG_CONV_FILTER_PIXEL 0xC30
#define REG_CONV_RESULT_PIXEL 0xC38
#define REG_CONV_PAD_PIXEL 0xC40
#define REG_CONV_STEP_PIXEL 0xC48
#define REG_CONV_GROUP_NUMBER 0xC50
#define REG_CONV_FILTER_NUMBER 0xC58
#define REG_CONV_CHANNEL_NUMBER 0xC60
#define REG_CONV_FILTER_PER_GROUP 0xC68
#define REG_CONV_CHANNEL_PER_GROUP 0xC70
#define REG_CONV_IMAGE_AMOUNT_PER_ROW 0xC78
#define REG_CONV_IMAGE_ONE_PAD_PER_ROW 0xC80
#define REG_CONV_IMAGE_TWO_PAD_PER_ROW 0xC88
#define REG_CONV_FILTER_AMOUNT_ALL 0xC90
#define REG_CONV_RESULT_AMOUNT_PER_ROW 0xC98
#define REG_CONV_RESULT_LAST_VALID 0xCA0
#define REG_CONV_BLOCK_AMOUNT_PER_ROW 0xCA8
#define REG_CONV_FILTER_PAD_WIDTH_MUL_CH 0xCB0
#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN_F 0xCB8
#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN 0xCC0
#define REG_CONV_IMAGE_BLOCK_NUM 0xCC8
#define REG_CONV_IMAGE_BLOCK_LEN 0xCD0
#define REG_CONV_IMAGE_BLOCK_LEN_LAST 0xCD8
#define REG_CONV_IMAGE_WIN_CNT 0xCE0
#define REG_CONV_IMAGE_WIN_CNT_LAST 0xCE8
#define REG_CONV_RES_ROW_DATA_ALIGN4_PAD 0xCF8
#define REG_CONV_PROG_FULL_CNT 0xD08
#define REG_CONV_POST_PROG_FULL_CNT 0xD10
#define REG_CONV_FPGA_BIAS_SCALE_LEN 0xD20
#define REG_CONV_IMAGE_SCALE 0xD28
#define REG_CONV_FILTER_SCALE 0xD30
/*ew*/
#define REG_EW_CMD 0x0F00
#define REG_EW_IMAGE0_BASE_ADDR 0x0F08
#define REG_EW_IMAGE1_BASE_ADDR 0x0F10
#define REG_EW_RESULT_BASE_ADDR 0x0F18
#define REG_EW_DATA_LEN 0x0F20
#define REG_EW_COEFFICIENT 0x0F28
#define REG_EW_IMAGE_PIXEL 0x0F30
#define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38
int ComputeFpgaConv(const struct SplitConvArgs &args) {
  // ComputeBasicConv(args.conv_arg[0]);
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFPGAConv===========";
  DLOG << " filter_num:" << args.filter_num
       << " group_num:" << args.group_num
       << " split_num:" << args.split_num;
#endif

  int split_num = args.split_num;
  for (int i = 0; i < split_num; i++) {
    ComputeBasicConv(args.conv_arg[i]);
  }

  if (split_num > 1) {
    ComputeFPGAConcat(args.concat_arg);
  }
  return 0;
}
int ComputeBasicConv(const struct ConvArgs &args) {
...@@ -47,9 +203,237 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  DLOG << " out_address:" << args.output.address
       << " out_scale_address:" << args.output.scale_address;
#endif
cout << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
cout << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#ifdef PADDLE_MOBILE_ZU5
  DLOG << "Conv";
  // return 0;
uint64_t timer_cnt;
uint64_t output_scale;
uint64_t image_scale;
uint64_t filter_scale;
uint64_t image_address_phy = 0;
uint64_t sb_address_phy = 0;
uint64_t filter_address_phy = 0;
uint64_t output_address_phy = 0;
int ret = 0;
fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float));
fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float));
cout << "image_scale :" << hex << (image_scale) << endl;
cout << "filter_scale :" << hex << (filter_scale) << endl;
uint64_t filterlen = (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height *
(uint64_t)args.image.channels;
filterlen = align_to_x(filterlen, FILTER_ALIGN);
filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGN);
uint64_t fpga_bias_scale_len =
align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
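// Output spatial size follows the usual convolution formula:
// out = (in + 2 * pad - kernel) / stride + 1.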
uint64_t output_height =
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
1;
uint64_t output_width =
(args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1;
uint64_t output_size =
output_height * output_width * (uint64_t)args.filter_num;
uint64_t filter_per_group = (uint64_t)(args.filter_num / args.group_num);
uint64_t channel_per_group = (uint64_t)(args.image.channels / args.group_num);
uint64_t image_row_count = ((uint64_t)args.image.width) *
((uint64_t)args.image.channels); // without align
uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGN);
uint64_t image_one_pad_per_row =
align_to_x(image_row_count, IMAGE_ALIGN) +
((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
uint64_t filter_amount_all =
align_to_x(((uint64_t)args.kernel.height) *
((uint64_t)args.kernel.width) * channel_per_group,
FILTER_ALIGN);
uint64_t output_amount_per_row =
align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGN);
// find the opt partition strategy
uint64_t res_win;
uint64_t res_fit = 0;
for (res_win = 1; res_win <= output_width; res_win = res_win + 1) {
if ((align_to_x(
(args.image.channels *
(args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
IMAGE_ALIGN) /
16 +
1) *
args.kernel.height >
2048) {
break;
}
}
if (res_win != output_width) {
res_win -= 1;
}
if (((res_win % 2) != 0) && (res_win != 1)) {
res_win = res_win - 1;
}
res_fit = res_win;
uint64_t block_num = (output_width + res_fit - 1) / res_fit;
uint64_t block_len = res_fit;
uint64_t block_last = output_width - res_fit * (block_num - 1);
uint64_t res_amount_per_row = output_width * args.filter_num;
uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
uint64_t image_block_amount_per_row =
args.kernel.stride_w * (res_fit)*args.image.channels;
uint64_t filter_pad_width_mul_channel =
args.image.pad_width * args.image.channels;
uint64_t image_amount_per_row_multi_win_first =
image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
uint64_t image_amount_per_row_multi_win =
image_amount_per_row * (4 * args.kernel.stride_h);
uint64_t image_block_num = block_num;
uint64_t image_block_len =
align_to_x((args.image.channels *
(args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
IMAGE_ALIGN) /
16 +
1;
uint64_t image_block_len_last =
align_to_x(
(args.image.channels *
(args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
IMAGE_ALIGN) /
16 +
1;
uint64_t image_win_cnt = block_len;
uint64_t image_win_cnt_last = block_last;
uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8;
uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
if (prog_full_cnt == 1023) {
prog_full_cnt--;
}
uint64_t post_prog_full_cnt =
(512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
: 0;
image_address_phy = vaddr_to_paddr(args.image.address);
sb_address_phy = vaddr_to_paddr(args.sb_address);
filter_address_phy = vaddr_to_paddr(args.filter_address);
output_address_phy = vaddr_to_paddr(args.output.address);
/* The SDK flushes the cache to ensure data consistency */
uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
ret = -EIO;
DLOG << "Conv Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR);
reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR);
reg_writeq(
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
REG_CONV_IMAGE_PIXEL);
reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
REG_CONV_FILTER_PIXEL);
reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL);
reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32),
REG_CONV_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32),
REG_CONV_STEP_PIXEL);
reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP);
reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
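// The raw offsets below correspond to the REG_CONV_* block/window registers
// defined above (0xCA8 = REG_CONV_BLOCK_AMOUNT_PER_ROW,
// 0xCB0 = REG_CONV_FILTER_PAD_WIDTH_MUL_CH, ..., 0xD20 = REG_CONV_FPGA_BIAS_SCALE_LEN).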
reg_writeq(image_block_amount_per_row, 0xca8);
reg_writeq(filter_pad_width_mul_channel, 0xcb0);
reg_writeq(image_amount_per_row_multi_win_first, 0xcb8);
reg_writeq(image_amount_per_row_multi_win, 0xcc0);
reg_writeq(image_block_num, 0xcc8);
reg_writeq(image_block_len, 0xcd0);
reg_writeq(image_block_len_last, 0xcd8);
reg_writeq(image_win_cnt, 0xce0);
reg_writeq(image_win_cnt_last, 0xce8);
reg_writeq(res_row_data_align4_pad, 0xcf8);
reg_writeq(prog_full_cnt, 0xd08);
reg_writeq(post_prog_full_cnt, 0xd10);
reg_writeq(fpga_bias_scale_len / 4, 0xd20);
/*write scale*/
reg_writeq(image_scale, REG_CONV_IMAGE_SCALE);
reg_writeq(filter_scale, REG_CONV_FILTER_SCALE);
reg_writeq(cmd, REG_CONV_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
ret = -EIO;
DLOG << "Conv Wait Irq Timeout!";
}
DLOG << "after reg poll";
usleep(40);
/* The SDK invalidates the cache to ensure data consistency */
output_scale = reg_readq(REG_SCALE_PARAMETER);
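// REG_SCALE_PARAMETER packs two 32-bit scale words; swap the halves of the
// 64-bit value before copying both words back to the output scale_address.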
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
cout << "output_scale :" << hex << (output_scale) << endl;
//*(args.output.scale_address) = output_scale;
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
  return 0;
...@@ -74,8 +458,135 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
  DLOG << " out_address:" << args.output.address
       << " out_scale_address:" << args.output.scale_address;
#endif
#ifdef PADDLE_MOBILE_ZU5
  DLOG << "Pooling";
  // return 0;
uint64_t output_scale = 0;
uint64_t timer_cnt = 0;
int ret = 0;
uint64_t cmd = 0;
uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0;
image_physical_address = vaddr_to_paddr(args.image.address);
output_physical_address = vaddr_to_paddr(args.output.address);
uint32_t output_height = (uint32_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
1);
uint32_t output_width = (uint32_t)(
(args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1);
uint64_t image_amount_per_row = align_to_x(
(uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGN);
uint64_t image_one_pad_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
FILTER_ALIGN) +
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t image_two_pad_per_row = align_to_x(
((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) *
(uint64_t)args.image.channels,
IMAGE_ALIGN);
uint64_t image_row_mul_pooling_hight =
image_amount_per_row * (uint64_t)args.kernel.height;
uint64_t image_row_mul_pad_hight =
image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_row_mul_step_hight =
image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t result_amount_align_32 = align_to_x(
(uint64_t)output_width * (uint64_t)args.image.channels, FILTER_ALIGN);
uint64_t result_amount_align_64 = align_to_x(
(uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGN);
uint64_t image_calcu_height =
(uint64_t)args.kernel.height +
((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t image_pad_left = args.image.channels * args.image.pad_width;
uint64_t image_skip_window = args.image.channels * args.kernel.stride_w;
uint64_t image_padleft_skipwindow =
(image_skip_window << 32) | image_pad_left;
uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 |
(((uint64_t)args.kernel_reciprocal));
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO;
DLOG << "Conv Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
reg_writeq(
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
REG_POOLING_IMAGE_PIXEL);
reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
REG_POOLING_WINDOW_SIZE);
reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32),
REG_POOLING_RESULT_PIXEL);
reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32),
REG_POOLING_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32),
REG_POOLING_STEP_PIXEL);
reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER);
reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW);
reg_writeq(image_row_mul_pooling_hight,
REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT);
reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT);
reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT);
reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32);
reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64);
reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT);
reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW);
reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL);
/* The SDK flushes the cache to ensure data consistency */
reg_writeq(cmd, REG_POOLING_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
ret = -EIO;
DLOG << "Pooling Wait Irq Timeout!";
}
DLOG << "after reg poll";
usleep(40);
/* The SDK invalidates the cache to ensure data consistency */
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
  return 0;
}
...@@ -103,8 +614,73 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
  DLOG << " out_address:" << args.output.address
       << " out_scale_address:" << args.output.scale_address;
#endif
#ifdef PADDLE_MOBILE_ZU5
  DLOG << "EWAdd";
  // return 0;
int ret = 0;
uint64_t output_scale = 0;
uint64_t timer_cnt = 0;
uint64_t image0_address_phy = 0;
uint64_t image1_address_phy = 0;
uint64_t output_address_phy = 0;
uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
uint64_t datalen = (uint64_t)args.image0.width *
(uint64_t)args.image0.height *
(uint64_t)args.image0.channels;
uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO;
DLOG << "Conv Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
image0_address_phy = vaddr_to_paddr(args.image0.address);
image1_address_phy = vaddr_to_paddr(args.image1.address);
output_address_phy = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
IMAGE_ALIGN);
uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
((uint64_t)args.image0.width << 16) |
(uint64_t)args.image0.height;
/* The SDK flushes the cache to ensure data consistency */
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR);
reg_writeq(datalen, REG_EW_DATA_LEN);
reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL);
reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW);
reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR);
reg_writeq(coefficient, REG_EW_COEFFICIENT);
reg_writeq(cmd, REG_EW_CMD);
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
ret = -EIO;
DLOG << "EW Wait Irq Timeout!";
}
usleep(40);
/* The SDK invalidates the cache to ensure data consistency */
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
  return 0;
}
...@@ -126,8 +702,117 @@ int PerformBypass(const struct BypassArgs &args) {
  DLOG << " out_address:" << args.output.address
       << " out_scale_address:" << args.output.scale_address;
#endif
#ifdef PADDLE_MOBILE_ZU5
  DLOG << "Bypass";
  // return 0;
struct fpga_pe *pe;
uint64_t output_scale = 0;
uint64_t timer_cnt = 0;
uint64_t cmd = 0;
uint64_t datalen = 0;
uint64_t input_address_phy = 0;
uint64_t output_address_phy = 0;
uint8_t data_cell_in = 0;
uint8_t data_cell_out = 0;
int ret = 0;
datalen = (uint64_t)args.image.width * (uint64_t)args.image.height *
(uint64_t)args.image.channels;
datalen = align_to_x(datalen, 16);
input_address_phy = vaddr_to_paddr(args.image.address);
output_address_phy = vaddr_to_paddr(args.output.address);
DLOG << "input_phy:" << input_address_phy;
DLOG << "output_phy:" << output_address_phy;
switch (args.input_data_type) {
case DATA_TYPE_FP16: {
switch (args.output_data_type) {
case DATA_TYPE_FP16:
data_cell_in = SIZE_FP16;
data_cell_out = SIZE_FP16;
cmd = CMD_FP16_TO_FP16;
break;
case DATA_TYPE_FP32:
data_cell_in = SIZE_FP16;
data_cell_out = SIZE_FP32;
cmd = CMD_FP16_TO_FP32;
break;
default:
break;
}
} break;
case DATA_TYPE_FP32: {
switch (args.output_data_type) {
case DATA_TYPE_FP16:
data_cell_in = SIZE_FP32;
data_cell_out = SIZE_FP16;
cmd = CMD_FP32_TO_FP16;
break;
case DATA_TYPE_FP32:
data_cell_in = SIZE_FP32;
data_cell_out = SIZE_FP32;
cmd = CMD_FP32_TO_FP32;
break;
default:
break;
}
} break;
default:
break;
}
if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 &&
cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32) {
return -EFAULT;
}
if ((data_cell_in != SIZE_FP16 && data_cell_in != SIZE_FP32) ||
(data_cell_out != SIZE_FP16 && data_cell_out != SIZE_FP32)) {
return -EFAULT;
}
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status) {
ret = -EIO;
DLOG << "Bypass Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR);
reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR);
reg_writeq(datalen, REG_CONVERT_LENGTH);
/* The SDK flushes the cache to ensure data consistency */
reg_writeq(cmd, REG_CONVERT_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR;
ret = -EIO;
DLOG << "BYPASS Wait Irq Timeout!";
}
DLOG << "after reg poll";
usleep(40);
/* The SDK invalidates the cache to ensure data consistency */
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
  return 0;
...@@ -138,11 +823,14 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
  DLOG << "=============ComputeFpgaConcat===========";
  DLOG << " Image_num: " << args.image_num
       << " out_address:" << args.image_out
       << " out_scale_address:" << args.scale_out
       << " out_channel:" << args.out_channel;
  DLOG << " image_height:" << args.height << " image_width:" << args.width;
  for (int i = 0; i < args.image_num; i++) {
    DLOG << " " << i << "th: ";
    DLOG << " channel_num:"
         << args.channel_num[i]
         // << " aligned_channel_num:" << args.aligned_channel_num[i]
         << " image_address:" << args.images_in[i]
         << " image_scale_address:" << args.scales_in[i];
  }
...@@ -154,6 +842,82 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
  return 0;
}
void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel,
int sub_height, int sub_width, int omit_size) {
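// Re-packs the interleaved rows produced by the sub-convolutions into a single
// deconvolution output, dropping omit_size rows/columns of border padding and
// aligning every output row to 16 elements.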
int origin_h = sub_height * sub_conv_n;
int origin_w = sub_width * sub_conv_n;
int align_origin_w = align_to_x(origin_w * channel, 16);
int deconv_h = origin_h - 2 * omit_size;
int deconv_w = origin_w - 2 * omit_size;
int deconv_row_len = deconv_w * channel;
int align_deconv_row_len = align_to_x(deconv_row_len, 16);
half *ptr_tmp = *data_in;
half *ptr_deconv =
(half *)fpga_malloc(num * align_deconv_row_len * deconv_h * sizeof(half));
memset(ptr_deconv, 0, num * align_deconv_row_len * deconv_h * sizeof(half));
int deconv_idx = 0;
for (int nn = 0; nn < num; ++nn) {
for (int hh = 0; hh < origin_h; ++hh) {
int hx = (hh % sub_conv_n);
half *sub_t = ptr_tmp + hx * sub_height * align_origin_w; // sub(hx,:);
int hi = (hh / sub_conv_n);
if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
// for (int ww = 0; ww < origin_w; ++ww){
// if((ww < omit_size) )// || (ww >= (origin_w-omit_size))
// continue;
int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
omit_size * channel);
fpga_copy(ptr_deconv + deconv_idx, sub_t + sidx,
sizeof(half) * deconv_row_len);
deconv_idx += align_deconv_row_len;
//}
}
}
*data_in = ptr_deconv;
fpga_free(ptr_tmp);
}
int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFPGADeConv===========";
DLOG << " filter_num:" << args.filter_num
<< " group_num:" << args.group_num
<< " sub_conv_num:" << args.sub_conv_num;
#endif
int sub_conv_num = args.sub_conv_num;
for (int i = 0; i < sub_conv_num; i++) {
//#if CPU_SIMULATE
//#else
ComputeBasicConv(args.conv_args[i]);
//#endif
}
if (sub_conv_num > 1) {
float max_scale = -1.0;
    for (int i = 0; i < sub_conv_num; i++) {
      float ptr_scale = (args.conv_args[i].output.scale_address)[0];
      if (ptr_scale > max_scale) {
        max_scale = ptr_scale;
        args.output.scale_address[0] = ptr_scale;
        args.output.scale_address[1] =
            (args.conv_args[i].output.scale_address)[1];
      }
    }
deconv_post_process((half **)(&(args.output.address)), args.sub_conv_num, 1,
args.filter_num, (args.sub_output_height),
(args.sub_output_width), args.omit_size);
}
return 0;
}
int ComputeFPGASplit(const struct SplitArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFpgaSplit===========";
...@@ -173,6 +937,5 @@ int ComputeFPGASplit(const struct SplitArgs &args) {
                     args.height, args.width);
  return 0;
}
}  // namespace fpga
}  // namespace paddle_mobile
...@@ -132,11 +132,11 @@ void format_concat_output(framework::Tensor *out, int height, int width,
}

int format_conv_data(framework::Tensor *filter_tensor,
                     framework::Tensor *ofm_tensor, float **bs_ptr, int group) {
  float max_value = fpga::filter_find_max(filter_tensor);
  fpga::format_filter(filter_tensor, max_value, group);
  int aligned_num = get_aligned_filter_num(filter_tensor);
  fpga::format_bias_scale_array(bs_ptr,
                                (int)filter_tensor->dims()[0],  // NOLINT
                                aligned_num);
  int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
......
...@@ -39,7 +39,7 @@ void format_bias_scale_array(float** bias_scale_array, int filter_num,
void format_concat_output(framework::Tensor* out, int height, int width,
                          uint32_t out_channel);
int format_conv_data(framework::Tensor* filter_tensor,
                     framework::Tensor* ofm_tensor, float** bs_ptr, int group);
int format_fc_data(framework::Tensor* filter_tensor,
                   framework::Tensor* ofm_tensor, float* bs_ptr);
void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
......
...@@ -137,11 +137,13 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
  for (i = 0; i < timeout; i++) {
    if (val == reg_readq(reg)) {
      std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg
                << std::endl;
      break;
    }
  }

  if (i < timeout) {
    return 0;
  } else {
    return -1;
...@@ -153,6 +155,12 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
  uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
  unsigned int nr = (unsigned int)_nr;
  int ret = 0;
  DLOG << size;
  DLOG << _nr;
  DLOG << nr;
  uint64_t a_size = FPGA_PAGE_SIZE * nr;
  DLOG << a_size;

  pthread_mutex_lock(&memory->mutex);
...@@ -166,6 +174,7 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
    *addr = address_ofset;
  } else {
    DLOG << "memory request failed!";
    ret = -ENOMEM;
  }
...@@ -282,7 +291,7 @@ uint64_t vaddr_to_paddr(void *address) {
  if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
    paddr = iter->second;
  } else {
    DLOG << "Invalid pointer: " << address;
  }

  return paddr;
...@@ -348,6 +357,11 @@ void fpga_free_driver(void *ptr) {
    fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
                              g_fpgainfo.memory_info->nr[pos]);
    pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);

    auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
    if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
      g_fpgainfo.fpga_vaddr2paddr_map.erase(iter);
    }
  } else {
    DLOG << "Invalid pointer";
  }
......
...@@ -17,6 +17,7 @@ limitations under the License. */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <cstring>
#include <map>
...@@ -44,7 +45,7 @@ const int PE_IDX_POOLING = 1;
const int PE_IDX_EW = 2;
const int PE_IDX_BYPASS = 3;

enum pe_status { IDLE = 0, BUSY = 1, ERROR = 2 };

struct MemoryCacheArgs {
  void *offset;
...@@ -58,7 +59,7 @@ struct MemoryCacheArgs {
struct fpga_pe {
  char type_name[MAX_TYPE_NAME_LENTH + 1];
  struct pe_data_s *outer;
  pe_status status;
  uint64_t interrupt_cnt;
};
...@@ -106,6 +107,8 @@ inline uint64_t reg_readq(uint32_t offset) {
  uint64_t value =
      *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
                             offset);                                // NOLINT
  // DLOG << "read end";
  usleep(10);
  return value;
}
...@@ -114,6 +117,8 @@ inline void reg_writeq(uint64_t value, uint32_t offset) {
  // DLOG << "offset : " << offset << ", value : " << value;
  *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr +  // NOLINT
                         offset) = value;
  // DLOG << "write end";
  usleep(10);
}

int open_device_driver();
......
...@@ -74,12 +74,21 @@ struct ConcatArgs {
  void* image_out;
  float* scale_out;
  uint32_t* channel_num;
  uint32_t* aligned_channel_num;
  uint32_t out_channel;
  uint32_t height;
  uint32_t width;
};
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
};
struct SplitArgs {
  uint32_t image_num;
  int16_t* image_in;
...@@ -91,15 +100,6 @@ struct SplitArgs {
  uint32_t width;
};
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
};
struct PoolingArgs {
  int16_t mode;  // mode: 0:max, 1:avg
  int16_t kernel_reciprocal;
...@@ -127,7 +127,14 @@ struct BypassArgs {
};

struct DeconvArgs {
  uint32_t sub_conv_num;
  uint32_t group_num;
  uint32_t filter_num;
  uint32_t omit_size;
  uint32_t sub_output_width;
  uint32_t sub_output_height;
  struct ImageOutputArgs output;
  struct ConvArgs* conv_args;
};
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
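// For example, align_to_x(37, 16) == 48: num is rounded up to the next multiple of x.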
......
...@@ -68,6 +68,13 @@ class CLImage {
    InitCLImage(context, command_queue, folder_converter);
  }

  void InitNormalCLImage(cl_context context, cl_command_queue command_queue) {
    PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr,
                          " need call SetTensorData first");
    CLImageConverterNormal *normal_converter = new CLImageConverterNormal();
    InitCLImage(context, command_queue, normal_converter);
  }

  void InitCLImage(cl_context context, cl_command_queue command_queue,
                   CLImageConverterBase *converter) {
    if (image_converter_ != nullptr) {
......
...@@ -22,7 +22,6 @@ void FeedOp<DeviceType, T>::InferShape() const {
  auto out_dims = this->param_.Out()->dims();
  out_dims[0] = this->param_.BatchSize();
  auto input_dims = this->param_.InputX()->dims();
  DLOG << input_dims.size();
  if (input_dims.size() == 4) {
    this->param_.Out()->Resize(input_dims);
  } else {
......
...@@ -60,6 +60,9 @@ REGISTER_FUSION_MATCHER(fusion_fc, ops::FusionFcMatcher);
#ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(fusion_fc, ops::FusionFcOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp);
#endif
......
...@@ -107,15 +107,9 @@ inline void GemmConv(const ConvParam<CPU> &param) {
      Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
      Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
      math::matmul<Itype>(filter_slice, false, col_matrix, false,
                          static_cast<float>(1), &out_slice,
                          static_cast<float>(0));
    }
  }
}
......
...@@ -73,8 +73,8 @@ void MulCompute(const MulParam<CPU> &param) {
  }
  if (param.InputX()->type() == typeid(int8_t)) {
    out->mutable_data<int32_t>();
    math::matmul<int8_t>(x_matrix, false, y_matrix, false,
                         static_cast<int8_t>(1), out, static_cast<int8_t>(0));
  } else {
    out->mutable_data<float>();
......
...@@ -13,7 +13,27 @@ See the License for the specific language governing permissions and
limitations under the License. */

#pragma OPENCL EXTENSION cl_khr_fp16 : enable
/*
__kernel void concatByC0(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_W) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int2 input_pos ;
input_pos.x = in_c * out_W + in_w;
input_pos.y = in_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input;
input = read_imageh(input_image, sampler,input_pos);
write_imageh(output_image, input_pos, input);
}
__kernel void concatByC(__read_only image2d_t input_image1,
                        __read_only image2d_t input_image2,
...@@ -24,13 +44,13 @@ __kernel void concatByC(__read_only image2d_t input_image1,
                        __private const int out_C_Start,
                        __private const int in_W,
                        __private const int in_H,
                        __private const int in_C1,
                        __private const int in_C2) {

  const int in_c = get_global_id(0);
  const int in_w = get_global_id(1);
  const int in_nh = get_global_id(2);

  int out_c1 = (out_C_Start + 3)/4 - 1 + in_c;
  int out_c2 = out_c1 + 1;
...@@ -45,7 +65,7 @@ __kernel void concatByC(__read_only image2d_t input_image1,
  int2 input_pos1;
  if(in_c==0){
    input_pos1.x = ((in_C1 + 3)/4 - 1) * in_W + in_w;
  }else{
    input_pos1.x = (in_c - 1) * in_W + in_w;
  }
...@@ -103,26 +123,6 @@ __kernel void concatByC(__read_only image2d_t input_image1,
  write_imageh(output_image, output_pos2, output2);
}
__kernel void concatByW0(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_W) {
const int in_c = get_global_id(0);
const int in_w = get_global_id(1);
const int in_nh = get_global_id(2);
int2 input_pos = in_c * out_W + in_w;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
half4 input;
input = read_imageh(input_image, sampler,input_pos);
write_imageh(output_image, input_pos, input);
}
*/
__kernel void concatByH(__read_only image2d_t input_image,
                        __write_only image2d_t output_image,
......
...@@ -692,6 +692,238 @@ __kernel void conv_1x1_4(__private const int global_size_dim0,
*/
__kernel void conv_7x7(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
half4 output = 0.0f;
#endif
half4 input;
half4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for(int j = 0; j < 7; j++){
for(int k = 0; k < 7; k++){
input = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 7 + filter_w;
filter_pos0.y = filter_n0 * 7 + filter_h;
filter_pos1.x = filter_c * 7 + filter_w;
filter_pos1.y = filter_n1 * 7 + filter_h;
filter_pos2.x = filter_c * 7 + filter_w;
filter_pos2.y = filter_n2 * 7 + filter_h;
filter_pos3.x = filter_c * 7 + filter_w;
filter_pos3.y = filter_n3 * 7 + filter_h;
filter[0] = read_imageh(filter_image, sampler, filter_pos0);
filter[1] = read_imageh(filter_image, sampler, filter_pos1);
filter[2] = read_imageh(filter_image, sampler, filter_pos2);
filter[3] = read_imageh(filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
#endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
}
__kernel void conv_5x5(__private const int global_size_dim0,
__private const int global_size_dim1,
__private const int global_size_dim2,
__read_only image2d_t input_image,
__read_only image2d_t filter_image,
#ifdef BIASE
__read_only image2d_t bias,
#endif
#ifdef BATCH_NORM
__read_only image2d_t new_scale,
__read_only image2d_t new_biase,
#endif
__write_only image2d_t output_image,
__private const int stride,
__private const int offset,
__private const int input_c,
__private const int dilation,
__private const int input_width,/* of one block */
__private const int input_height,/* of one block */
__private const int output_width,
__private const int output_height) {
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
if (out_c >= global_size_dim0 ||
out_w >= global_size_dim1 ||
out_nh >= global_size_dim2) {
return;
}
const int filter_n0 = 4 * out_c + 0;
const int filter_n1 = 4 * out_c + 1;
const int filter_n2 = 4 * out_c + 2;
const int filter_n3 = 4 * out_c + 3;
int2 stride_xy;
stride_xy.x = stride;
stride_xy.y = stride;
int2 ouput_pos_in_one_block;
ouput_pos_in_one_block.x = out_w;
ouput_pos_in_one_block.y = out_nh;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
int2 in_pos_in_one_block;
in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset;
in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset;
#ifdef BIASE
half4 output = read_imageh(bias, sampler, (int2)(out_c, 0));
#else
half4 output = 0.0f;
#endif
half4 input;
half4 filter[4];
int2 filter_pos0;
int2 filter_pos1;
int2 filter_pos2;
int2 filter_pos3;
for (int i = 0; i < input_c; ++i) {
int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y);
for(int j = 0; j < 5; j++){
for(int k = 0; k < 5; k++){
input = select(read_imageh(input_image, sampler,
(int2)(pos_in.x + (j - 2) * dilation, pos_in.y + (k - 2) * dilation)),
(half4)(0.0f),
(ushort4)((in_pos_in_one_block.x + (j - 2) * dilation < 0 || in_pos_in_one_block.y + (k - 2) * dilation < 0 || in_pos_in_one_block.x + (j - 2) * dilation >= input_width || in_pos_in_one_block.y + (k - 2) * dilation >= input_height) << 15));
int filter_h = k;
int filter_w = j;
int filter_c = i;
filter_pos0.x = filter_c * 5 + filter_w;
filter_pos0.y = filter_n0 * 5 + filter_h;
filter_pos1.x = filter_c * 5 + filter_w;
filter_pos1.y = filter_n1 * 5 + filter_h;
filter_pos2.x = filter_c * 5 + filter_w;
filter_pos2.y = filter_n2 * 5 + filter_h;
filter_pos3.x = filter_c * 5 + filter_w;
filter_pos3.y = filter_n3 * 5 + filter_h;
filter[0] = read_imageh(filter_image, sampler, filter_pos0);
filter[1] = read_imageh(filter_image, sampler, filter_pos1);
filter[2] = read_imageh(filter_image, sampler, filter_pos2);
filter[3] = read_imageh(filter_image, sampler, filter_pos3);
output.x += dot(input, filter[0]);
output.y += dot(input, filter[1]);
output.z += dot(input, filter[2]);
output.w += dot(input, filter[3]);
}
}
}
#ifdef BATCH_NORM
output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0));
#endif
#ifdef RELU
output = activation(output);
#endif
write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output);
}
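The conv_7x7 and conv_5x5 kernels above pack four output channels into one half4 work-item and address the filter image by (input channel, kernel position). As a reading aid, here is a small standalone C++ sketch of that coordinate arithmetic; the layout assumptions (filter image width = in_C * K, input image width = ceil(C/4) * W) are inferred from the kernel code, and the helper names are illustrative rather than part of the repository.

// Standalone sketch of the image-coordinate math used by conv_5x5 above.
// Assumptions: filters stored as an image of width in_C*K and height out_C*K,
// inputs as width ceil(C/4)*W and height N*H. Purely illustrative.
#include <cstdio>

struct Coord { int x, y; };

// Filter texel for input channel c, output channel n, kernel element (kh, kw).
Coord FilterCoord(int c, int n, int kh, int kw, int K) {
  return {c * K + kw, n * K + kh};
}

// Input texel for channel block i (4 channels per texel), spatial (w, h), block width W.
Coord InputCoord(int i, int w, int h, int W) {
  return {i * W + w, h};
}

int main() {
  const int K = 5;
  Coord f = FilterCoord(/*c=*/3, /*n=*/4 * 2 + 1, /*kh=*/2, /*kw=*/4, K);
  Coord in = InputCoord(/*i=*/3, /*w=*/10, /*h=*/7, /*W=*/32);
  std::printf("filter texel (%d, %d), input texel (%d, %d)\n", f.x, f.y, in.x, in.y);
  return 0;
}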
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#pragma OPENCL EXTENSION cl_khr_fp16 : enable
__kernel void lrn(__read_only image2d_t input_image,
__write_only image2d_t output_image,
__private const int out_C,
__private const int out_W,
__private const int n,
__private const float k,
__private const float alpha,
__private const float beta){
const int out_c = get_global_id(0);
const int out_w = get_global_id(1);
const int out_nh = get_global_id(2);
const int out_c0 = out_c * 4;
const int out_c1 = out_c * 4 + 1;
    const int out_c2 = out_c * 4 + 2;
    const int out_c3 = out_c * 4 + 3;
const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
CLK_ADDRESS_CLAMP |
CLK_FILTER_NEAREST;
const int start = -(n-1)/2;
    const int end = start + n;
float sqr_sum0 = 0.0f;
float sqr_sum1 = 0.0f;
float sqr_sum2 = 0.0f;
float sqr_sum3 = 0.0f;
int input_c0,input_c1,input_c2,input_c3;
int2 input_pos0,input_pos1,input_pos2,input_pos3;
float4 input0,input1,input2,input3;
for(int i = start; i < end ;i++){
if(out_c0 + i>=0&&out_c0 + i<out_C){
input_c0 = (out_c0 + i)/4;
input_pos0.x = input_c0 * out_W + out_w;
input_pos0.y = out_nh;
input0 = convert_float4(read_imageh(input_image, sampler,input_pos0));
if((out_c0 + i)%4 == 0){
sqr_sum0 += input0.x * input0.x;
}else if((out_c0 + i)%4 == 1){
sqr_sum0 += input0.y * input0.y;
}else if((out_c0 + i)%4 == 2){
sqr_sum0 += input0.z * input0.z;
}else{
sqr_sum0 += input0.w * input0.w;
}
}
if(out_c1 + i>=0&&out_c1 + i<out_C){
input_c1 = (out_c1 + i)/4;
input_pos1.x = input_c1 * out_W + out_w;
input_pos1.y = out_nh;
input1 = convert_float4(read_imageh(input_image, sampler,input_pos1));
if((out_c1 + i)%4 == 0){
sqr_sum1 += input1.x * input1.x;
}else if((out_c1 + i)%4 == 1){
sqr_sum1 += input1.y * input1.y;
}else if((out_c1 + i)%4 == 2){
sqr_sum1 += input1.z * input1.z;
}else{
sqr_sum1 += input1.w * input1.w;
}
}
if(out_c2 + i>=0&&out_c2 + i<out_C){
input_c2 = (out_c2 + i)/4;
input_pos2.x = input_c2 * out_W + out_w;
input_pos2.y = out_nh;
input2 = convert_float4(read_imageh(input_image, sampler,input_pos2));
if((out_c2 + i)%4 == 0){
sqr_sum2 += input2.x * input2.x;
}else if((out_c2 + i)%4 == 1){
sqr_sum2 += input2.y * input2.y;
}else if((out_c2 + i)%4 == 2){
sqr_sum2 += input2.z * input2.z;
}else{
sqr_sum2 += input2.w * input2.w;
}
}
if(out_c3 + i>=0&&out_c3 + i<out_C){
input_c3 = (out_c3 + i)/4;
input_pos3.x = input_c3 * out_W + out_w;
input_pos3.y = out_nh;
input3 = convert_float4(read_imageh(input_image, sampler,input_pos3));
if((out_c3 + i)%4 == 0){
sqr_sum3 += input3.x * input3.x;
}else if((out_c3 + i)%4 == 1){
sqr_sum3 += input3.y * input3.y;
}else if((out_c3 + i)%4 == 2){
sqr_sum3 += input3.z * input3.z;
}else{
sqr_sum3 += input3.w * input3.w;
}
}
}
float4 output = (float4)0.0f;
float4 input;
int2 output_pos;
output_pos.x = out_c * out_W + out_w;
output_pos.y = out_nh;
input = convert_float4(read_imageh(input_image, sampler,output_pos));
output.x = input.x / (pow(k + alpha * (sqr_sum0),beta));
if(out_C - 4 * out_c>=2){
output.y = input.y / (pow(k + alpha * (sqr_sum1),beta));
}
if(out_C - 4 * out_c>=3){
output.z = input.z / (pow(k + alpha * (sqr_sum2),beta));
}
if(out_C - 4 * out_c>=4){
output.w = input.w / (pow(k + alpha * (sqr_sum3),beta));
}
half4 tmp = convert_half4(output);
write_imageh(output_image, output_pos, tmp);
}
\ No newline at end of file
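For reference, the lrn kernel above evaluates across-channel LRN: out[c] = in[c] / (k + alpha * sum of squared inputs over an n-wide channel window)^beta, with the window starting at -(n-1)/2. The following is a minimal scalar C++ sketch of the same formula for a single pixel position; it is illustrative only and independent of the CL image layout.

// Minimal scalar sketch of across-channel LRN for one pixel position.
// Matches the math in the lrn kernel above; not part of the diff.
#include <cmath>
#include <cstdio>
#include <vector>

std::vector<float> LrnChannels(const std::vector<float>& in, int n,
                               float k, float alpha, float beta) {
  const int C = static_cast<int>(in.size());
  std::vector<float> out(C);
  const int start = -(n - 1) / 2;
  for (int c = 0; c < C; ++c) {
    float sqr_sum = 0.f;
    for (int i = start; i < start + n; ++i) {
      const int cc = c + i;
      if (cc >= 0 && cc < C) sqr_sum += in[cc] * in[cc];
    }
    out[c] = in[c] / std::pow(k + alpha * sqr_sum, beta);
  }
  return out;
}

int main() {
  auto out = LrnChannels({1.f, 2.f, 3.f, 4.f}, /*n=*/5, /*k=*/2.f,
                         /*alpha=*/1e-4f, /*beta=*/0.75f);
  for (float v : out) std::printf("%f ", v);
  std::printf("\n");
  return 0;
}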
...@@ -31,11 +31,13 @@ __kernel void pool_max( ...@@ -31,11 +31,13 @@ __kernel void pool_max(
const sampler_t sampler = const sampler_t sampler =
CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST; CLK_NORMALIZED_COORDS_TRUE | CLK_ADDRESS_CLAMP | CLK_FILTER_NEAREST;
int start_h = max(out_h * stride_h - pad_top, 0); int start_h = out_h * stride_h - pad_top;
int end_h = min(start_h + ksize_h, in_height); int end_h = min(start_h + ksize_h, in_height);
start_h = max(start_h,0);
int start_w = max(out_w * stride_w - pad_left, 0); int start_w = out_w * stride_w - pad_left;
int end_w = min(start_w + ksize_w, in_width); int end_w = min(start_w + ksize_w, in_width);
start_w = max(start_w,0);
const int pos_in_x = out_c * in_width; const int pos_in_x = out_c * in_width;
const int pos_in_y = out_n * in_height; const int pos_in_y = out_n * in_height;
......
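The pool_max hunk above reorders the clamping: the window end must be derived from the raw (possibly negative) start before the start itself is clamped to zero, otherwise a padded window near the border is silently widened. A small sketch of the corrected bound computation, under the same stride/pad/kernel-size parameters:

// Sketch of the corrected pooling-window bounds from the pool_max hunk above.
// end is derived from the raw (possibly negative) start, then start is clamped.
#include <algorithm>
#include <cstdio>

void PoolWindow(int out_idx, int stride, int pad, int ksize, int in_size,
                int* start, int* end) {
  *start = out_idx * stride - pad;           // may be negative near the border
  *end = std::min(*start + ksize, in_size);  // window end before clamping start
  *start = std::max(*start, 0);              // clamp start last
}

int main() {
  int s, e;
  PoolWindow(/*out_idx=*/0, /*stride=*/2, /*pad=*/1, /*ksize=*/3, /*in_size=*/8, &s, &e);
  std::printf("window [%d, %d)\n", s, e);  // [0, 2), not [0, 3)
  return 0;
}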
...@@ -23,12 +23,17 @@ template <> ...@@ -23,12 +23,17 @@ template <>
bool ConcatKernel<GPU_CL, float>::Init(ConcatParam<GPU_CL> *param) { bool ConcatKernel<GPU_CL, float>::Init(ConcatParam<GPU_CL> *param) {
if (param->Out()->dims().size() < 4) { if (param->Out()->dims().size() < 4) {
this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl"); this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl");
} else if (param->Out()->dims().size() == 4) {
this->cl_helper_.AddKernel("concatByC0", "concat_kernel.cl");
this->cl_helper_.AddKernel("concatByC", "concat_kernel.cl");
} }
return true; return true;
} }
template <> template <>
void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) { void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
DLOG << "yangfei50";
DLOG << param.Out()->dims();
if (param.Out()->dims().size() < 4) { if (param.Out()->dims().size() < 4) {
auto kernel = this->cl_helper_.KernelAt(0); auto kernel = this->cl_helper_.KernelAt(0);
auto inputs = param.Inputs(); auto inputs = param.Inputs();
...@@ -62,6 +67,76 @@ void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) { ...@@ -62,6 +67,76 @@ void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
out_H_Start += inputs[i]->dims()[0]; out_H_Start += inputs[i]->dims()[0];
} }
} }
} else {
auto kernel0 = this->cl_helper_.KernelAt(0);
auto kernel1 = this->cl_helper_.KernelAt(1);
auto inputs = param.Inputs();
auto *output_image = param.Out()->GetCLImage();
int out_C_Start = 0;
auto input_image = inputs[0]->GetCLImage();
auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[0]);
int out_W = param.Out()->dims()[3];
cl_int status;
status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel0, 2, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel0, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
out_C_Start += inputs[0]->dims()[1];
for (int i = 1; i < inputs.size(); i++) {
auto input_image1 = inputs[i - 1]->GetCLImage();
auto input_image2 = inputs[i]->GetCLImage();
default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]);
int out_C = param.Out()->dims()[1];
int out_H = param.Out()->dims()[2];
int in_W = inputs[i]->dims()[3];
int in_H = inputs[i]->dims()[2];
int in_C1 = inputs[i - 1]->dims()[1];
int in_C2 = inputs[i]->dims()[1];
DLOG << "第" << i << "个";
DLOG << "out_C=" << out_C;
DLOG << "out_H=" << out_H;
DLOG << "in_W=" << in_W;
DLOG << "in_H=" << in_H;
DLOG << "in_C1=" << in_C1;
DLOG << "in_C2=" << in_C2;
DLOG << "out_C_Start = " << out_C_Start;
status = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &input_image1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &input_image2);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 3, sizeof(int), &out_C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 4, sizeof(int), &out_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 5, sizeof(int), &out_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 6, sizeof(int), &out_C_Start);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 7, sizeof(int), &in_W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 8, sizeof(int), &in_H);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 9, sizeof(int), &in_C1);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel1, 10, sizeof(int), &in_C2);
CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel1, default_work_size.size(),
NULL, default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
out_C_Start += inputs[i]->dims()[1];
}
} }
} }
......
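The new concatByC0/concatByC path concatenates along the channel axis while the data stays packed four channels per texel, with out_C_Start carrying the running channel offset. The plain-NCHW reference below sketches the same operation on host buffers; the struct and function names are illustrative and not part of paddle-mobile.

// Reference channel-axis concat on plain NCHW float buffers (illustrative only).
#include <cstdio>
#include <vector>

struct TensorNCHW {
  int n, c, h, w;
  std::vector<float> data;  // size n*c*h*w
};

TensorNCHW ConcatChannels(const std::vector<TensorNCHW>& inputs) {
  int out_c = 0;
  for (const auto& t : inputs) out_c += t.c;
  TensorNCHW out{inputs[0].n, out_c, inputs[0].h, inputs[0].w, {}};
  out.data.resize(static_cast<size_t>(out.n) * out_c * out.h * out.w);
  int c_start = 0;  // running channel offset, like out_C_Start above
  for (const auto& t : inputs) {
    for (int n = 0; n < t.n; ++n)
      for (int c = 0; c < t.c; ++c)
        for (int hw = 0; hw < t.h * t.w; ++hw)
          out.data[(n * out_c + c_start + c) * t.h * t.w + hw] =
              t.data[(n * t.c + c) * t.h * t.w + hw];
    c_start += t.c;
  }
  return out;
}

int main() {
  TensorNCHW a{1, 1, 2, 2, {1, 2, 3, 4}};
  TensorNCHW b{1, 1, 2, 2, {5, 6, 7, 8}};
  auto out = ConcatChannels({a, b});
  std::printf("out channels = %d, first = %f, last = %f\n",
              out.c, out.data.front(), out.data.back());
  return 0;
}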
...@@ -51,8 +51,16 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) { ...@@ -51,8 +51,16 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl"); this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl");
} else { } else if (param->Filter()->dims()[2] == 7 &&
PADDLE_MOBILE_THROW_EXCEPTION(" not support "); param->Filter()->dims()[3] == 7) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_7x7", "conv_add_kernel.cl");
} else if (param->Filter()->dims()[2] == 5 &&
param->Filter()->dims()[3] == 5) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_5x5", "conv_add_kernel.cl");
} }
return true; return true;
......
...@@ -52,6 +52,16 @@ bool ConvAddReluKernel<GPU_CL, float>::Init( ...@@ -52,6 +52,16 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl"); this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl");
} else if (param->Filter()->dims()[2] == 7 &&
param->Filter()->dims()[3] == 7) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_7x7", "conv_add_relu_kernel.cl");
} else if (param->Filter()->dims()[2] == 5 &&
param->Filter()->dims()[3] == 5) {
param->Filter()->InitCLImage(cl_helper_.CLContext(),
cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("conv_5x5", "conv_add_relu_kernel.cl");
} else { } else {
PADDLE_MOBILE_THROW_EXCEPTION(" not support "); PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
} }
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef FUSION_FC_OP
#include "operators/kernel/fusion_fc_kernel.h"
#include "operators/math/math_function.h"
namespace paddle_mobile {
namespace operators {
template <>
bool FusionFcKernel<GPU_CL, float>::Init(FusionFcParam<GPU_CL> *param) {
param->InputY()->InitNormalCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
param->InputZ()->InitNormalCLImage(cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue());
this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
return true;
}
template <typename P>
void FusionFcCompute(const FusionFcParam<GPU_CL> &param, cl_context context,
cl_command_queue commandQueue, cl_kernel kernel0,
cl_kernel kernel1) {
auto *input_x_image = param.InputX();
auto *input_y_image = param.InputY();
auto *input_z_image = param.InputZ();
int axis = param.Axis();
auto *out_image = param.Out();
Tensor *input_x = new Tensor();
input_x->Resize(input_x_image->dims());
input_x->mutable_data<float>();
framework::CLImageToTensor(input_x_image, input_x, context, commandQueue,
kernel0);
Tensor *input_y = new Tensor();
input_y->Resize(input_y_image->dims());
input_y->mutable_data<float>();
framework::CLImageToTensor(input_y_image, input_y, context, commandQueue,
kernel0);
Tensor *input_z = new Tensor();
input_z->Resize(input_z_image->dims());
input_z->mutable_data<float>();
framework::CLImageToTensor(input_z_image, input_z, context, commandQueue,
kernel0);
auto *input_z_data = input_z->data<float>();
DLOG << *input_x;
DLOG << *input_y;
DLOG << *input_z;
Tensor *out = new Tensor();
out->Resize(out_image->dims());
out->mutable_data<float>();
auto *out_data = out->mutable_data<float>();
const Tensor x_matrix =
input_x->dims().size() > 2
? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
: *input_x;
const Tensor y_matrix =
input_y->dims().size() > 2
? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
: *input_y;
auto out_dim = out->dims();
if (out_dim.size() != 2) {
out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
}
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "input_z size must be 1");
  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
                        " out_dim[1] must equal input_z dims[0]. ");
axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1. ");
int64_t classes = input_z->numel();
for (int i = 0; i < out_dim[0]; i++) {
memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
}
// for (int i = 0; i < out->numel(); i++) {
// DLOG << out_data[i];
// }
  // bias_data has the same dimensions as out
math::matmul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1),
out, static_cast<float>(1), false);
out_image->InitEmptyImage(context, commandQueue, out->dims());
framework::TensorToCLImage(out, out_image, context, commandQueue, kernel1);
DLOG << *out;
delete (input_x);
delete (input_y);
delete (input_z);
delete (out);
PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
// if (out_dim.size() != 2) {
// out->Resize(out_dim);
// }
}
template <>
void FusionFcKernel<GPU_CL, float>::Compute(
const FusionFcParam<GPU_CL> &param) {
auto kernel0 = this->cl_helper_.KernelAt(0);
auto kernel1 = this->cl_helper_.KernelAt(1);
FusionFcCompute<float>(param, this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue(), kernel0, kernel1);
}
} // namespace operators
} // namespace paddle_mobile
#endif
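FusionFcCompute above copies the CL images back to host tensors, reshapes X and Y to matrices, broadcasts the 1-D bias Z into every output row, and then runs matmul with beta = 1, so the result is X*Y plus the bias. A minimal scalar sketch of that math (row-major buffers; names are illustrative and independent of the paddle-mobile types):

// Minimal sketch of the fused FC math: out = x * y + broadcast(bias).
// Row-major matrices; purely illustrative.
#include <cstdio>
#include <vector>

std::vector<float> FusedFc(const std::vector<float>& x, int m, int k,
                           const std::vector<float>& y, int n,
                           const std::vector<float>& bias) {
  std::vector<float> out(static_cast<size_t>(m) * n);
  for (int i = 0; i < m; ++i) {
    for (int j = 0; j < n; ++j) {
      float acc = bias[j];  // bias is copied into every row, as in the kernel above
      for (int p = 0; p < k; ++p) acc += x[i * k + p] * y[p * n + j];
      out[i * n + j] = acc;
    }
  }
  return out;
}

int main() {
  auto out = FusedFc({1, 2, 3, 4}, 2, 2, {1, 0, 0, 1}, 2, {0.5f, -0.5f});
  std::printf("%f %f %f %f\n", out[0], out[1], out[2], out[3]);
  return 0;
}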
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#ifdef LRN_OP
#include "operators/kernel/lrn_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool LrnKernel<GPU_CL, float>::Init(LrnParam<GPU_CL> *param) {
this->cl_helper_.AddKernel("lrn", "lrn_kernel.cl");
return true;
}
template <>
void LrnKernel<GPU_CL, float>::Compute(const LrnParam<GPU_CL> &param) {
auto kernel = this->cl_helper_.KernelAt(0);
auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
auto input_image = param.InputX()->GetCLImage();
auto x_dims = param.InputX()->dims();
auto output_image = param.Out()->GetCLImage();
const int N = x_dims[0];
const int C = x_dims[1];
const int H = x_dims[2];
const int W = x_dims[3];
const int n = param.N();
const float alpha = param.Alpha();
const float beta = param.Beta();
const float k = param.K();
DLOG << "n=" << n;
DLOG << "alpha=" << alpha;
DLOG << "beta=" << beta;
DLOG << "k=" << k;
DLOG << default_work_size;
DLOG << C;
DLOG << W;
cl_int status;
status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 2, sizeof(int), &C);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 3, sizeof(int), &W);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 4, sizeof(int), &n);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 5, sizeof(float), &k);
CL_CHECK_ERRORS(status);
status = clSetKernelArg(kernel, 6, sizeof(float), &alpha);
CL_CHECK_ERRORS(status);
  status = clSetKernelArg(kernel, 7, sizeof(float), &beta);
  CL_CHECK_ERRORS(status);
status = clEnqueueNDRangeKernel(
this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
default_work_size.data(), NULL, 0, NULL, NULL);
CL_CHECK_ERRORS(status);
}
} // namespace operators
} // namespace paddle_mobile
#endif
...@@ -58,7 +58,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) { ...@@ -58,7 +58,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
param->SetNewScale(new_scale); param->SetNewScale(new_scale);
param->SetNewBias(new_bias); param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
......
...@@ -56,7 +56,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init( ...@@ -56,7 +56,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
param->SetNewScale(new_scale); param->SetNewScale(new_scale);
param->SetNewBias(new_bias); param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
......
...@@ -38,7 +38,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) { ...@@ -38,7 +38,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
bs_ptr[i] = bias_ptr[i]; bs_ptr[i] = bias_ptr[i];
} }
fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
......
...@@ -38,7 +38,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) { ...@@ -38,7 +38,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
bs_ptr[i] = bias_ptr[i]; bs_ptr[i] = bias_ptr[i];
} }
fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
......
...@@ -50,7 +50,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) { ...@@ -50,7 +50,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
param->SetNewScale(new_scale); param->SetNewScale(new_scale);
param->SetNewBias(new_bias); param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
......
...@@ -15,6 +15,7 @@ limitations under the License. */ ...@@ -15,6 +15,7 @@ limitations under the License. */
#ifdef FUSION_CONVBNRELU_OP #ifdef FUSION_CONVBNRELU_OP
#include "operators/kernel/conv_bn_relu_kernel.h" #include "operators/kernel/conv_bn_relu_kernel.h"
#include "fpga/V2/filter.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -50,7 +51,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) { ...@@ -50,7 +51,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
param->SetNewScale(new_scale); param->SetNewScale(new_scale);
param->SetNewBias(new_bias); param->SetNewBias(new_bias);
fpga::format_conv_data(filter, out, bs_ptr, param->Groups()); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
fpga::SplitConvArgs conv_arg = {0}; fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled, fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
......
...@@ -14,7 +14,7 @@ limitations under the License. */ ...@@ -14,7 +14,7 @@ limitations under the License. */
#ifdef LRN_OP #ifdef LRN_OP
#include "lrn_op.h" #include "operators/lrn_op.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators; ...@@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators;
#ifdef PADDLE_MOBILE_CPU #ifdef PADDLE_MOBILE_CPU
REGISTER_OPERATOR_CPU(lrn, ops::LrnOp); REGISTER_OPERATOR_CPU(lrn, ops::LrnOp);
#endif #endif
#ifdef PADDLE_MOBILE_CL
REGISTER_OPERATOR_CL(lrn, ops::LrnOp);
#endif
#ifdef PADDLE_MOBILE_MALI_GPU #ifdef PADDLE_MOBILE_MALI_GPU
REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp); REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp);
#endif #endif
......
...@@ -23,12 +23,10 @@ limitations under the License. */ ...@@ -23,12 +23,10 @@ limitations under the License. */
#if __aarch64__ #if __aarch64__
#define MR_INT8 4 #define MR_INT8 4
#define NR_INT8 2
#define MR 6 #define MR 6
#define NR 16 #define NR 16
#else #else
#define MR_INT8 4 #define MR_INT8 4
#define NR_INT8 2
#define MR 6 #define MR 6
#define NR 8 #define NR 8
#endif #endif
...@@ -195,58 +193,52 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -195,58 +193,52 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 8 bits int small block inner product // 8 bits int small block inner product
void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc); int32_t ldc);
void AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc);
void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc); int32_t ldc);
// 8 bits int inner product // 8 bits int inner product
void InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a, void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
const int8_t *b, float beta, int32_t *c, int32_t *C, const int8_t *a, const int8_t *b, int8_t beta,
int32_t ldc, bool relu); int32_t *c, int32_t *C, int32_t ldc, bool relu,
void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a, int8_t *bias);
const int8_t *b, float beta, int32_t *c, int8_t *C,
int32_t ldc, bool relu, int32_t *bias);
// 8 bits int pack function // 8 bits int pack function
void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer); int32_t lda, int8_t *buffer);
void PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer);
void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer); int32_t lda, int8_t *buffer);
void PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer);
void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer); int32_t ldb, int8_t *buffer);
void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer); int32_t lda, int8_t *buffer);
void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer); int32_t ldb, int8_t *buffer);
void PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer);
void PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer);
// 8 bits int matrix product // 8 bits int matrix product
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta, int32_t *C, int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C,
int32_t ldc, bool relu, int32_t *bias); int32_t ldc, bool relu, int8_t *bias);
void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, void Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta, int8_t *C, int32_t lda, const int8_t *B, int32_t ldb, int8_t beta,
int32_t ldc, bool relu, int32_t *bias); int32_t *C, int32_t ldc, bool relu, int8_t *bias);
void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
int32_t *C, int32_t ldc, bool relu, int32_t *bias);
// 8 bits int write back // 8 bits int write back
// C = alpha * A * B + beta * C
void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B // C = A * B
void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc); void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc);
// C = A * B + bias, scale * relu(C) // C = A * B + C
void WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, void WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int32_t *bias, float scale); int32_t ldc);
// C = A * B + bias, scale * C // C = A * B + bias
void WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, void WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int32_t *bias, float scale); int32_t ldc, int8_t *bias);
// C = A * B + C, relu(C)
void WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc);
// C = A * B + bias, relu(C)
void WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int8_t *bias);
private: private:
int MC = 0; int MC = 0;
...@@ -262,7 +254,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, ...@@ -262,7 +254,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb,
// 8 bits int // 8 bits int
int8_t *packedA_int8; int8_t *packedA_int8;
int8_t *packedB_int8; int8_t *packedB_int8;
int32_t *packedC_int32; int32_t *packedC_int8;
int8_t *zero_int8; int8_t *zero_int8;
}; };
......
...@@ -18,8 +18,6 @@ limitations under the License. */ ...@@ -18,8 +18,6 @@ limitations under the License. */
#include "operators/math/gemm.h" #include "operators/math/gemm.h"
#if __ARM_NEON #if __ARM_NEON
#include <arm_neon.h> #include <arm_neon.h>
#include <iostream>
#endif #endif
#ifdef _OPENMP #ifdef _OPENMP
#include <omp.h> #include <omp.h>
...@@ -64,7 +62,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, ...@@ -64,7 +62,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
"pld [%[b_ptr], #128] \n\t" "pld [%[b_ptr], #128] \n\t"
"vld1.s8 {d0-d3}, [%[a_ptr]]! \n\t" // load A 8 cols "vld1.s8 {d0-d3}, [%[a_ptr]]! \n\t" // load A 8 cols
"vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t" // load B first 4 rows "vld1.s8 {d8-d11}, [%[b_ptr]]! \n\t" // load B first 4 rows
"vmovl.s8 q2, d0 \n\t" // process B first "vmovl.s8 q2, d0 \n\t" // process B first 4
// rows // rows
"vmovl.s8 q3, d8 \n\t" "vmovl.s8 q3, d8 \n\t"
"vmlal.s16 q8, d6, d4[0]\n\t" "vmlal.s16 q8, d6, d4[0]\n\t"
...@@ -243,132 +241,6 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, ...@@ -243,132 +241,6 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
#endif // __ARM_NEON #endif // __ARM_NEON
} }
void Gemm::AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) {
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
#else
#define PADDLE_LABEL_LOOP "1"
#define PADDLE_LABEL_AFTER_LOOP "2"
asm volatile(
"lsl %[ldc], %[ldc], #2 \n\t" // sizeof(int32) == 4
"vldr d0, [%[b], #0] \n\t"
"vmov.s32 q8, #0 \n\t"
"vldr d4, [%[a], #0] \n\t"
"vmov.s32 q9, q8 \n\t"
"vldr d2, [%[b], #16] \n\t"
"vmov.s32 q10, q8 \n\t"
"vldr d6, [%[a], #16] \n\t"
"vmov.s32 q11, q8 \n\t"
"vldr d1, [%[b], #8]\n\t"
"vmov.s32 q12, q8 \n\t"
"vldr d5, [%[a], #8]\n"
"vmov.s32 q13, q8 \n\t"
"vldr d3, [%[b], #24]\n\t"
"vmov.s32 q14, q8 \n\t"
"vldr d7, [%[a], #24]\n"
"vmov.s32 q15, q8 \n\t"
PADDLE_LABEL_LOOP
": \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"add %[b], %[b], #32 \n\t"
"vmull.s8 q5, d2, d4 \n\t"
"vldr d4, [%[a], #32] \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vldr d6, [%[a], #48] \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vmlal.s8 q5, d3, d5 \n\t"
"vldr d5, [%[a], #40] \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vldr d7, [%[a], #56] \n\t"
"vpadal.s16 q8, q4 \n\t" // pairwise-add
"add %[a], %[a], #64 \n\t"
"vpadal.s16 q9, q5 \n\t"
"subs %[k], %[k], #16 \n\t"
"vpadal.s16 q10, q6 \n\t"
"vpadal.s16 q11, q7 \n\t"
"beq " PADDLE_LABEL_AFTER_LOOP
"f \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"vmull.s8 q5, d2, d4 \n\t"
"vldr d4, [%[a], #0] \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vldr d0, [%[b], #0] \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vldr d2, [%[b], #16] \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vldr d6, [%[a], #16] \n\t"
"vmlal.s8 q5, d3, d5 \n\t"
"vldr d5, [%[a], #8] \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vldr d1, [%[b], #8] \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vldr d3, [%[b], #24] \n\t"
"vpadal.s16 q12, q4 \n\t" // pairwise-add
"vldr d7, [%[a], #24] \n\t"
"vpadal.s16 q13, q5 \n\t"
"vpadal.s16 q14, q6 \n\t"
"vpadal.s16 q15, q7 \n\t"
"b " PADDLE_LABEL_LOOP "b \n\t"
PADDLE_LABEL_AFTER_LOOP
": \n\t"
"vmull.s8 q4, d0, d4 \n\t" // first half
"vmull.s8 q5, d2, d4 \n\t"
"vmull.s8 q6, d0, d6 \n\t"
"vmull.s8 q7, d2, d6 \n\t"
"vmlal.s8 q4, d1, d5 \n\t" // second half
"vmlal.s8 q5, d3, d5 \n\t"
"vmlal.s8 q6, d1, d7 \n\t"
"vmlal.s8 q7, d3, d7 \n\t"
"vpadal.s16 q12, q4 \n\t" // pairwise-add
"vpadal.s16 q13, q5 \n\t"
"vpadal.s16 q14, q6 \n\t"
"vpadal.s16 q15, q7 \n\t"
"vpadd.s32 d0, d16, d17 \n\t" // reduce to int32
"vpadd.s32 d1, d18, d19 \n\t"
"vpadd.s32 d2, d20, d21 \n\t"
"vpadd.s32 d3, d22, d23 \n\t"
"vpadd.s32 d4, d24, d25 \n\t"
"vpadd.s32 d5, d26, d27 \n\t"
"vpadd.s32 d6, d28, d29 \n\t"
"vpadd.s32 d7, d30, d31 \n\t"
"vpadd.s32 d8, d0, d1 \n\t" // reduce to int32 again
"vpadd.s32 d9, d2, d3 \n\t"
"vpadd.s32 d10, d4, d5 \n\t"
"vpadd.s32 d11, d6, d7 \n\t"
"vst1.32 {d8}, [%[c]], %[ldc] \n\t"
"vst1.32 {d9}, [%[c]], %[ldc] \n\t"
"vst1.32 {d10}, [%[c]], %[ldc] \n\t"
"vst1.32 {d11}, [%[c]] \n\t"
: [k] "+r"(k), [a] "+r"(a), [b] "+r"(b), [c] "+r"(c)
: [ldc] "r"(ldc)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8",
"q9", "q10", "q11", "q12", "q13", "q14", "q15");
#undef PADDLE_LABEL_AFTER_LOOP
#undef PADDLE_LABEL_LOOP
#endif // __aarch64__
#endif // __ARM_NEON
}
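The AddDot4x2 micro-kernel removed above computes a 4x2 tile of C as dot products of packed A rows against packed B columns, accumulating int8 products into int32 through vmull/vmlal and pairwise adds. The scalar reference below shows the same arithmetic with a simplified panel layout (the real packing interleaves 16-deep chunks); it is a sketch, not the NEON implementation.

// Scalar reference for the 4x2 int8 micro-kernel above, with A laid out as
// 4 contiguous rows of length k and B as 2 contiguous columns of length k.
#include <cstdint>
#include <cstdio>

void AddDot4x2Ref(int32_t k, const int8_t* a, const int8_t* b, int32_t* c,
                  int32_t ldc) {
  for (int i = 0; i < 4; ++i) {
    for (int j = 0; j < 2; ++j) {
      int32_t acc = 0;
      for (int32_t p = 0; p < k; ++p) acc += a[i * k + p] * b[j * k + p];
      c[i * ldc + j] = acc;
    }
  }
}

int main() {
  const int32_t k = 16;
  int8_t a[4 * 16], b[2 * 16];
  for (int i = 0; i < 4 * 16; ++i) a[i] = static_cast<int8_t>(i % 5 - 2);
  for (int i = 0; i < 2 * 16; ++i) b[i] = static_cast<int8_t>(i % 3 - 1);
  int32_t c[4 * 2] = {0};
  AddDot4x2Ref(k, a, b, c, 2);
  std::printf("c[0][0] = %d, c[3][1] = %d\n", c[0], c[7]);
  return 0;
}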
// 8 bits int small block inner product // 8 bits int small block inner product
void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
int32_t ldc) { int32_t ldc) {
...@@ -667,213 +539,51 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, ...@@ -667,213 +539,51 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c,
} }
// 8 bits int inner product // 8 bits int inner product
void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a, void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha,
const int8_t *b, float beta, int32_t *c, int32_t *C, const int8_t *a, const int8_t *b, int8_t beta,
int32_t ldc, bool relu) { int32_t *c, int32_t *C, int32_t ldc, bool relu,
int8_t *bias) {
#pragma omp parallel for #pragma omp parallel for
for (int32_t j = 0; j < nc; j += NR_INT8) { for (int32_t j = 0; j < nc; j += NR) {
for (int32_t i = 0; i < mc; i += MR_INT8) { for (int32_t i = 0; i < mc; i += MR_INT8) {
#if __aarch64__ #if __aarch64__
// TODO(wzzju) // TODO(wzzju)
#else #else
// AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
// AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif // __aarch64__ #endif // __aarch64__
} }
} }
if (!relu) { if (alpha != 1) {
WriteBasic(mc, nc, c, C, ldc); WriteWithAlphaBeta(mc, nc, c, C, ldc);
return; return;
} }
} if (beta == 0) {
WriteBasic(mc, nc, c, C, ldc);
void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha,
const int8_t *a, const int8_t *b, float beta,
int32_t *c, int8_t *C, int32_t ldc, bool relu,
int32_t *bias) {
#pragma omp parallel for
for (int32_t j = 0; j < nc; j += NR_INT8) {
for (int32_t i = 0; i < mc; i += MR_INT8) {
#if __aarch64__
// TODO(wzzju)
#else
// AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
// AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC);
#endif // __aarch64__
}
}
if (relu) {
WriteWithAddReluScale(mc, nc, c, C, ldc, bias, alpha);
return; return;
} else {
WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha);
} }
} if (beta == 1 && !relu) {
if (bias == nullptr) {
// 8 bits int PackMatrixA_4r WriteWithAdd(mc, nc, c, C, ldc);
void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail, } else {
const int8_t *A, int32_t lda, int8_t *buffer) { WriteWithAddV1(mc, nc, c, C, ldc, bias);
const int32_t i_length = m - m_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
const int8_t *a3 = A + (i + 3) * lda;
int8_t *local_buffer = buffer + i * KC;
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
} }
return;
} }
if (beta == 1 && relu) {
if (m_tail != 0) { if (bias == nullptr) {
const int8_t *a0 = &A(i_length, 0); WriteWithAddRelu(mc, nc, c, C, ldc);
const int8_t *a1 = a0 + lda; } else {
const int8_t *a2 = a0 + 2 * lda; WriteWithAddReluV1(mc, nc, c, C, ldc, bias);
const int8_t *a3 = a0 + 3 * lda;
int8_t *local_buffer = buffer + i_length * KC;
switch (m_tail) {
case 1:
a1 = zero_int8;
case 2:
a2 = zero_int8;
case 3:
a3 = zero_int8;
break;
default:
break;
}
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
} }
return;
} }
} }
// 8 bits int PackMatrixA_4r // 8 bits int PackMatrixA_4r
void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer) { int32_t lda, int8_t *buffer) {
const int8_t *a0, *a1, *a2, *a3; const int8_t *a0, *a1, *a2, *a3;
for (int32_t i = 0; i < m - m_tail; i += 4) { for (int32_t i = 0; i < m - m_tail; i += MR_INT8) {
a0 = A + i * lda; a0 = A + i * lda;
a1 = A + (i + 1) * lda; a1 = A + (i + 1) * lda;
a2 = A + (i + 2) * lda; a2 = A + (i + 2) * lda;
...@@ -915,7 +625,7 @@ void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, ...@@ -915,7 +625,7 @@ void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
int32_t lda, int8_t *buffer) { int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail; const int32_t i_length = m - m_tail;
for (int32_t i = 0; i < i_length; i += 6) { for (int32_t i = 0; i < i_length; i += MR_INT8) {
const int8_t *a0 = A + i * lda; const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda; const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda; const int8_t *a2 = A + (i + 2) * lda;
...@@ -966,79 +676,11 @@ void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, ...@@ -966,79 +676,11 @@ void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A,
} }
} }
// 8 bits int PackMatrixB
void Gemm::PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
for (int32_t j = 0; j < j_length; j += 2) {
int8_t *local_buffer = buffer + j * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j);
const int8_t *b1 = &B((i << 4), j + 1);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b1;
b1 += ldb;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j);
const int8_t *b1 = &B((k_count << 4), j + 1);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b1;
b1 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (n_tail != 0) {
int8_t *local_buffer = buffer + j_length * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j_length);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = 0;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j_length);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
// 8 bits int PackMatrixB // 8 bits int PackMatrixB
void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
int32_t ldb, int8_t *buffer) { int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail; const int32_t j_length = n - n_tail;
for (int32_t j = 0; j < j_length; j += 8) { for (int32_t j = 0; j < j_length; j += NR) {
int8_t *local_buffer = buffer + j * k; int8_t *local_buffer = buffer + j * k;
for (int32_t i = 0; i < k; ++i) { for (int32_t i = 0; i < k; ++i) {
const int8_t *b0 = &B(i, j); const int8_t *b0 = &B(i, j);
...@@ -1073,7 +715,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, ...@@ -1073,7 +715,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
for (int32_t j = j_length; j < n; ++j) { for (int32_t j = j_length; j < n; ++j) {
*local_buffer++ = *b0++; *local_buffer++ = *b0++;
} }
for (int32_t j = n; j < j_length + 8; ++j) { for (int32_t j = n; j < j_length + NR; ++j) {
*local_buffer++ = 0; *local_buffer++ = 0;
} }
} }
...@@ -1081,20 +723,19 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, ...@@ -1081,20 +723,19 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B,
} }
// 8 bits int matrix product (m*k x k*n) // 8 bits int matrix product (m*k x k*n)
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta, int32_t lda, const int8_t *B, int32_t ldb, int8_t beta,
int32_t *C, int32_t ldc, bool relu, int32_t *bias) { int32_t *C, int32_t ldc, bool relu, int8_t *bias) {
 // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73) // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
 // L2 cache is 0.5~4 MiB (Cortex-A72 cluster) // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int32_t L1 = 32 * 1024; int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024; int32_t L2 = 512 * 1024;
const int32_t k_complete = (k + 15) - ((k + 15) & 15); KC = k;
KC = k_complete;
MC = L1 / (KC * sizeof(int8_t)); MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t)); NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8 // make sure MC is multiple of MR_INT8, and NC is multiple of NR
if (MC == 0) { if (MC == 0) {
MC = MR_INT8; MC = MR_INT8;
} else { } else {
...@@ -1104,106 +745,52 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, ...@@ -1104,106 +745,52 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
} }
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) { if (NC == 0) {
NC = NR_INT8; NC = NR;
} else { } else {
int32_t nblock_num = (n + NC - 1) / NC; int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num; NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8; NC = (NC + NR - 1) / NR * NR;
} }
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>( packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC)); paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>( packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC)); paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int32 = static_cast<int32_t *>( packedC_int8 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC)); paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 = zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k)); static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k); memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
int32_t mc, nc; int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) { for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC); nc = s_min(n - j, NC);
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8); PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) { for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8); // PackMatrixA_6r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias == nullptr) { if (bias == nullptr) {
InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu);
}
}
}
paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32);
paddle_mobile::memory::Free(zero_int8);
}
// 8 bits int matrix product (m*k x k*n)
void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A,
int32_t lda, const int8_t *B, int32_t ldb, float beta,
int8_t *C, int32_t ldc, bool relu, int32_t *bias) {
 // L1 data cache is 32 KiB (per Cortex-A57, Cortex-A72, Cortex-A73)
 // L2 cache is 0.5~4 MiB (Cortex-A72 cluster)
int32_t L1 = 32 * 1024;
int32_t L2 = 512 * 1024;
const int32_t k_complete = (k + 15) - ((k + 15) & 15);
KC = k_complete;
MC = L1 / (KC * sizeof(int8_t));
NC = L2 / (KC * sizeof(int8_t));
// make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8
if (MC == 0) {
MC = MR_INT8;
} else {
int32_t mblock_num = (m + MC - 1) / MC;
MC = (m + mblock_num - 1) / mblock_num;
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
}
// DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n";
if (NC == 0) {
NC = NR_INT8;
} else {
int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8;
}
// DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n";
packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC));
packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
packedC_int32 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC));
zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k);
int32_t mc, nc;
for (int32_t j = 0; j < n; j += NC) {
nc = s_min(n - j, NC);
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8);
for (int32_t i = 0; i < m; i += MC) {
mc = s_min(m - i, MC);
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8);
if (bias != nullptr) {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta, InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int32, &C(i, j), ldc, relu, bias + i); packedC_int8, &C(i, j), ldc, relu, nullptr);
} else {
InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta,
packedC_int8, &C(i, j), ldc, relu, bias + i);
} }
} }
} }
paddle_mobile::memory::Free(packedA_int8); paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8); paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32); paddle_mobile::memory::Free(packedC_int8);
paddle_mobile::memory::Free(zero_int8); paddle_mobile::memory::Free(zero_int8);
} }
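Sgemm above sizes its panels from the cache budget: KC is the (padded) depth, MC is roughly L1 / (KC * sizeof(int8_t)) rounded to a multiple of MR_INT8, and NC is roughly L2 / (KC * sizeof(int8_t)) rounded to a multiple of the column tile. The standalone sketch below reproduces that block-size computation; the cache sizes and tile widths are the same assumptions used in the code, everything else is illustrative.

// Sketch of the cache-aware block sizing used by Sgemm above (illustrative).
#include <cstdio>

int RoundBlock(int total, int cache_limit, int tile) {
  int blk = cache_limit;
  if (blk == 0) return tile;
  int nblocks = (total + blk - 1) / blk;  // number of blocks along this dimension
  blk = (total + nblocks - 1) / nblocks;  // even block size across the blocks
  return (blk + tile - 1) / tile * tile;  // round up to the register tile
}

int main() {
  const int L1 = 32 * 1024, L2 = 512 * 1024;
  const int KC = 256, MR = 4, NR = 2;
  int MC = RoundBlock(/*m=*/1000, L1 / KC, MR);
  int NC = RoundBlock(/*n=*/1000, L2 / KC, NR);
  std::printf("MC = %d, NC = %d\n", MC, NC);
  return 0;
}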
// 8 bits int write back // 8 bits int write back
// C = A * B // C = alpha * A * B + beta * C
void Gemm::WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) {}
 // C = A * B, 8-bit int inputs, int32_t output
void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc) { int32_t ldc) {
#if __ARM_NEON #if __ARM_NEON
...@@ -1215,7 +802,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, ...@@ -1215,7 +802,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t step = sizeof(int32_t) * ldc; int32_t step = sizeof(int32_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 4)); int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 4));
int32_t volatile m = mc; int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile C_ptr; int32_t *volatile c_ptr, *volatile C_ptr;
int32_t *C0, *c0; int32_t *C0, *c0;
c_ptr = c; c_ptr = c;
...@@ -1249,7 +836,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, ...@@ -1249,7 +836,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
"end_mc_%=: \n\t" "end_mc_%=: \n\t"
: :
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1),
[step] "r"(step), [step1] "r"(step1) [step] "r"(step), [step1] "r"(step1)
: "memory", "r5", "r6", "q0", "q1", "q2", "q3"); : "memory", "r5", "r6", "q0", "q1", "q2", "q3");
} }
...@@ -1267,254 +854,20 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, ...@@ -1267,254 +854,20 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
#endif // __ARM_NEON #endif // __ARM_NEON
} }
// C = A * B + bias, scale * C // C = A * B + C
void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, void Gemm::WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int32_t ldc, int32_t *bias, float scale) { int32_t ldc) {}
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
#else
int32_t zero = 0;
int8_t narrow = -128;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"loop_mc_%=: \n\t"
"vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t"
"vdup.32 q13, d26[0] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q13 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vceq.s8 d15, d14, d24 \n\t"
"vsub.s8 d14, d14, d15 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
: // C = A * B + bias
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), void Gemm::WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr), int32_t ldc, int8_t *bias) {}
[scale] "r"(scale), [zero] "r"(zero), [narrow] "r"(narrow) // C = A * B + C, relu(C)
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", void Gemm::WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
"q7", "q12", "q13", "q14", "q15"); int32_t ldc) {}
}
int32_t nc_left; // C = A * B + bias, relu(C)
int32_t *c0; void Gemm::WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C,
int8_t *C0; int32_t ldc, int8_t *bias) {}
int32_t bias_v;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
bias_v = *(bias_ptr + i);
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"vdup.8 d24, %[narrow] \n\t"
"vdup.32 q13, %[bias_v] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vceq.s8 d9, d8, d24 \n\t"
"vsub.s8 d8, d8, d9 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0),
[bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero),
[narrow] "r"(narrow)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14",
"q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
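WriteWithAddScale above adds the per-row int32 bias, scales by a float, saturates to int8, and bumps an exact -128 up to -127 (the vceq/vsub pair) so the value range stays symmetric. A scalar sketch of that write-back path, capturing the net effect rather than the exact NEON rounding and saturation sequence:

// Scalar sketch of the int32 -> int8 write-back in WriteWithAddScale above:
// add bias, scale, saturate to int8, never emit -128 (bumped to -127).
#include <algorithm>
#include <cstdint>
#include <cstdio>

int8_t QuantizeWriteBack(int32_t acc, int32_t bias, float scale) {
  int32_t v = acc + bias;                        // vqadd.s32 (int32 saturation ignored here)
  float scaled = static_cast<float>(v) * scale;  // vcvt + vmul
  int32_t q = static_cast<int32_t>(scaled);      // vcvt.s32.f32 truncates toward zero
  q = std::max(-127, std::min(127, q));          // saturate, excluding -128
  return static_cast<int8_t>(q);
}

int main() {
  std::printf("%d %d\n", QuantizeWriteBack(1000, 24, 0.05f),
              QuantizeWriteBack(-90000, 0, 0.01f));
  return 0;
}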
// C = A * B + bias, scale * relu(C)
void Gemm::WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C,
int32_t ldc, int32_t *bias, float scale) {
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
#else
int32_t zero = 0;
int32_t nc1 = nc >> 3;
int32_t _nc1 = nc & 7;
int32_t step = sizeof(int8_t) * ldc;
int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3));
int32_t volatile m = mc;
int32_t volatile n = nc1;
int32_t *volatile c_ptr, *volatile bias_ptr;
int8_t *volatile C_ptr;
c_ptr = c;
C_ptr = C;
bias_ptr = bias;
if (nc1 > 0) {
asm volatile(
"subs %[mc], %[mc], #1 \n\t"
"blt end_mc_%= \n\t"
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"loop_mc_%=: \n\t"
"vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t"
"vdup.32 q13, d26[0] \n\t"
"mov r6, %[C_ptr] \n\t"
"mov r5, %[nc1] \n\t"
"subs r5, r5, #1 \n\t"
"blt end_nc1_%= \n\t"
"loop_nc1_%=: \n\t"
"vld1.32 {q0, q1}, [%[c_ptr]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vqadd.s32 q1, q1, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vmax.s32 q1, q1, q14 \n\t"
"vcvt.f32.s32 q2, q0 \n\t"
"vcvt.f32.s32 q3, q1 \n\t"
"vmul.f32 q2, q2, q15 \n\t"
"vmul.f32 q3, q3, q15 \n\t"
"vcvt.s32.f32 q4, q2 \n\t"
"vcvt.s32.f32 q5, q3 \n\t"
"vqmovn.s32 d12, q4 \n\t"
"vqmovn.s32 d13, q5 \n\t"
"vqmovn.s16 d14, q6 \n\t"
"vst1.8 {d14}, [r6]! \n\t"
"subs r5, r5, #1 \n\t"
"bge loop_nc1_%= \n\t"
"end_nc1_%=: \n\t"
"add %[C_ptr], %[C_ptr], %[step] \n\t"
"add %[c_ptr], %[c_ptr], %[step1] \n\t"
"subs %[mc], %[mc], #1 \n\t"
"bge loop_mc_%= \n\t"
"end_mc_%=: \n\t"
:
: [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n),
[step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr),
[scale] "r"(scale), [zero] "r"(zero)
: "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6",
"q7", "q13", "q14", "q15");
}
int32_t nc_left;
int32_t *c0;
int8_t *C0;
int32_t bias_v;
if (_nc1 != 0) {
for (int32_t i = 0; i < mc; i++) {
C0 = C_ptr + nc1 * 8 + i * ldc;
c0 = c_ptr + nc1 * 8 + i * NC;
bias_v = *(bias_ptr + i);
nc_left = _nc1;
asm volatile(
"vdup.32 q15, %[scale] \n\t"
"vdup.32 q14, %[zero] \n\t"
"vdup.32 q13, %[bias_v] \n\t"
"cmp %[_nc1], #4 \n\t"
"blt less_four_%= \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vst1.8 {d8[1]}, [%[C0]]! \n\t"
"vst1.8 {d8[2]}, [%[C0]]! \n\t"
"vst1.8 {d8[3]}, [%[C0]]! \n\t"
"subs %[_nc1], %[_nc1], #4 \n\t"
"beq process_over_%= \n\t"
"less_four_%=: \n\t"
"vld1.32 {q0}, [%[c0]]! \n\t"
"vqadd.s32 q0, q0, q13 \n\t"
"vmax.s32 q0, q0, q14 \n\t"
"vcvt.f32.s32 q1, q0 \n\t"
"vmul.f32 q1, q1, q15 \n\t"
"vcvt.s32.f32 q2, q1 \n\t"
"vqmovn.s32 d6, q2 \n\t"
"vqmovn.s16 d8, q3 \n\t"
"loop_save_%=: \n\t"
"vst1.8 {d8[0]}, [%[C0]]! \n\t"
"vext.8 d8, d8, d8, #1 \n\t"
"subs %[_nc1], %[_nc1], #1 \n\t"
"bgt loop_save_%= \n\t"
"process_over_%=: \n\t"
:
: [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0),
[bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero)
: "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q13", "q14", "q15");
}
}
#endif // __aarch64__
#endif // __ARM_NEON
}
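// A minimal scalar sketch of what the NEON path above computes, kept as a
// readability aid only. The helper names and the c_stride parameter (standing
// in for the class member NC) are illustrative, saturation corner cases are
// simplified, and the float-to-int conversion truncates toward zero to match
// vcvt.s32.f32.
static inline int8_t SaturateToInt8(int32_t v) {
  return v > 127 ? 127 : (v < -128 ? -128 : static_cast<int8_t>(v));
}

// C[i][j] = saturate_int8(trunc(scale * max(0, c[i][j] + bias[i])))
void WriteWithAddReluScaleRef(int32_t mc, int32_t nc, const int32_t *c,
                              int32_t c_stride, int8_t *C, int32_t ldc,
                              const int32_t *bias, float scale) {
  for (int32_t i = 0; i < mc; ++i) {
    for (int32_t j = 0; j < nc; ++j) {
      int64_t acc = static_cast<int64_t>(c[i * c_stride + j]) + bias[i];
      if (acc < 0) acc = 0;                           // relu
      int32_t q = static_cast<int32_t>(acc * scale);  // truncate, like vcvt.s32.f32
      C[i * ldc + j] = SaturateToInt8(q);
    }
  }
}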
} // namespace math } // namespace math
} // namespace operators } // namespace operators
......
...@@ -28,10 +28,10 @@ namespace operators { ...@@ -28,10 +28,10 @@ namespace operators {
namespace math { namespace math {
// 8 bits int matrix product (m*k x k*n) // 8 bits int matrix product (m*k x k*n)
void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha,
const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb,
float beta, int32_t *C, int32_t ldc, bool relu, int8_t beta, int32_t *C, int32_t ldc, bool relu,
int32_t *bias) { int8_t *bias) {
#ifdef _OPENMP #ifdef _OPENMP
int32_t max_threads = omp_get_max_threads(); int32_t max_threads = omp_get_max_threads();
#else #else
...@@ -39,11 +39,10 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -39,11 +39,10 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
#endif #endif
int32_t L1 = 64 / max_threads * 1024; int32_t L1 = 64 / max_threads * 1024;
const int32_t k_complete = (k + 15) - ((k + 15) & 15); KC = k;
KC = k_complete;
zero_int8 = zero_int8 =
static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * k)); static_cast<int8_t *>(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC));
memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * k); memset(static_cast<void *>(zero_int8), 0, sizeof(int8_t) * KC);
if (m > n) { if (m > n) {
// Block matrix A // Block matrix A
MC = L1 / (KC * sizeof(int8_t)); MC = L1 / (KC * sizeof(int8_t));
...@@ -55,14 +54,14 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -55,14 +54,14 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8; MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;
} }
// Pad B // Pad B
NC = (n + NR_INT8 - 1) / NR_INT8 * NR_INT8; NC = (n + NR - 1) / NR * NR;
packedB_int8 = static_cast<int8_t *>( packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC)); paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC));
#if __aarch64__ #if __aarch64__
// TODO(wzzju) // TODO(wzzju)
#else #else
PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8); PackMatrixB_omp_8c(KC, n, n % NR, B, ldb, packedB_int8);
#endif #endif
packedA_int8 = static_cast<int8_t *>( packedA_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads)); paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads));
...@@ -70,11 +69,11 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -70,11 +69,11 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
// Block matrix B // Block matrix B
NC = L1 / (KC * sizeof(int8_t)); NC = L1 / (KC * sizeof(int8_t));
if (NC == 0) { if (NC == 0) {
NC = NR_INT8; NC = NR;
} else { } else {
int32_t nblock_num = (n + NC - 1) / NC; int32_t nblock_num = (n + NC - 1) / NC;
NC = (n + nblock_num - 1) / nblock_num; NC = (n + nblock_num - 1) / nblock_num;
NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8; NC = (NC + NR - 1) / NR * NR;
} }
// Pad A // Pad A
MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8; MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8;
...@@ -84,12 +83,12 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -84,12 +83,12 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
#if __aarch64__ #if __aarch64__
// TODO(wzzju) // TODO(wzzju)
#else #else
PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8); PackMatrixA_omp_4r(m, KC, m % MR_INT8, A, lda, packedA_int8);
#endif #endif
packedB_int8 = static_cast<int8_t *>( packedB_int8 = static_cast<int8_t *>(
paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads)); paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads));
} }
packedC_int32 = static_cast<int32_t *>( packedC_int8 = static_cast<int32_t *>(
paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads)); paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC * max_threads));
if (m > n) { if (m > n) {
...@@ -104,19 +103,14 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -104,19 +103,14 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
int32_t mc; int32_t mc;
mc = s_min(m - i, MC); mc = s_min(m - i, MC);
int8_t *local_A = packedA_int8 + MC * KC * local_threads; int8_t *local_A = packedA_int8 + MC * KC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads; int32_t *local_C = packedC_int8 + MC * NC * local_threads;
#if __aarch64__ #if __aarch64__
// TODO(wzzju) // TODO(wzzju)
#else #else
PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A); PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, local_A);
#endif #endif
// InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C,
// local_C, &C(i, 0), ldc, relu, bias + i);
// &C(i, 0), ldc, relu, bias + i);
if (bias == nullptr) {
InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C,
&C(i, 0), ldc, relu);
}
} }
} else { } else {
#pragma omp parallel for #pragma omp parallel for
...@@ -129,25 +123,20 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, ...@@ -129,25 +123,20 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha,
int32_t nc; int32_t nc;
nc = s_min(n - j, NC); nc = s_min(n - j, NC);
int8_t *local_B = packedB_int8 + KC * NC * local_threads; int8_t *local_B = packedB_int8 + KC * NC * local_threads;
int32_t *local_C = packedC_int32 + MC * NC * local_threads; int32_t *local_C = packedC_int8 + MC * NC * local_threads;
#if __aarch64__ #if __aarch64__
// TODO(wzzju) // TODO(wzzju)
#else #else
PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B); PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, local_B);
#endif #endif
// InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C,
// local_C, &C(0, j), ldc, relu, bias);
// &C(0, j), ldc, relu, bias);
if (bias == nullptr) {
InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C,
&C(0, j), ldc, relu);
}
} }
} }
paddle_mobile::memory::Free(packedA_int8); paddle_mobile::memory::Free(packedA_int8);
paddle_mobile::memory::Free(packedB_int8); paddle_mobile::memory::Free(packedB_int8);
paddle_mobile::memory::Free(packedC_int32); paddle_mobile::memory::Free(packedC_int8);
paddle_mobile::memory::Free(zero_int8); paddle_mobile::memory::Free(zero_int8);
} }
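// A back-of-the-envelope check of the cache-blocking arithmetic in Sgemm_omp
// above. The function name and the sample values are illustrative only, and
// MR_INT8 is assumed to be 4, matching the 4-row packing routines in this diff.
#include <cstdint>

int32_t BlockedMcForInt8(int32_t max_threads, int32_t KC, int32_t MR_INT8) {
  int32_t L1 = 64 / max_threads * 1024;          // 4 threads -> 16384 bytes
  int32_t MC = L1 / (KC * static_cast<int32_t>(sizeof(int8_t)));  // KC = 256 -> 64
  MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8;   // round up to whole 4-row panels
  return MC;                                     // 4 threads, KC = 256 -> MC = 64
}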
...@@ -155,7 +144,7 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, ...@@ -155,7 +144,7 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) { const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail; const int32_t j_length = n - n_tail;
#pragma omp parallel for #pragma omp parallel for
for (int32_t j = 0; j < j_length; j += 8) { for (int32_t j = 0; j < j_length; j += NR) {
int8_t *local_buffer = buffer + j * k; int8_t *local_buffer = buffer + j * k;
for (int32_t i = 0; i < k; ++i) { for (int32_t i = 0; i < k; ++i) {
const int8_t *b0 = &B(i, j); const int8_t *b0 = &B(i, j);
...@@ -190,7 +179,7 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, ...@@ -190,7 +179,7 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
for (int32_t j = j_length; j < n; ++j) { for (int32_t j = j_length; j < n; ++j) {
*local_buffer++ = *b0++; *local_buffer++ = *b0++;
} }
for (int32_t j = n; j < j_length + 8; ++j) { for (int32_t j = n; j < j_length + NR; ++j) {
*local_buffer++ = 0; *local_buffer++ = 0;
} }
} }
...@@ -199,9 +188,9 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, ...@@ -199,9 +188,9 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail,
void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) { const int8_t *A, int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail; const int i_length = m - m_tail;
#pragma omp parallel for #pragma omp parallel for
for (int32_t i = 0; i < i_length; i += 4) { for (int32_t i = 0; i < i_length; i += MR_INT8) {
const int8_t *a0 = A + i * lda; const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda; const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda; const int8_t *a2 = A + (i + 2) * lda;
...@@ -232,7 +221,7 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, ...@@ -232,7 +221,7 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
default: default:
break; break;
} }
for (int32_t j = 0; j < k; ++j) { for (int j = 0; j < k; ++j) {
*local_buffer++ = *a0++; *local_buffer++ = *a0++;
*local_buffer++ = *a1++; *local_buffer++ = *a1++;
*local_buffer++ = *a2++; *local_buffer++ = *a2++;
...@@ -241,232 +230,6 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, ...@@ -241,232 +230,6 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail,
} }
} }
// 8 bits int PackMatrixA_4r
void Gemm::PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail,
const int8_t *A, int32_t lda, int8_t *buffer) {
const int32_t i_length = m - m_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
#pragma omp parallel for
for (int32_t i = 0; i < i_length; i += 4) {
const int8_t *a0 = A + i * lda;
const int8_t *a1 = A + (i + 1) * lda;
const int8_t *a2 = A + (i + 2) * lda;
const int8_t *a3 = A + (i + 3) * lda;
int8_t *local_buffer = buffer + i * KC;
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (m_tail != 0) {
const int8_t *a0 = &A(i_length, 0);
const int8_t *a1 = a0 + lda;
const int8_t *a2 = a0 + 2 * lda;
const int8_t *a3 = a0 + 3 * lda;
int8_t *local_buffer = buffer + i_length * KC;
switch (m_tail) {
case 1:
a1 = zero_int8;
case 2:
a2 = zero_int8;
case 3:
a3 = zero_int8;
break;
default:
break;
}
for (int32_t j = 0; j < k_count; ++j) {
#if __ARM_NEON
#if __aarch64__
// TODO(wzzju)
#else
asm volatile(
"vld1.s8 {d0, d1}, [%[a0]]! \n\t"
"vld1.s8 {d2, d3}, [%[a1]]! \n\t"
"vld1.s8 {d4, d5}, [%[a2]]! \n\t"
"vld1.s8 {d6, d7}, [%[a3]]! \n\t"
"vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t"
"vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t"
"vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t"
"vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t"
: [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1),
[a2] "+r"(a2), [a3] "+r"(a3)
:
: "memory", "q0", "q1", "q2", "q3");
#endif // __aarch64__
#else
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a0++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a1++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a2++;
}
for (int32_t l = 0; l < 16; ++l) {
*local_buffer++ = *a3++;
}
#endif // __ARM_NEON
}
if (k_tail != 0) {
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a0++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a1++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a2++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *a3++;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
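// A scalar sketch of the packed layout produced above, simplified under the
// assumptions that k == KC, KC is a multiple of 16 and m is a multiple of 4;
// the real routine additionally zero-pads the k tail and the last m % 4 rows.
// The _ref name is illustrative only.
void PackMatrixA_4r_16_ref(int32_t m, int32_t k, const int8_t *A, int32_t lda,
                           int8_t *buffer) {
  for (int32_t i = 0; i < m; i += 4) {       // one 4-row panel of A at a time
    for (int32_t j = 0; j < k; j += 16) {    // interleave 16-column chunks
      for (int32_t r = 0; r < 4; ++r) {      // rows 0..3 of the panel
        for (int32_t c = 0; c < 16; ++c) {
          *buffer++ = A[(i + r) * lda + j + c];
        }
      }
    }
  }
}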
// 8 bits int PackMatrixB
void Gemm::PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail,
const int8_t *B, int32_t ldb, int8_t *buffer) {
const int32_t j_length = n - n_tail;
const int32_t k_count = k >> 4;
const int32_t k_tail = k & 15;
#pragma omp parallel for
for (int32_t j = 0; j < j_length; j += 2) {
int8_t *local_buffer = buffer + j * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j);
const int8_t *b1 = &B((i << 4), j + 1);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b1;
b1 += ldb;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j);
const int8_t *b1 = &B((k_count << 4), j + 1);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b1;
b1 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
if (n_tail != 0) {
int8_t *local_buffer = buffer + j_length * KC;
for (int32_t i = 0; i < k_count; ++i) {
const int8_t *b0 = &B((i << 4), j_length);
for (int m = 0; m < 16; ++m) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int m = 0; m < 16; ++m) {
*local_buffer++ = 0;
}
}
if (k_tail != 0) {
const int8_t *b0 = &B((k_count << 4), j_length);
for (int32_t j = k_count << 4; j < k; ++j) {
*local_buffer++ = *b0;
b0 += ldb;
}
for (int32_t j = k; j < KC; ++j) {
*local_buffer++ = 0;
}
for (int32_t j = k_count << 4; j < KC; ++j) {
*local_buffer++ = 0;
}
}
}
}
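// A scalar sketch of the packed B layout produced above, simplified under the
// assumptions that k == KC, KC is a multiple of 16 and n is even; the real
// routine additionally zero-pads the k tail and a trailing odd column.
// The _ref name is illustrative only.
void PackMatrixB_2c_16_ref(int32_t k, int32_t n, const int8_t *B, int32_t ldb,
                           int8_t *buffer) {
  for (int32_t j = 0; j < n; j += 2) {         // one 2-column panel of B at a time
    for (int32_t i = 0; i < k; i += 16) {      // interleave 16-row chunks
      for (int32_t col = 0; col < 2; ++col) {  // column j, then column j + 1
        for (int32_t r = 0; r < 16; ++r) {
          *buffer++ = B[(i + r) * ldb + j + col];
        }
      }
    }
  }
}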
} // namespace math } // namespace math
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -28,12 +28,7 @@ template <typename T> ...@@ -28,12 +28,7 @@ template <typename T>
void matmul(const framework::Tensor &matrix_a, bool trans_a, void matmul(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, T alpha, const framework::Tensor &matrix_b, bool trans_b, T alpha,
framework::Tensor *matrix_out, T beta, bool relu = false, framework::Tensor *matrix_out, T beta, bool relu = false,
float *bias = nullptr); T *bias = nullptr);
void matmul_int8(const framework::Tensor &matrix_a, bool trans_a,
const framework::Tensor &matrix_b, bool trans_b, float alpha,
framework::Tensor *matrix_out, float beta, bool relu = false,
int32_t *bias = nullptr);
template <typename T> template <typename T>
void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a, void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
......
...@@ -20,10 +20,11 @@ limitations under the License. */ ...@@ -20,10 +20,11 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
namespace math { namespace math {
void matmul_int8(const framework::Tensor &matrix_a, bool trans_a, template <>
const framework::Tensor &matrix_b, bool trans_b, float alpha, void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
framework::Tensor *matrix_out, float beta, bool relu, const framework::Tensor &matrix_b, bool trans_b,
int32_t *bias) { int8_t alpha, framework::Tensor *matrix_out, int8_t beta,
bool relu, int8_t *bias) {
auto dim_a = matrix_a.dims(); auto dim_a = matrix_a.dims();
auto dim_b = matrix_b.dims(); auto dim_b = matrix_b.dims();
auto dim_out = matrix_out->dims(); auto dim_out = matrix_out->dims();
...@@ -51,45 +52,21 @@ void matmul_int8(const framework::Tensor &matrix_a, bool trans_a, ...@@ -51,45 +52,21 @@ void matmul_int8(const framework::Tensor &matrix_a, bool trans_a,
} }
#ifdef _OPENMP #ifdef _OPENMP
if (bias != nullptr) { gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
// TODO(wzzju): gemm.Sgemm_omp_with_bias, now use single thread instead. matrix_out->data<int32_t>(), N, relu, bias);
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int8_t>(), N, relu, bias);
} else {
gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
}
#else #else
if (bias != nullptr) { gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N, relu, bias);
matrix_out->data<int8_t>(), N, relu, bias);
} else {
gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
}
#endif #endif
} else { } else {
#ifdef _OPENMP #ifdef _OPENMP
if (bias != nullptr) { gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
// TODO(wzzju): gemm.Sgemm_omp_with_bias, now use single thread instead. matrix_b.data<int8_t>(), N, beta,
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K, matrix_out->data<int32_t>(), N, relu, bias);
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int8_t>(),
N, relu, bias);
} else {
gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta,
matrix_out->data<int32_t>(), N, relu, bias);
}
#else #else
if (bias != nullptr) { gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K, matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int8_t>(), relu, bias);
N, relu, bias);
} else {
gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(),
N, relu, bias);
}
#endif #endif
} }
} }
......
...@@ -1633,11 +1633,11 @@ class FusionFcParam : public OpParam { ...@@ -1633,11 +1633,11 @@ class FusionFcParam : public OpParam {
y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs); y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
axis_ = GetAttr<int>("axis", attrs); axis_ = GetAttr<int>("axis", attrs);
} }
const GType *InputX() const { return input_x_; } GType *InputX() const { return input_x_; }
const RType *InputY() const { return input_y_; } RType *InputY() const { return input_y_; }
const RType *InputZ() const { return input_z_; } RType *InputZ() const { return input_z_; }
GType *Out() const { return out_; } GType *Out() const { return out_; }
......
...@@ -28,7 +28,7 @@ limitations under the License. */ ...@@ -28,7 +28,7 @@ limitations under the License. */
int main() { int main() {
paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile; paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
paddle_mobile.SetThreadNum(4); paddle_mobile.SetThreadNum(8);
Tensor aa, bb, cc; Tensor aa, bb, cc;
auto aaptr = aa.mutable_data<float>({m, k}); auto aaptr = aa.mutable_data<float>({m, k});
auto bbptr = bb.mutable_data<float>({k, n}); auto bbptr = bb.mutable_data<float>({k, n});
...@@ -44,12 +44,10 @@ int main() { ...@@ -44,12 +44,10 @@ int main() {
ccptr[i] = 2; ccptr[i] = 2;
} }
Tensor aa_int8, bb_int8, cc_int32, cc_int8; Tensor aa_int8, bb_int8, cc_int8;
auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k}); auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n}); auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
auto ccptr_int32 = cc_int32.mutable_data<int32_t>({m, n}); auto ccptr_int8 = cc_int8.mutable_data<int32_t>({m, n});
auto ccptr_int8 = cc_int8.mutable_data<int8_t>({m, n});
int32_t* bias_data = new int32_t[m];
for (int i = 0; i < m * k; ++i) { for (int i = 0; i < m * k; ++i) {
aaptr_int8[i] = static_cast<int8_t>(2); aaptr_int8[i] = static_cast<int8_t>(2);
...@@ -58,11 +56,7 @@ int main() { ...@@ -58,11 +56,7 @@ int main() {
bbptr_int8[i] = static_cast<int8_t>(2); bbptr_int8[i] = static_cast<int8_t>(2);
} }
for (int i = 0; i < m * n; ++i) { for (int i = 0; i < m * n; ++i) {
ccptr_int32[i] = static_cast<int32_t>(2); ccptr_int8[i] = static_cast<int32_t>(2);
}
for (int i = 0; i < m; ++i) {
bias_data[i] = 2;
} }
// float // float
...@@ -82,41 +76,22 @@ int main() { ...@@ -82,41 +76,22 @@ int main() {
auto time2 = time(); auto time2 = time();
std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n"; std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
// int8_t without bias // int8_t
// warm-up 10 times // warm-up 10 times
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul_int8( paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32, aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<float>(0), false, nullptr); static_cast<int8_t>(0), false, nullptr);
} }
auto time3 = time(); auto time3 = time();
for (int j = 0; j < 10; ++j) { for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul_int8( paddle_mobile::operators::math::matmul<int8_t>(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32, aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
static_cast<float>(0), false, nullptr); static_cast<int8_t>(0), false, nullptr);
} }
auto time4 = time(); auto time4 = time();
std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n"; std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
// int8_t with bias&relu
// warm-up 10 times
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul_int8(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int8,
static_cast<float>(0), true, &bias_data[0]);
}
auto time5 = time();
for (int j = 0; j < 10; ++j) {
paddle_mobile::operators::math::matmul_int8(
aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int8,
static_cast<float>(0), true, &bias_data[0]);
}
auto time6 = time();
std::cout << "int8_t gemm_with_bias_relu cost :"
<< time_diff(time5, time6) / 10 << "ms\n";
delete[] bias_data;
return 0; return 0;
} }
...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <fstream> #include <fstream>
#include <iomanip>
#include <iostream>
#include "../test_include.h" #include "../test_include.h"
#ifdef PADDLE_MOBILE_FPGA_V1 #ifdef PADDLE_MOBILE_FPGA_V1
...@@ -87,26 +89,29 @@ int main() { ...@@ -87,26 +89,29 @@ int main() {
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile; paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
if (paddle_mobile.Load(std::string(g_resnet50), true)) { if (paddle_mobile.Load(std::string(g_resnet50), true)) {
Tensor input_tensor; Tensor input_tensor;
SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0), SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(2),
static_cast<float>(1)); static_cast<float>(2));
readStream(g_image_src_float, readStream(g_image_src_float,
input_tensor.mutable_data<float>({1, 3, 224, 224})); input_tensor.mutable_data<float>({1, 3, 224, 224}));
paddle_mobile.FeedData(input_tensor); paddle_mobile.FeedData(input_tensor);
paddle_mobile.Predict_To(-1); paddle_mobile.Predict_To(-1);
/*for(int i = 0; i < 73; i++) for (int i = 0; i < 73; i++) {
{
auto tensor_ptr = paddle_mobile.FetchResult(i); auto tensor_ptr = paddle_mobile.FetchResult(i);
std::string saveName = "resnet50_result_" + std::to_string (i); std::string saveName = "resnet50_result_" + std::to_string(i);
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(), paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(),
tensor_ptr->numel()); dump_stride(saveName, (*tensor_ptr), 20); tensor_ptr->numel() * sizeof(half));
//dump(saveName, (*tensor_ptr)); dump_stride(saveName, (*tensor_ptr), 20);
}*/ // dump(saveName, (*tensor_ptr));
}
/*std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(73); std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(73);
(*output_tensor).dump<float>("resnet50_result_73"); //(*output_tensor).dump<float>("resnet50_result_73");
output_tensor = paddle_mobile.FetchResult(74); output_tensor = paddle_mobile.FetchResult(74);
(*output_tensor).dump<float>("resnet50_result_74");*/ //(*output_tensor).dump<float>("resnet50_result_74");
std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(74); // std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(74);
// output_tensor = paddle_mobile.FetchResult(74);
float max = 0; float max = 0;
auto data_ptr = output_tensor->data<float>(); auto data_ptr = output_tensor->data<float>();
int maximumIdx = 0; int maximumIdx = 0;
...@@ -116,7 +121,7 @@ int main() { ...@@ -116,7 +121,7 @@ int main() {
max = data_ptr[i]; max = data_ptr[i];
} }
} }
std::cout << "index : " << maximumIdx << ", value : " << max std::cout << "index : " << std::dec << maximumIdx << ", value : " << max
<< std::endl; << std::endl;
std::cout << "Computation done" << std::endl; std::cout << "Computation done" << std::endl;
return 0; return 0;
......