提交 38ad497e 编写于 作者: H hjchen2

Merge branch 'ocr_ctc' of https://github.com/hjchen2/paddle-mobile into ocr_ctc

...@@ -10,6 +10,7 @@ option(LOG_PROFILE "log profile" OFF) ...@@ -10,6 +10,7 @@ option(LOG_PROFILE "log profile" OFF)
option(CPU "armv7 with neon" ON) option(CPU "armv7 with neon" ON)
option(GPU_MALI "mali gpu" OFF) option(GPU_MALI "mali gpu" OFF)
option(GPU_CL "opencl gpu" OFF) option(GPU_CL "opencl gpu" OFF)
option(FPGA "fpga" OFF) option(FPGA "fpga" OFF)
if(FPGA) if(FPGA)
option(FPGAV1 "fpga v1" ON) option(FPGAV1 "fpga v1" ON)
...@@ -144,7 +145,7 @@ if(FPGA) ...@@ -144,7 +145,7 @@ if(FPGA)
endforeach() endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h) file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h)
foreach(f ${_tmp_list}) foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach() endforeach()
endif() endif()
if(FPGAV2) if(FPGAV2)
...@@ -156,7 +157,7 @@ if(FPGA) ...@@ -156,7 +157,7 @@ if(FPGA)
endforeach() endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h) file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h)
foreach(f ${_tmp_list}) foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach() endforeach()
endif() endif()
......
...@@ -7,11 +7,21 @@ ...@@ -7,11 +7,21 @@
<!--[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle-Mobile.svg)](https://github.com/PaddlePaddle/Paddle-Mobile/releases) <!--[![Release](https://img.shields.io/github/release/PaddlePaddle/Paddle-Mobile.svg)](https://github.com/PaddlePaddle/Paddle-Mobile/releases)
[![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)--> [![License](https://img.shields.io/badge/license-Apache%202-blue.svg)](LICENSE)-->
Welcome to the Paddle-Mobile GitHub project. Paddle-Mobile is a project of PaddlePaddle as well as a deep learning framework for embedded platforms.
欢迎来到 Paddle-Mobile GitHub 项目。Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平台的深度学习的框架。 欢迎来到 Paddle-Mobile GitHub 项目。Paddle-Mobile是PaddlePaddle组织下的项目,是一个致力于嵌入式平台的深度学习的框架。
## Features ## Features
- high performance in support of ARM CPU
- support Mali GPU
- support Adreno GPU
- support the realization of GPU Metal on Apple devices
- support implementation on ZU5, ZU9 and other FPGA-based development boards
- support implementation on Raspberry Pi and other arm-linux development boards
## Features
- 高性能支持ARM CPU - 高性能支持ARM CPU
- 支持Mali GPU - 支持Mali GPU
- 支持Andreno GPU - 支持Andreno GPU
...@@ -19,6 +29,7 @@ ...@@ -19,6 +29,7 @@
- 支持ZU5、ZU9等FPGA开发板 - 支持ZU5、ZU9等FPGA开发板
- 支持树莓派等arm-linux开发板 - 支持树莓派等arm-linux开发板
## Demo ## Demo
- [ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo) - [ANDROID](https://github.com/xiebaiyuan/paddle-mobile-demo)
...@@ -26,6 +37,27 @@ ...@@ -26,6 +37,27 @@
[https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo) [https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo](https://github.com/PaddlePaddle/paddle-mobile/tree/develop/demo)
## Documentation
### Documentation of design
If you want to know more details about the documentation of paddle-mobile design, please refer to the link as follows. There are many previous designs and discussions: [issue](https://github.com/PaddlePaddle/paddle-mobile/issues).
[link of documentation of design](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/design_doc.md)
### Documentation of development
Documentation of development is mainly about building, running and other tasks. As a developer, you can use it with the help of the contributed documents.
* [iOS](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_ios.md)
* [Android_CPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android.md)
* [Android_GPU](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_android_GPU.md)
* [FPGA](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_fpga.md)
* [ARM_LINUX](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/doc/development_arm_linux.md)
### How to contribute your documents
- [tutorial link to contribute documents](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
- The main procedure of contributing code is covered in the document above. If you have other problems during the procedure, please report them as an [issue](https://github.com/PaddlePaddle/paddle-mobile/issues). We will deal with them as quickly as possible.
## 文档 ## 文档
### 设计文档 ### 设计文档
...@@ -46,6 +78,24 @@ ...@@ -46,6 +78,24 @@
- [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md) - [贡献文档链接](https://github.com/PaddlePaddle/paddle-mobile/blob/develop/CONTRIBUTING.md)
- 上面文档中涵盖了主要的贡献代码流程,如果在实践中您还遇到了其他问题,可以发[issue](https://github.com/PaddlePaddle/paddle-mobile/issues)。我们看到后会尽快处理。 - 上面文档中涵盖了主要的贡献代码流程,如果在实践中您还遇到了其他问题,可以发[issue](https://github.com/PaddlePaddle/paddle-mobile/issues)。我们看到后会尽快处理。
## Acquisition of Models
At present Paddle-Mobile only supports models trained by Paddle fluid. If you have other kinds of models, they will run correctly after conversion.
### 1. Use Paddle Fluid directly to train
It is the most reliable method and is recommended
### 2. Transform Caffe to Paddle Fluid model
[https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid](https://github.com/PaddlePaddle/models/tree/develop/fluid/image_classification/caffe2fluid)
### 3. ONNX
ONNX stands for Open Neural Network Exchange. The project aims to enable full interoperability among different neural network development frameworks.
Besides directly using fluid models trained by PaddlePaddle, you can also get certain Paddle fluid models through ONNX conversion.
At present, work to support ONNX is also underway at Baidu. The related conversion project can be found here:
[https://github.com/PaddlePaddle/paddle-onnx](https://github.com/PaddlePaddle/paddle-onnx)
### 4. Download parts of testing models and testing pictures
[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
## 模型获得 ## 模型获得
目前Paddle-Mobile仅支持Paddle fluid训练的模型。如果你手中的模型是不同种类的模型,需要进行模型转换才可以运行。 目前Paddle-Mobile仅支持Paddle fluid训练的模型。如果你手中的模型是不同种类的模型,需要进行模型转换才可以运行。
...@@ -64,6 +114,22 @@ ONNX全称为“Open Neural Network Exchange”,即“开放的神经网络切 ...@@ -64,6 +114,22 @@ ONNX全称为“Open Neural Network Exchange”,即“开放的神经网络切
### 4. 部分测试模型和测试图片下载 ### 4. 部分测试模型和测试图片下载
[http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip) [http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip](http://mms-graph.bj.bcebos.com/paddle-mobile%2FmodelsAndImages.zip)
<!--## Online output of simple search
Gif as following is the application output of online main part detection of simple search app
![ezgif-1-050a733dfb](http://otkwwi4x8.bkt.clouddn.com/2018-07-05-ezgif-1-050a733dfb.gif)-->
## Ask Question
You are welcome to raise questions or help solve problems. You can post your question in our issue tracker on GitHub: [GitHub Issues](https://github.com/PaddlePaddle/paddle-mobile/issues).
## Copyright and License
Paddle-Mobile is provided under the relatively permissive Apache-2.0 open source license: [Apache-2.0 license](LICENSE).
## Old version Mobile-Deep-Learning
The original MDL (Mobile-Deep-Learning) project has been transferred to [Mobile-Deep-Learning](https://github.com/allonli/mobile-deep-learning)
<!--## 简单搜索线上效果 <!--## 简单搜索线上效果
如下gif是简单搜索app的线上主体检测应用效果 如下gif是简单搜索app的线上主体检测应用效果
......
...@@ -24,8 +24,6 @@ namespace fpga { ...@@ -24,8 +24,6 @@ namespace fpga {
#define USE_RELU 1 #define USE_RELU 1
#define USE_BIAS 2 #define USE_BIAS 2
int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); }
void format_image(framework::Tensor *image_tensor) { void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims(); auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3]; auto channel = dims[1], height = dims[2], width = dims[3];
...@@ -96,10 +94,6 @@ int get_aligned_filter_element_num(int chw) { ...@@ -96,10 +94,6 @@ int get_aligned_filter_element_num(int chw) {
return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
} }
int get_aligned_filter_num(int num) {
return align_to_x(num, FILTER_NUM_ALIGNMENT);
}
void format_filter(framework::Tensor *filter_tensor, float max_value, void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) { int group_num) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
...@@ -177,46 +171,37 @@ void format_concat_output(framework::Tensor *out, int height, int width, ...@@ -177,46 +171,37 @@ void format_concat_output(framework::Tensor *out, int height, int width,
void expand_conv_arg(ConvArgs *arg) { void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg; ConvArgs args = *arg;
uint64_t filterlen = (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height * auto fpga_bias_scale_len =
(uint64_t)args.image.channels;
filterlen = align_to_x(filterlen, FILTER_ELEMENT_ALIGNMENT);
filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGNMENT);
uint64_t fpga_bias_scale_len =
align_to_x(args.filter_num / args.group_num, 8) * args.group_num; align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
uint64_t output_height = auto output_height =
(args.image.height + args.image.pad_height * 2 - args.kernel.height) / (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h + args.kernel.stride_h +
1; 1;
uint64_t output_width = auto output_width =
(args.image.width + args.image.pad_width * 2 - args.kernel.width) / (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w + args.kernel.stride_w +
1; 1;
uint64_t output_size =
output_height * output_width * (uint64_t)args.filter_num; auto filter_per_group = args.filter_num / args.group_num;
auto channel_per_group = args.image.channels / args.group_num;
auto filter_per_group = (uint64_t)(args.filter_num / args.group_num);
auto channel_per_group = (uint64_t)(args.image.channels / args.group_num); auto image_row_count = args.image.width * args.image.channels;
auto image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
uint64_t image_row_count = ((uint64_t)args.image.width) * auto image_one_pad_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT) +
((uint64_t)args.image.channels); // without align args.image.pad_width * args.image.channels;
uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT); auto filter_amount_all =
uint64_t image_one_pad_per_row = align_to_x(args.kernel.height * args.kernel.width * channel_per_group,
align_to_x(image_row_count, IMAGE_ALIGNMENT) +
((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
uint64_t filter_amount_all =
align_to_x(((uint64_t)args.kernel.height) *
((uint64_t)args.kernel.width) * channel_per_group,
FILTER_ELEMENT_ALIGNMENT); FILTER_ELEMENT_ALIGNMENT);
uint64_t output_amount_per_row = auto output_amount_per_row =
align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGNMENT); align_to_x(output_width * args.filter_num, IMAGE_ALIGNMENT);
// find the opt partition strategy // find the opt partition strategy
uint64_t res_win; uint64_t res_win;
uint64_t res_fit = 0; uint64_t res_fit = 0;
for (res_win = 1; res_win <= output_width; res_win = res_win + 1) { for (res_win = 1; res_win <= output_width; res_win++) {
if ((align_to_x( if ((align_to_x(
(args.image.channels * (args.image.channels *
(args.kernel.width + (res_win - 1) * args.kernel.stride_w)), (args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
...@@ -238,48 +223,48 @@ void expand_conv_arg(ConvArgs *arg) { ...@@ -238,48 +223,48 @@ void expand_conv_arg(ConvArgs *arg) {
} }
res_fit = res_win; res_fit = res_win;
uint64_t block_num = (output_width + res_fit - 1) / res_fit; auto block_num = (output_width + res_fit - 1) / res_fit;
uint64_t block_len = res_fit; auto block_len = res_fit;
uint64_t block_last = output_width - res_fit * (block_num - 1); auto block_last = output_width - res_fit * (block_num - 1);
uint64_t res_amount_per_row = output_width * args.filter_num; auto res_amount_per_row = output_width * args.filter_num;
uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; auto res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
uint64_t image_block_amount_per_row = auto image_block_amount_per_row =
args.kernel.stride_w * (res_fit)*args.image.channels; args.kernel.stride_w * res_fit * args.image.channels;
uint64_t filter_pad_width_mul_channel = auto filter_pad_width_mul_channel =
args.image.pad_width * args.image.channels; args.image.pad_width * args.image.channels;
uint64_t image_amount_per_row_multi_win_first = auto image_amount_per_row_multi_win_first =
image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height); image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
uint64_t image_amount_per_row_multi_win = auto image_amount_per_row_multi_win =
image_amount_per_row * (4 * args.kernel.stride_h); image_amount_per_row * (4 * args.kernel.stride_h);
uint64_t image_block_num = block_num; auto image_block_num = block_num;
uint64_t image_block_len = auto image_block_len =
align_to_x((args.image.channels * align_to_x((args.image.channels *
(args.kernel.width + (block_len - 1) * args.kernel.stride_w)), (args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) / IMAGE_ALIGNMENT) /
16 + 16 +
1; 1;
uint64_t image_block_len_last = auto image_block_len_last =
align_to_x( align_to_x(
(args.image.channels * (args.image.channels *
(args.kernel.width + (block_last - 1) * args.kernel.stride_w)), (args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) / IMAGE_ALIGNMENT) /
16 + 16 +
1; 1;
uint64_t image_win_cnt = block_len; auto image_win_cnt = block_len;
uint64_t image_win_cnt_last = block_last; auto image_win_cnt_last = block_last;
uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8; auto res_row_data_align4_pad = res_amount_per_row_pad / 8;
uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1; auto prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
if (prog_full_cnt == 1023) { if (prog_full_cnt == 1023) {
prog_full_cnt--; prog_full_cnt--;
} }
uint64_t post_prog_full_cnt = auto post_prog_full_cnt =
(512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
: 0; : 0;
uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
(*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
(*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
...@@ -449,7 +434,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -449,7 +434,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->sub_conv_num = (uint32_t)stride_h; arg->sub_conv_num = (uint32_t)stride_h;
arg->filter_num = (uint32_t)filter->dims()[0]; arg->filter_num = (uint32_t)filter->dims()[0];
int sub_conv_num = arg->sub_conv_num; int sub_conv_num = arg->sub_conv_num;
int sub_stride = 1;
int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],
padding_w, stride_w); padding_w, stride_w);
int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis( int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis(
...@@ -466,16 +450,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -466,16 +450,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
stride_w, (int)filter->dims()[3], padding_w); stride_w, (int)filter->dims()[3], padding_w);
arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs)); arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));
int sub_channels = (int)input->dims()[1]; auto sub_channels = (int)input->dims()[1];
int omit_size = arg->omit_size;
int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
int sub_filter_num = sub_conv_num * (arg->filter_num); int sub_filter_num = sub_conv_num * (arg->filter_num);
int conv_output_size = int conv_output_size =
(align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) * (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) *
sub_output_height; sub_output_height;
int ouput_size = conv_output_size * sub_conv_num;
int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT); int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT);
int align_sub_filter_count = int align_sub_filter_count =
...@@ -485,7 +465,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -485,7 +465,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
align_sub_filter_count * align_sub_filter_num; align_sub_filter_count * align_sub_filter_num;
for (int i = 0; i < sub_conv_num; ++i) { for (int i = 0; i < sub_conv_num; ++i) {
arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num); arg->conv_args[i].filter_num = arg->sub_conv_num * arg->filter_num;
arg->conv_args[i].group_num = (uint32_t)group_num; arg->conv_args[i].group_num = (uint32_t)group_num;
arg->conv_args[i].filter_scale_address = filter->scale; arg->conv_args[i].filter_scale_address = filter->scale;
...@@ -496,7 +476,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -496,7 +476,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->conv_args[i].kernel.stride_w = 1; arg->conv_args[i].kernel.stride_w = 1;
arg->conv_args[i].kernel.stride_h = 1; arg->conv_args[i].kernel.stride_h = 1;
// DeconvParam.conv_args[i].image.address = (void*)ptr_image;
arg->conv_args[i].image.scale_address = input->scale; arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.channels = (uint32_t)sub_channels; arg->conv_args[i].image.channels = (uint32_t)sub_channels;
arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
...@@ -504,30 +483,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -504,30 +483,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->conv_args[i].image.pad_width = (uint32_t)sub_pad; arg->conv_args[i].image.pad_width = (uint32_t)sub_pad;
arg->conv_args[i].image.pad_height = (uint32_t)sub_pad; arg->conv_args[i].image.pad_height = (uint32_t)sub_pad;
arg->conv_args[i].image.address = input_ptr; arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].sb_address = (void *)bs_ptr; arg->conv_args[i].sb_address = bs_ptr;
auto filter_sub_space = auto filter_sub_space =
(char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char)); (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
fpga_copy(filter_sub_space, fpga_copy(filter_sub_space,
(char *)filter_ptr + i * align_conv_sub_filter_count, (char *)filter_ptr + i * align_conv_sub_filter_count,
(size_t)align_conv_sub_filter_count); (size_t)align_conv_sub_filter_count);
arg->conv_args[i].filter_address = (void *)(filter_sub_space); arg->conv_args[i].filter_address = filter_sub_space;
fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count); fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count);
if (sub_conv_num == 1) { if (sub_conv_num == 1) {
arg->conv_args[i].output.address = out_ptr; arg->conv_args[i].output.address = out_ptr;
arg->conv_args[i].output.scale_address = out->scale; arg->conv_args[i].output.scale_address = out->scale;
} else { } else {
auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); auto ptr_output = fpga_malloc(conv_output_size * sizeof(half));
arg->conv_args[i].output.address = (void *)((half *)ptr_output); arg->conv_args[i].output.address = ptr_output;
auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.scale_address = ptr_output_scale; arg->conv_args[i].output.scale_address = ptr_output_scale;
} }
expand_conv_arg(&arg->conv_args[i]);
} }
arg->output.address = out_ptr; arg->output.address = out_ptr;
arg->output.scale_address = out->scale; arg->output.scale_address = out->scale;
// fpga_free(filter_ptr); filter->reset_data_ptr(nullptr);
} // fill_deconv_arg } // fill_deconv_arg
} // namespace fpga } // namespace fpga
......
...@@ -21,7 +21,6 @@ limitations under the License. */ ...@@ -21,7 +21,6 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
int get_align_image_cw(int cw);
void format_image(framework::Tensor* image_tensor); void format_image(framework::Tensor* image_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp32_ofm(framework::Tensor* ofm_tensor); void format_fp32_ofm(framework::Tensor* ofm_tensor);
...@@ -30,7 +29,6 @@ float filter_find_max(framework::Tensor* filter_tensor); ...@@ -30,7 +29,6 @@ float filter_find_max(framework::Tensor* filter_tensor);
int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num);
int get_plit_num(framework::Tensor* filter_tensor); int get_plit_num(framework::Tensor* filter_tensor);
int get_aligned_filter_element_num(int chw); int get_aligned_filter_element_num(int chw);
int get_aligned_filter_num(int num);
void format_filter(framework::Tensor* filter_tensor, float max_value, void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num); int group_num);
void format_fc_filter(framework::Tensor* filter_tensor, float max_value); void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
......
...@@ -40,10 +40,9 @@ inverse kernel weights of each channel for every filter ...@@ -40,10 +40,9 @@ inverse kernel weights of each channel for every filter
void deconv_inverse_filter(float** data_in, int num, int channel, int width, void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height) { int height) {
float* tmp = *data_in; float* tmp = *data_in;
// float fix_range = 127;// float scale = fix_range / max;
int data_size = num * channel * width * height; int data_size = num * channel * width * height;
int hw_len = height * width; int hw_len = height * width;
float* tmp_data = (float*)fpga_malloc(data_size * sizeof(float)); auto tmp_data = (float*)fpga_malloc(data_size * sizeof(float));
for (int i = 0; i < num; ++i) { for (int i = 0; i < num; ++i) {
for (int j = 0; j < channel; ++j) { for (int j = 0; j < channel; ++j) {
for (int k = 0; k < hw_len; ++k) { for (int k = 0; k < hw_len; ++k) {
...@@ -52,7 +51,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width, ...@@ -52,7 +51,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
} }
} }
} }
*data_in = (float*)tmp_data; // *data_in = tmp_data;
fpga_free(tmp); fpga_free(tmp);
} }
...@@ -61,8 +60,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width, ...@@ -61,8 +60,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
*/ */
int deconv_calc_sub_pad(int filter_axis, int pad, int stride) { int deconv_calc_sub_pad(int filter_axis, int pad, int stride) {
if (stride == 0 || ((filter_axis - pad - 1) < 0)) { if (stride == 0 || ((filter_axis - pad - 1) < 0)) {
// error PADDLE_MOBILE_ENFORCE(false, "Wrong deconv parameters");
return 0;
} }
return (filter_axis - pad - 1) / stride; return (filter_axis - pad - 1) / stride;
} }
...@@ -79,11 +77,8 @@ int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) { ...@@ -79,11 +77,8 @@ int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
position. so the omit rows or columns is (stride - ) position. so the omit rows or columns is (stride - )
*/ */
int deconv_get_omit(int stride, int filter_width, int pad) { int deconv_get_omit(int stride, int filter_width, int pad) {
if (((filter_width - pad) <= 0)) { // ((filter_width-pad) > stride) || PADDLE_MOBILE_ENFORCE(filter_width > pad, "Wrong deconv parameters");
// error int idx;
return 0;
}
int idx = 1;
bool flag = false; bool flag = false;
for (idx = 1; idx <= stride; ++idx) { for (idx = 1; idx <= stride; ++idx) {
int j = idx; int j = idx;
...@@ -102,10 +97,6 @@ int deconv_get_omit(int stride, int filter_width, int pad) { ...@@ -102,10 +97,6 @@ int deconv_get_omit(int stride, int filter_width, int pad) {
return (stride - idx); return (stride - idx);
} }
int deconv_get_sub_filter_num(int filter_num, int stride) {
return filter_num * stride;
}
void deconv_get_sub_filter(char** data_in, int height, int width, void deconv_get_sub_filter(char** data_in, int height, int width,
int sub_conv_n, int kernel_num, int channel) { int sub_conv_n, int kernel_num, int channel) {
char* ptr_tmp = *data_in; char* ptr_tmp = *data_in;
...@@ -245,7 +236,6 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, ...@@ -245,7 +236,6 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset * char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset *
sizeof(char)); // continuous space sizeof(char)); // continuous space
for (int i = 0; i < sub_conv_n; ++i) { for (int i = 0; i < sub_conv_n; ++i) {
int offset = i * origin_offset;
char* ptr_tmp = (ptr_ptr_data)[i]; char* ptr_tmp = (ptr_ptr_data)[i];
filter::align_element(&ptr_tmp, sub_num, sub_chw); filter::align_element(&ptr_tmp, sub_num, sub_chw);
......
...@@ -21,7 +21,6 @@ namespace deconv_filter { ...@@ -21,7 +21,6 @@ namespace deconv_filter {
void deconv_inverse_filter(float** data_in, int num, int channel, int width, void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height); int height);
int deconv_calc_sub_pad(int filter_axis, int pad, int stride); int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
int deconv_get_sub_filter_num(int filter_num, int stride);
int deconv_get_sub_filter_axis(int filter_axis, int stride); int deconv_get_sub_filter_axis(int filter_axis, int stride);
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis); int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
int deconv_get_omit(int stride, int filter_width, int pad); int deconv_get_omit(int stride, int filter_width, int pad);
......
此差异已折叠。
...@@ -153,10 +153,6 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { ...@@ -153,10 +153,6 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
unsigned int nr = (unsigned int)_nr; unsigned int nr = (unsigned int)_nr;
int ret = 0; int ret = 0;
DLOG << size;
DLOG << _nr;
DLOG << nr;
uint64_t a_size = FPGA_PAGE_SIZE * nr; uint64_t a_size = FPGA_PAGE_SIZE * nr;
DLOG << a_size; DLOG << a_size;
...@@ -283,7 +279,7 @@ int fpga_memory_add() { ...@@ -283,7 +279,7 @@ int fpga_memory_add() {
return 0; return 0;
} }
uint64_t vaddr_to_paddr(void *address) { uint64_t vaddr_to_paddr_driver(void *address) {
uint64_t paddr = 0; uint64_t paddr = 0;
auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address); auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
...@@ -315,7 +311,7 @@ void *fpga_reg_free(void *ptr) { ...@@ -315,7 +311,7 @@ void *fpga_reg_free(void *ptr) {
g_fpgainfo.fpga_addr2size_map.erase(iter); g_fpgainfo.fpga_addr2size_map.erase(iter);
munmap(ptr, size); munmap(ptr, size);
} else { } else {
DLOG << "Invalid pointer"; DLOG << "Invalid pointer" << ptr;
} }
} }
...@@ -347,7 +343,7 @@ void fpga_free_driver(void *ptr) { ...@@ -347,7 +343,7 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo.fpga_addr2size_map.erase(iter); g_fpgainfo.fpga_addr2size_map.erase(iter);
munmap(ptr, size); munmap(ptr, size);
p_addr = vaddr_to_paddr(ptr); p_addr = vaddr_to_paddr_driver(ptr);
pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE; pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
/*clear bitmap*/ /*clear bitmap*/
...@@ -361,7 +357,7 @@ void fpga_free_driver(void *ptr) { ...@@ -361,7 +357,7 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo.fpga_vaddr2paddr_map.erase(iter); g_fpgainfo.fpga_vaddr2paddr_map.erase(iter);
} }
} else { } else {
DLOG << "Invalid pointer"; DLOG << "Invalid pointer" << ptr;
} }
} }
...@@ -373,7 +369,7 @@ int fpga_flush_driver(void *address, size_t size) { ...@@ -373,7 +369,7 @@ int fpga_flush_driver(void *address, size_t size) {
struct MemoryCacheArgs args; struct MemoryCacheArgs args;
uint64_t p_addr; uint64_t p_addr;
p_addr = vaddr_to_paddr(address); p_addr = vaddr_to_paddr_driver(address);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT
args.size = size; args.size = size;
...@@ -385,7 +381,7 @@ int fpga_invalidate_driver(void *address, size_t size) { ...@@ -385,7 +381,7 @@ int fpga_invalidate_driver(void *address, size_t size) {
struct MemoryCacheArgs args; struct MemoryCacheArgs args;
uint64_t p_addr; uint64_t p_addr;
p_addr = vaddr_to_paddr(address); p_addr = vaddr_to_paddr_driver(address);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT
args.size = size; args.size = size;
......
...@@ -31,8 +31,8 @@ namespace driver { ...@@ -31,8 +31,8 @@ namespace driver {
#define FPGA_REG_PHY_ADDR 0xa0000000 #define FPGA_REG_PHY_ADDR 0xa0000000
#define FPGA_REG_SIZE 0x1000 #define FPGA_REG_SIZE 0x1000
#define FPGA_MEM_PHY_ADDR 0x20000000 #define FPGA_MEM_PHY_ADDR 0x40000000
#define FPGA_MEM_SIZE 0x20000000 #define FPGA_MEM_SIZE 0x80000000
#define FPGA_PAGE_SIZE (16UL * 1024UL) #define FPGA_PAGE_SIZE (16UL * 1024UL)
...@@ -122,15 +122,11 @@ void *fpga_malloc_driver(size_t size); ...@@ -122,15 +122,11 @@ void *fpga_malloc_driver(size_t size);
void fpga_free_driver(void *ptr); void fpga_free_driver(void *ptr);
void fpga_copy_driver(void *dest, const void *src, size_t num);
int fpga_flush_driver(void *address, size_t size); int fpga_flush_driver(void *address, size_t size);
int fpga_invalidate_driver(void *address, size_t size); int fpga_invalidate_driver(void *address, size_t size);
/*pe*/ uint64_t vaddr_to_paddr_driver(void *address);
uint64_t vaddr_to_paddr(void *address);
int fpga_regpoll(uint64_t reg, uint64_t val, int time); int fpga_regpoll(uint64_t reg, uint64_t val, int time);
......
...@@ -115,7 +115,7 @@ int fpga_invalidate(void *address, size_t size) { ...@@ -115,7 +115,7 @@ int fpga_invalidate(void *address, size_t size) {
} }
uint64_t vaddr_to_paddr(void *address) { uint64_t vaddr_to_paddr(void *address) {
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
return driver::vaddr_to_paddr(address); return driver::vaddr_to_paddr_driver(address);
#else #else
return 0; return 0;
#endif #endif
......
...@@ -37,6 +37,18 @@ enum LayoutType { ...@@ -37,6 +37,18 @@ enum LayoutType {
LAYOUT_HWC = 0, LAYOUT_HWC = 0,
}; };
enum ActivationType {
NONE = 0,
LEAKYRELU = 1,
SIGMOID = 2,
TANH = 3,
};
struct ActivationArgs {
enum ActivationType activation_type;
int16_t leaky_relu_negative_slope;
};
struct KernelArgs { struct KernelArgs {
uint32_t width; uint32_t width;
uint32_t height; uint32_t height;
...@@ -58,7 +70,10 @@ struct ImageOutputArgs { ...@@ -58,7 +70,10 @@ struct ImageOutputArgs {
void* address; // output result address; void* address; // output result address;
float* scale_address; // output scale address; float* scale_address; // output scale address;
uint64_t timer_cnt; // time counter for FPGA computation uint64_t timer_cnt; // time counter for FPGA computation
struct ActivationArgs
activation; // To select activation and specify (Leaky)Relu parameter.
}; };
#ifdef PADDLE_MOBILE_FPGA_V1 #ifdef PADDLE_MOBILE_FPGA_V1
struct ConvDriverParam { struct ConvDriverParam {
uint64_t image_address_phy; uint64_t image_address_phy;
...@@ -198,7 +213,11 @@ struct DeconvArgs { ...@@ -198,7 +213,11 @@ struct DeconvArgs {
struct ConvArgs* conv_args; struct ConvArgs* conv_args;
}; };
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// }
static inline uint32_t align_to_x(int64_t num, int64_t x) {
return ((uint32_t)(num + x) - 1) / (uint32_t)x * (uint32_t)x;
}
int16_t fp32_2_fp16(float fp32_num); int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num); float fp16_2_fp32(int16_t fp16_num);
......
...@@ -456,9 +456,8 @@ void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput, ...@@ -456,9 +456,8 @@ void Executor<Device, T>::LoadMemory(const VarDesc var_desc, float *tensorInput,
char **data) {} char **data) {}
template <> template <>
void Executor<GPU_CL, Precision::FP32>::LoadMemory(const VarDesc var_desc, void Executor<GPU_CL, float>::LoadMemory(const VarDesc var_desc,
float *tensorInput, float *tensorInput, char **data) {
char **data) {
// 1. version // 1. version
uint32_t version = *reinterpret_cast<uint32_t *>(*data); uint32_t version = *reinterpret_cast<uint32_t *>(*data);
......
...@@ -202,50 +202,50 @@ double PaddleMobile<CPU, float>::GetPredictTime() { ...@@ -202,50 +202,50 @@ double PaddleMobile<CPU, float>::GetPredictTime() {
#endif #endif
#ifdef PADDLE_MOBILE_FPGA #ifdef PADDLE_MOBILE_FPGA
template <typename Device, T P> template <typename Device, typename T>
void PaddleMobile<Device, P>::InjectVariable(const framework::Tensor &t, void PaddleMobile<Device, T>::InjectVariable(const framework::Tensor &t,
std::string var_name) { std::string var_name) {
executor_->InjectVariable(t, var_name); executor_->InjectVariable(t, var_name);
} }
template <typename Device, T P> template <typename Device, typename T>
void PaddleMobile<Device, P>::FeedData(const framework::Tensor &t) { void PaddleMobile<Device, T>::FeedData(const framework::Tensor &t) {
executor_->FeedData(t); executor_->FeedData(t);
} }
template <typename Device, T P> template <typename Device, typename T>
std::shared_ptr<framework::Tensor> PaddleMobile<Device, P>::FetchResult( std::shared_ptr<framework::Tensor> PaddleMobile<Device, T>::FetchResult(
int id) { int id) {
return executor_->FetchResult(id); return executor_->FetchResult(id);
} }
template <typename Device, T P> template <typename Device, typename T>
void PaddleMobile<Device, P>::Predict_From_To(int start, int end) { void PaddleMobile<Device, T>::Predict_From_To(int start, int end) {
executor_->Predict_From_To(start, end); executor_->Predict_From_To(start, end);
} }
template <typename Device, T P> template <typename Device, typename T>
void PaddleMobile<Device, P>::Predict_From(int start) { void PaddleMobile<Device, T>::Predict_From(int start) {
executor_->Predict_From(start); executor_->Predict_From(start);
} }
template <typename Device, T P> template <typename Device, typename T>
void PaddleMobile<Device, P>::Predict_To(int end) { void PaddleMobile<Device, T>::Predict_To(int end) {
executor_->Predict_To(end); executor_->Predict_To(end);
} }
#endif #endif
#ifdef PADDLE_MOBILE_CL #ifdef PADDLE_MOBILE_CL
static std::mutex lc; static std::mutex lc;
template <typename Device, T P> template <typename Device, typename T>
void PaddleMobile<Device, P>::SetCLPath(std::string path) { void PaddleMobile<Device, T>::SetCLPath(std::string path) {
std::lock_guard<std::mutex> lock(lc); std::lock_guard<std::mutex> lock(lc);
if (framework::CLEngine::Instance()->GetCLPath() == "") { if (framework::CLEngine::Instance()->GetCLPath() == "") {
framework::CLEngine::Instance()->setClPath(path); framework::CLEngine::Instance()->setClPath(path);
} }
} }
template <> template <>
double PaddleMobile<GPU_CL, T::FP32>::GetPredictTime() { double PaddleMobile<GPU_CL, float>::GetPredictTime() {
cl_int status; cl_int status;
cl_uint nPlatform; cl_uint nPlatform;
clGetPlatformIDs(0, NULL, &nPlatform); clGetPlatformIDs(0, NULL, &nPlatform);
...@@ -443,8 +443,8 @@ double PaddleMobile<GPU_CL, T::FP32>::GetPredictTime() { ...@@ -443,8 +443,8 @@ double PaddleMobile<GPU_CL, T::FP32>::GetPredictTime() {
return -1; return -1;
} }
} }
template <typename Device, T P> template <typename Device, typename T>
int PaddleMobile<Device, P>::readText( int PaddleMobile<Device, T>::readText(
const char *kernelPath, const char *kernelPath,
char **pcode) { // 读取文本文件放入 pcode,返回字符串长度 char **pcode) { // 读取文本文件放入 pcode,返回字符串长度
FILE *fp; FILE *fp;
......
...@@ -14,6 +14,7 @@ limitations under the License. */ ...@@ -14,6 +14,7 @@ limitations under the License. */
#include "operators/kernel/feed_kernel.h" #include "operators/kernel/feed_kernel.h"
#include "framework/cl/cl_tensor.h" #include "framework/cl/cl_tensor.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
...@@ -43,8 +44,8 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) { ...@@ -43,8 +44,8 @@ void FeedKernel<GPU_CL, float>::Compute(const FeedParam<GPU_CL> &param) {
const int Stride2 = out_C * out_H * out_W; const int Stride2 = out_C * out_H * out_W;
const int Stride1 = out_H * out_W; const int Stride1 = out_H * out_W;
const int Stride0 = out_W; const int Stride0 = out_W;
CLTensor input_cl_tensor(this->cl_helper_.CLContext(), framework::CLTensor input_cl_tensor(this->cl_helper_.CLContext(),
this->cl_helper_.CLCommandQueue()); this->cl_helper_.CLCommandQueue());
input_cl_tensor.Resize(input->dims()); input_cl_tensor.Resize(input->dims());
cl_mem inputBuffer = input_cl_tensor.mutable_with_data<float>(input_data); cl_mem inputBuffer = input_cl_tensor.mutable_with_data<float>(input_data);
......
...@@ -94,8 +94,9 @@ void FusionFcCompute(const FusionFcParam<GPU_CL> &param, cl_context context, ...@@ -94,8 +94,9 @@ void FusionFcCompute(const FusionFcParam<GPU_CL> &param, cl_context context,
memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes);
} }
math::MatMul<float>(x_matrix, false, y_matrix, false, static_cast<float>(1), math::MatMul<float, float>(x_matrix, false, y_matrix, false,
out, static_cast<float>(1), false); static_cast<float>(1), out, static_cast<float>(1),
false);
out_image->InitEmptyImage(context, commandQueue, out->dims()); out_image->InitEmptyImage(context, commandQueue, out->dims());
framework::TensorToCLImage(out, out_image, context, commandQueue, kernel1); framework::TensorToCLImage(out, out_image, context, commandQueue, kernel1);
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#ifdef TRANSPOSE2_OP #ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h" #include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
...@@ -3160,7 +3160,8 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda, ...@@ -3160,7 +3160,8 @@ void Gemm::Sgemm_omp(int m, int n, int k, float alpha, const float *A, int lda,
#endif #endif
// int L1 = 64 / max_threads * 1024; // int L1 = 64 / max_threads * 1024;
int L1 = 32 / max_threads * 1024; int L = (max_threads > 2) ? 64 : 32;
int L1 = L / max_threads * 1024;
KC = k; KC = k;
zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC)); zero = static_cast<float *>(paddle_mobile::memory::Alloc(sizeof(float) * KC));
memset(static_cast<void *>(zero), 0, sizeof(float) * KC); memset(static_cast<void *>(zero), 0, sizeof(float) * KC);
......
...@@ -213,8 +213,6 @@ if(NOT FOUND_MATCH) ...@@ -213,8 +213,6 @@ if(NOT FOUND_MATCH)
set(FUSION_CONVADD_OP ON) set(FUSION_CONVADD_OP ON)
set(FUSION_CONVADDPRELU_OP ON) set(FUSION_CONVADDPRELU_OP ON)
set(FUSION_CONVADDRELU_OP ON) set(FUSION_CONVADDRELU_OP ON)
# set(FUSION_CONVADDRELU_INT8_OP ON)
# set(FUSION_FC_INT8_OP ON)
set(FUSION_FC_OP ON) set(FUSION_FC_OP ON)
set(LRN_OP ON) set(LRN_OP ON)
set(MUL_OP ON) set(MUL_OP ON)
...@@ -316,9 +314,6 @@ endif() ...@@ -316,9 +314,6 @@ endif()
if (FUSION_CONVADDRELU_OP) if (FUSION_CONVADDRELU_OP)
add_definitions(-DFUSION_CONVADDRELU_OP) add_definitions(-DFUSION_CONVADDRELU_OP)
endif() endif()
if (FUSION_CONVADDRELU_INT8_OP)
add_definitions(-DFUSION_CONVADDRELU_INT8_OP)
endif()
if (FUSION_CONVADDPRELU_OP) if (FUSION_CONVADDPRELU_OP)
add_definitions(-DFUSION_CONVADDPRELU_OP) add_definitions(-DFUSION_CONVADDPRELU_OP)
endif() endif()
...@@ -328,9 +323,6 @@ endif() ...@@ -328,9 +323,6 @@ endif()
if (FUSION_FC_OP) if (FUSION_FC_OP)
add_definitions(-DFUSION_FC_OP) add_definitions(-DFUSION_FC_OP)
endif() endif()
if(FUSION_FC_INT8_OP)
add_definitions(-DFUSION_FC_INT8_OP)
endif()
if (LRN_OP) if (LRN_OP)
add_definitions(-DLRN_OP) add_definitions(-DLRN_OP)
endif() endif()
...@@ -490,7 +482,6 @@ if (FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP) ...@@ -490,7 +482,6 @@ if (FUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
# add_definitions(-DFUSION_DEQUANT_ADD_BN_RELU_QUANT_OP) # add_definitions(-DFUSION_DEQUANT_ADD_BN_RELU_QUANT_OP)
endif() endif()
if (TANH_OP) if (TANH_OP)
add_definitions(-DTANH_OP) add_definitions(-DTANH_OP)
endif() endif()
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册