fix bugs of concat, reshape and slice op and add usleep in fpga regpoll,...

fix bugs of concat, reshape and slice op and add usleep in fpga regpoll, test=develop, close #2501 (#2502) * update proposal and psroipool kernel file in FPGA V2 track * update, test=develop * update FPGA v2 pe cpp file and ew kernel files, test=develop * fix a bug of sigmoid kernel in FPGA v2 track, test=develop * fix bugs of concat, reshape and slice op and add usleep in fpga regpoll, test=develop * add interrupt clear operation before op compute in FPGA V2 track, test=develop

fix bugs of concat, reshape and slice op and add usleep in fpga regpoll,...
fix bugs of concat, reshape and slice op and add usleep in fpga regpoll, test=develop, close #2501 (#2502) * update proposal and psroipool kernel file in FPGA V2 track * update, test=develop * update FPGA v2 pe cpp file and ew kernel files, test=develop * fix a bug of sigmoid kernel in FPGA v2 track, test=develop * fix bugs of concat, reshape and slice op and add usleep in fpga regpoll, test=develop * add interrupt clear operation before op compute in FPGA V2 track, test=develop
2c7af720 · qnqinan · Jiaying Zhao · ab362576 · 2c7af720 · 2c7af720
7 changed files
--- a/mobile/src/fpga/V2/image.cpp
+++ b/mobile/src/fpga/V2/image.cpp
@@ -83,11 +83,6 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
                    height *
                        align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
                        sizeof(int8_t));
-    for (j = 0;
-         j < height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
-         j++) {
-      images_in_tmp[i][j] = (int8_t)(images_in[i][j] * Ck + 0.5);
-    }
  }
  align_each_out_area_cw =
      align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT);
@@ -102,7 +97,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
        memcpy(
            (int8_t *)image_out + tmp_channel +  // NOLINT
                k * align_each_out_area_cw_differ,
-            images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw,
+            images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
            channel_num[i] * sizeof(int8_t));
        tmp_channel += channel_num[i];
@@ -110,6 +105,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
    }
  }
  fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t));
+  for (i = 0; i < image_num; i++) {
+    fpga_free(images_in_tmp[i]);
+  }
+  fpga_free(images_in_tmp);
 }
 void split_image(int8_t *image_in, void **images_out, int image_num,

--- a/mobile/src/fpga/V2/pe.cpp
+++ b/mobile/src/fpga/V2/pe.cpp
@@ -248,8 +248,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
  // DLOG << "   activation_type:" << active_args.activation_type
  //     << "   leaky_relu_negative_slope:"
  //     << active_args.leaky_relu_negative_slope;
-  // DLOG << "   reg_ActivationArgs:" << reg_ActivationArgs;
+  DLOG << "   reg_ActivationArgs:";
+  uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
    ret = -EIO;
@@ -257,6 +257,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
    return ret;
  }
+    // reg_writeq(reg_ActivationArgs,
+             // REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR);  // active functoion
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
  // new
  reg_writeq((args.driver.row_padding_down << 45) |
                 (args.driver.row_padding_up << 34) |
@@ -365,7 +369,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
  uint64_t cmd = 0;
  uint64_t image_physical_address = 0;
  uint64_t output_physical_address = 0;
+uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
  image_physical_address = vaddr_to_paddr(args.image.address);
  output_physical_address = vaddr_to_paddr(args.output.address);
  uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64);
@@ -440,7 +444,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
    return ret;
  }
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
  reg_writeq(image_physical_address, 0x808);
  reg_writeq(result_addr_row, 0x810);
  reg_writeq(kernel_padding_step, 0x818);
@@ -497,7 +501,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
 #endif
 #ifdef PADDLE_MOBILE_ZU5
  int ret = 0;
+uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
@@ -534,6 +538,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
  uint32_t* ew_scale = reinterpret_cast<uint32_t*>(&quantParam);
  uint64_t ew_scale_mult_factor = (*ew_scale) |
          ((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40);
+  reg_writeq(0ul, REG_SCALE_PARAMETER);
  reg_writeq(image_physical_address, 0x808);
  reg_writeq(result_addr_row, 0x810);
  reg_writeq(kernel_padding_step, 0x818);
@@ -928,6 +933,7 @@ int ComputeDWConv(const struct DWconvArgs &args) {
 #endif
 #ifdef PADDLE_MOBILE_ZU5
  DLOG << "DWConv";
+  uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
  // return 0;
  uint64_t timer_cnt = 0;
  int ret = 0;
@@ -1011,7 +1017,7 @@ int ComputeDWConv(const struct DWconvArgs &args) {
    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
    return ret;
  }
+  reg_writeq(0ul, REG_SCALE_PARAMETER);
  reg_writeq(image_physical_address, 0x808);
  reg_writeq(result_addr_row, 0x810);
  reg_writeq(kernel_padding_step, 0x818);

--- a/mobile/src/fpga/common/driver.cpp
+++ b/mobile/src/fpga/common/driver.cpp
@@ -134,9 +134,9 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
  uint64_t i = 0;
  /*timeout精确性待确认*/
  int64_t timeout = time * 6;
-  usleep(1);
  for (i = 0; i < timeout; i++) {
+    usleep(1);
    if (val == reg_readq(reg)) {
      break;
    }

--- a/mobile/src/fpga/common/fpga_common.h
+++ b/mobile/src/fpga/common/fpga_common.h
@@ -211,6 +211,7 @@ struct ConcatArgs {
  uint32_t out_channel;
  uint32_t height;
  uint32_t width;
+  std::vector<std::shared_ptr<char>> vector_concat_space;
 };
 struct SplitConvArgs {

--- a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp
+++ b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp
@@ -53,6 +53,15 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
  concatArgs.channel_num = channel_num;
  concatArgs.height = height;
  concatArgs.width = width;
+  auto deleter = [](void *p) { fpga::fpga_free(p); };
+  concatArgs.vector_concat_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(concatArgs.images_in), deleter));
+  concatArgs.vector_concat_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(concatArgs.scales_in), deleter));
+  concatArgs.vector_concat_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(concatArgs.channel_num), deleter));
  param->SetFpgaArgs(concatArgs);
  return true;
 }

--- a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
+++ b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
@@ -110,7 +110,27 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
    }
  }
  output->Resize(framework::make_ddim(shape));
+  bool reshapeNeedFlg = 1;
  if (output->dims() == input->dims()) {
+    reshapeNeedFlg = 0;
+  } else if (output->dims().size() != input->dims().size()) {
+    auto inputdimsize = input->dims().size();
+    auto outputdimsize = output->dims().size();
+    int smallersize =
+            inputdimsize > outputdimsize ? outputdimsize : inputdimsize;
+    int i = 0;
+    for (i = 0; i < smallersize; i++) {
+      if ((input->dims())[i] != (output->dims())[i])
+        break;
+    }
+    if (i == smallersize) {
+      reshapeNeedFlg = 0;
+    }
+  }
+  if (reshapeNeedFlg) {
+    reshape(input, output);
+  } else {
    DLOG << "No need to reshape";
    output->ShareDataWith(*input);
    framework::LoD lod = input->lod();
@@ -118,9 +138,6 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
    output->scale[0] = input->scale[0];
    return;
  }
-  reshape(input, output);
-  //
 }
 }  // namespace operators

--- a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp
+++ b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp
@@ -30,6 +30,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
  }
  return true;
 }
 template <>
 void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
  // Only support slicing in channel dimension
@@ -38,6 +39,8 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
  auto input = param.input_;
  auto output = param.output_;
+  int H = input->dims()[2];
+  int W = input->dims()[3];
  int HW = input->dims()[2] * input->dims()[3];
  int channel = input->dims()[1];
  auto input_ptr = input->data<int8_t>();
@@ -53,10 +56,32 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
  end = end > channel ? channel : end;
  int len = end - start;
  size_t size = len * sizeof(int8_t);
+  DLOG << input->fpga_data_num;
+  fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t));
+  DLOG << output->fpga_data_num;
+  fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t));
+  int unalignedWC = len * W;
+  int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT);
-  for (int i = 0; i < HW; i++) {
+  if (unalignedWC != alignedWC) {
-    memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
+      auto tmpOutput = reinterpret_cast<int8_t*>
+              (fpga::fpga_malloc(len*HW * sizeof(int8_t)));
+      for (int i = 0; i < HW; i++) {
+          memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size);
+      }
+      for (int i = 0; i < H; i++) {
+          for (int j = 0; j < unalignedWC; j++) {
+              *(output_ptr + alignedWC * i + j) =
+                      *(tmpOutput + unalignedWC * i + j);
+          }
+      }
+      fpga::fpga_free(tmpOutput);
+  } else {
+      for (int i = 0; i < HW; i++) {
+          memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
+      }
  }
+  fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t));
 }
 }  // namespace operators
 }  // namespace paddle_mobile