Fix bugs in the concat, reshape and slice ops and move usleep into the fpga regpoll polling loop, test=develop

ca6eec78 · qnqinan · b816754c · ca6eec78 · ca6eec78 · ca6eec78
6 changed files
--- a/mobile/src/fpga/V2/image.cpp
+++ b/mobile/src/fpga/V2/image.cpp
@@ -110,6 +110,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
    }
  }
  fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t));
+  for (i = 0; i < image_num; i++) {
+    fpga_free(images_in_tmp[i]);
+  }
+  fpga_free(images_in_tmp);
 }
 void split_image(int8_t *image_in, void **images_out, int image_num,

--- a/mobile/src/fpga/common/driver.cpp
+++ b/mobile/src/fpga/common/driver.cpp
@@ -134,9 +134,9 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
  uint64_t i = 0;
  /*timeout精确性待确认*/
  int64_t timeout = time * 6;
-  usleep(1);
  for (i = 0; i < timeout; i++) {
+    usleep(1);
    if (val == reg_readq(reg)) {
      break;
    }

--- a/mobile/src/fpga/common/fpga_common.h
+++ b/mobile/src/fpga/common/fpga_common.h
@@ -211,6 +211,7 @@ struct ConcatArgs {
  uint32_t out_channel;
  uint32_t height;
  uint32_t width;
+  std::vector<std::shared_ptr<char>> vector_concat_space;
 };
 struct SplitConvArgs {

--- a/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp
+++ b/mobile/src/operators/kernel/fpga/V2/concat_kernel.cpp
@@ -53,6 +53,15 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
  concatArgs.channel_num = channel_num;
  concatArgs.height = height;
  concatArgs.width = width;
+  auto deleter = [](void *p) { fpga::fpga_free(p); };
+  concatArgs.vector_concat_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(concatArgs.images_in), deleter));
+  concatArgs.vector_concat_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(concatArgs.scales_in), deleter));
+  concatArgs.vector_concat_space.push_back(std::shared_ptr<char>(
+      reinterpret_cast<char *>(concatArgs.channel_num), deleter));
  param->SetFpgaArgs(concatArgs);
  return true;
 }

--- a/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
+++ b/mobile/src/operators/kernel/fpga/V2/reshape2_kernel.cpp
@@ -110,7 +110,27 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
    }
  }
  output->Resize(framework::make_ddim(shape));
+  bool reshapeNeedFlg = 1;
  if (output->dims() == input->dims()) {
+    reshapeNeedFlg = 0;
+  } else if (output->dims().size() != input->dims().size()) {
+    auto inputdimsize = input->dims().size();
+    auto outputdimsize = output->dims().size();
+    int smallersize =
+            inputdimsize > outputdimsize ? outputdimsize : inputdimsize;
+    int i = 0;
+    for (i = 0; i < smallersize; i++) {
+      if ((input->dims())[i] != (output->dims())[i])
+        break;
+    }
+    if (i == smallersize) {
+      reshapeNeedFlg = 0;
+    }
+  }
+  if (reshapeNeedFlg) {
+    reshape(input, output);
+  } else {
    DLOG << "No need to reshape";
    output->ShareDataWith(*input);
    framework::LoD lod = input->lod();
@@ -118,9 +138,6 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
    output->scale[0] = input->scale[0];
    return;
  }
-  reshape(input, output);
-  //
 }
 }  // namespace operators

--- a/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp
+++ b/mobile/src/operators/kernel/fpga/V2/slice_kernel.cpp
@@ -30,6 +30,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
  }
  return true;
 }
 template <>
 void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
  // Only support slicing in channel dimension
@@ -38,6 +39,8 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
  auto input = param.input_;
  auto output = param.output_;
+  int H = input->dims()[2];
+  int W = input->dims()[3];
  int HW = input->dims()[2] * input->dims()[3];
  int channel = input->dims()[1];
  auto input_ptr = input->data<int8_t>();
@@ -53,10 +56,32 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
  end = end > channel ? channel : end;
  int len = end - start;
  size_t size = len * sizeof(int8_t);
+  DLOG << input->fpga_data_num;
+  fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t));
+  DLOG << output->fpga_data_num;
+  fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t));
+  int unalignedWC = len * W;
+  int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT);
-  for (int i = 0; i < HW; i++) {
+  if (unalignedWC != alignedWC) {
-    memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
+      auto tmpOutput = reinterpret_cast<int8_t*>
+              (fpga::fpga_malloc(len*HW * sizeof(int8_t)));
+      for (int i = 0; i < HW; i++) {
+          memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size);
+      }
+      for (int i = 0; i < H; i++) {
+          for (int j = 0; j < unalignedWC; j++) {
+              *(output_ptr + alignedWC * i + j) =
+                      *(tmpOutput + unalignedWC * i + j);
+          }
+      }
+      fpga::fpga_free(tmpOutput);
+  } else {
+      for (int i = 0; i < HW; i++) {
+          memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
+      }
  }
+  fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t));
 }
 }  // namespace operators
 }  // namespace paddle_mobile