Commit 6d7f9277 authored by qnqinan, committed by GitHub

Merge pull request #1313 from zhangyang0701/develop

Add split and transpose kernels for FPGA track; close #1312
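For reference, the FPGA feature maps touched below are stored HWC: each row holds width * channel int16 values, padded up to IMAGE_ALIGNMENT, and split carves every pixel's channel vector into per-output slices. The self-contained sketch below illustrates that layout and the offset arithmetic; align_to_x is redeclared locally, and the alignment value of 16, the toy sizes, and the section sizes are assumptions made for the example, not values taken from this commit.

#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Local stand-in for the repo's align_to_x helper: round num up to a multiple of x.
static int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

int main() {
  const int kAlign = 16;                      // stand-in for IMAGE_ALIGNMENT
  const int height = 2, width = 3;
  const std::vector<int> channels = {2, 3};   // sections to split into
  int total_channel = 0;
  for (int c : channels) total_channel += c;

  // Source buffer: each row holds width * total_channel values, padded to kAlign.
  const int src_row = align_to_x(width * total_channel, kAlign);
  std::vector<int16_t> src(height * src_row, 0);
  for (size_t i = 0; i < src.size(); ++i) src[i] = static_cast<int16_t>(i);

  // One destination buffer per section, each with its own per-row alignment.
  std::vector<std::vector<int16_t>> dst;
  for (int c : channels)
    dst.emplace_back(height * align_to_x(width * c, kAlign), 0);

  // Walk every pixel and copy its channel slice into the matching output.
  for (int h = 0; h < height; ++h) {
    for (int w = 0; w < width; ++w) {
      int src_offset = h * src_row + w * total_channel;
      for (size_t i = 0; i < channels.size(); ++i) {
        int des_offset =
            h * align_to_x(width * channels[i], kAlign) + w * channels[i];
        std::memcpy(dst[i].data() + des_offset, src.data() + src_offset,
                    channels[i] * sizeof(int16_t));
        src_offset += channels[i];
      }
    }
  }
  // Channels 0..1 of pixel (0,0) land at the start of split 0: prints 0 and 1.
  std::cout << dst[0][0] << " " << dst[0][1] << std::endl;
  return 0;
}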
......@@ -111,6 +111,27 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
}
void split_image(int16_t *image_in, float *scale_in, void **images_out,
                 float **scales_out, int image_num, uint32_t *channel_nums,
                 int height, int width) {
  // Every output inherits the input scale; accumulate the total channel count.
  int total_channel = 0;
  for (int i = 0; i < image_num; i++) {
    scales_out[i][0] = scale_in[0];
    scales_out[i][1] = scale_in[1];
    total_channel += channel_nums[i];
  }
  // Layout is HWC with each (width * channel) row padded to IMAGE_ALIGNMENT,
  // so walk every pixel and copy its channel slice into the matching output.
  for (int h = 0; h < height; h++) {
    for (int w = 0; w < width; w++) {
      int src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) +
                       w * total_channel;
      for (int i = 0; i < image_num; i++) {
        int des_offset =
            h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
            w * channel_nums[i];
        memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset,
               channel_nums[i] * sizeof(int16_t));
        src_offset += channel_nums[i];
      }
    }
  }
}
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
......@@ -28,6 +28,9 @@ void concat_images(int16_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num,
int height,
int width); // Concat featuremaps along channel direction
void split_image(int16_t* image_in, float* scale_in, void** images_out,
float** scales_out, int image_num, uint32_t* channel_nums,
int height, int width);
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
......@@ -138,13 +138,11 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
DLOG << "=============ComputeFpgaConcat===========";
DLOG << " Image_num: " << args.image_num
<< " out_address:" << args.image_out
<< " out_scale_address:" << args.scale_out
<< " out_channel:" << args.out_channel;
<< " out_scale_address:" << args.scale_out;
DLOG << " image_height:" << args.height << " image_width:" << args.width;
for (int i = 0; i < args.image_num; i++) {
DLOG << " " << i << "th: ";
DLOG << " channel_num:" << args.channel_num[i]
<< " aligned_channel_num:" << args.aligned_channel_num[i]
<< " image_address:" << args.images_in[i]
<< " image_scale_address:" << args.scales_in[i];
}
......@@ -156,5 +154,25 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
return 0;
}
int ComputeFPGASplit(const struct SplitArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaSplit===========";
DLOG << " Image_num: " << args.image_num
<< " in_address:" << args.image_in
<< " in_scale_address:" << args.scale_in;
DLOG << " image_height:" << args.height << " image_width:" << args.width;
for (int i = 0; i < args.image_num; i++) {
DLOG << " " << i << "th: ";
DLOG << " channel_num:" << args.out_channel_nums[i]
<< " image_address:" << args.images_out[i]
<< " image_scale_address:" << args.scales_out[i];
}
#endif
image::split_image(args.image_in, args.scale_in, args.images_out,
args.scales_out, args.image_num, args.out_channel_nums,
args.height, args.width);
return 0;
}
} // namespace fpga
} // namespace paddle_mobile
......@@ -74,8 +74,19 @@ struct ConcatArgs {
void* image_out;
float* scale_out;
uint32_t* channel_num;
// uint32_t* aligned_channel_num;
// uint32_t out_channel;
uint32_t height;
uint32_t width;
};
struct SplitArgs {
uint32_t image_num;
int16_t* image_in;
float* scale_in;
void** images_out;
float** scales_out;
uint32_t* out_channel_nums;
uint32_t height;
uint32_t width;
};
......
......@@ -25,6 +25,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFpgaConv(const struct SplitConvArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
int ComputeFPGASplit(const struct SplitArgs& args);
} // namespace fpga
} // namespace paddle_mobile
......@@ -19,11 +19,45 @@ limitations under the License. */
namespace paddle_mobile {
namespace operators {
template <>
bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
auto *in = const_cast<Tensor *>(param->InputX());
auto outs = param->Outs();
auto sections = param->Sections();
int axis = param->Axis();
PADDLE_MOBILE_ENFORCE(axis == 1, "Only support split in channel dimension");
PADDLE_MOBILE_ENFORCE(outs.size() == sections.size(),
"Output number should be equal to section number");
auto image_num = (uint32_t)outs.size();
auto images_out =
reinterpret_cast<void **>(fpga::fpga_malloc(image_num * sizeof(void *)));
auto scales_out = reinterpret_cast<float **>(
fpga::fpga_malloc(image_num * sizeof(float *)));
auto out_channels = reinterpret_cast<uint32_t *>(
fpga::fpga_malloc(image_num * sizeof(uint32_t)));
for (int i = 0; i < image_num; i++) {
fpga::format_fp16_ofm(outs[i]);
images_out[i] = outs[i]->mutable_data<float>();
scales_out[i] = outs[i]->scale;
out_channels[i] = (uint32_t)sections[i];
}
fpga::SplitArgs arg = {0};
arg.image_num = image_num;
arg.image_in = (half *)in->data<float>();
arg.scale_in = in->scale;
arg.images_out = images_out;
arg.scales_out = scales_out;
arg.out_channel_nums = out_channels;
arg.height = (uint32_t)in->dims()[2];
arg.width = (uint32_t)in->dims()[3];
param->SetFpgaArgs(arg);
return true;
}
template <>
void SplitKernel<FPGA, float>::Compute(const SplitParam<FPGA> &param) {
fpga::ComputeFPGASplit(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
......
......@@ -21,6 +21,7 @@ namespace operators {
template <>
bool Transpose2Kernel<FPGA, float>::Init(Transpose2Param<FPGA> *param) {
param->Out()->ShareDataWith(*param->InputX());
return true;
}
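The FPGA Transpose2 kernel's Init above only aliases the output to the input via ShareDataWith, so no data is rearranged at this point. A minimal standalone sketch of what such sharing implies; the Buffer type and its shared_ptr storage are illustrative stand-ins, not the repo's Tensor:

#include <iostream>
#include <memory>
#include <vector>

// Hypothetical stand-in for a tensor: ShareDataWith makes two objects point at
// the same underlying storage instead of copying it.
struct Buffer {
  std::shared_ptr<std::vector<float>> data;
  void ShareDataWith(const Buffer &other) { data = other.data; }
};

int main() {
  Buffer in{std::make_shared<std::vector<float>>(4, 1.0f)};
  Buffer out;
  out.ShareDataWith(in);                     // analogous to the Init above
  (*in.data)[0] = 42.0f;                     // a later write through the input...
  std::cout << (*out.data)[0] << std::endl;  // ...is visible through the output
  return 0;
}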
......
......@@ -2421,6 +2421,15 @@ class SplitParam : public OpParam {
int num;
std::vector<int> sections;
// std::vector<GType> out_ts_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::SplitArgs fpga_split_args;
public:
const fpga::SplitArgs &FpgaArgs() const { return fpga_split_args; }
void SetFpgaArgs(const fpga::SplitArgs &args) { fpga_split_args = args; }
#endif
};
#endif
......