Fix some bugs in the FPGA V2 track and update the FPGA V2 PE code

7a8b998f · qnqinan · 344c1df7 · 7a8b998f · 7a8b998f · 7a8b998f
6 changed files
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -204,7 +204,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,

    arg->conv_arg[i].image.address = input_ptr;
    arg->conv_arg[i].image.scale_address = input->scale;
-    arg->conv_arg[i].image.channels = (uint32_t)input->dims()[1];
+    arg->conv_arg[i].image.channels =
+        (uint32_t)get_aligned_channel_num((int)(input->dims()[1]));  // NOLINT
    arg->conv_arg[i].image.height = (uint32_t)input->dims()[2];
    arg->conv_arg[i].image.width = (uint32_t)input->dims()[3];
    arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
@@ -216,7 +217,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
    int num_after_alignment = filter::calc_aligned_num(
        arg->filter_num, (int)input->dims()[1]);  // NOLINT
    arg->conv_arg[i].free_space =
-        fpga_malloc(num_after_alignment * 2 * sizeof(half));
+        fpga_malloc(num_after_alignment * 2 * sizeof(float));  // half
  }
 }


--- a/src/fpga/V2/filter.cpp
+++ b/src/fpga/V2/filter.cpp
@@ -16,7 +16,6 @@ limitations under the License. */
 #include <memory.h>
 #include <algorithm>
 #include "fpga/common/fpga_common.h"
-
 namespace paddle_mobile {
 namespace fpga {
 namespace filter {
@@ -88,12 +87,25 @@ void align_filter(float **data_in, int num, int channel, int height,
  *data_in = new_data;
  fpga_free(temp);
 }
-
+void convert_to_fp16(float **data_in, int data_size) {
+  // Remember the fp32 source buffer so it can be freed after conversion.
+  float *fp32_src = *data_in;
+  // Destination holds one 16-bit slot per element (data_size slots in total).
+  int16_t *fp16_dst =
+      (int16_t *)fpga_malloc(data_size * sizeof(int16_t));  // NOLINT
+  for (int idx = 0; idx < data_size; ++idx) {
+    fp16_dst[idx] = fp32_2_fp16(fp32_src[idx]);
+  }
+  // The caller's pointer stays typed float* but now refers to 16-bit data.
+  *data_in = (float *)fp16_dst;  // NOLINT
+  fpga_free(fp32_src);
+}
 void format_filter(float **data_in, int num, int channel, int height, int width,
                   int group_num, float max) {
  convert_to_hwc(data_in, num, channel, height, width);
  align_filter(data_in, num, channel, height, width);
  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  convert_to_fp16(data_in, pixel_num);
  fpga_flush(*data_in, pixel_num * sizeof(float));
 }

@@ -115,6 +127,7 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
  convert_fc_filter(data_in, num, chw);
  align_filter(data_in, num, channel, height, width);
  int pixel_num = calc_aligned_total_pixel_num(num, channel, height, width);
+  convert_to_fp16(data_in, pixel_num);
  fpga_flush(*data_in, pixel_num * sizeof(float));
 }


--- a/src/fpga/V2/pe.cpp
+++ b/src/fpga/V2/pe.cpp
--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -113,6 +113,12 @@ int fpga_invalidate(void *address, size_t size) {
  return 0;
 #endif
 }
-
+uint64_t vaddr_to_paddr(void *address) {
+#ifndef PADDLE_MOBILE_ZU5
+  return 0;  // driver lookup is only compiled for PADDLE_MOBILE_ZU5 builds
+#else   // PADDLE_MOBILE_ZU5
+  return driver::vaddr_to_paddr(address);  // forward to the driver layer
+#endif  // PADDLE_MOBILE_ZU5
+}
 }  // namespace fpga
 }  // namespace paddle_mobile
--- a/src/operators/kernel/fpga/V2/feed_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/feed_kernel.cpp
@@ -13,7 +13,7 @@ See the License for the specific language governing permissions and
 limitations under the License. */

 #include "operators/kernel/feed_kernel.h"
-
+#include "fpga/V2/filter.h"
 namespace paddle_mobile {
 namespace operators {

@@ -24,7 +24,6 @@ bool FeedKernel<FPGA, float>::Init(FeedParam<FPGA> *param) {
  fpga::format_fp16_ofm(output, aligned_channel);
  return true;
 }
-
 template <>
 void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
  auto input =
@@ -33,6 +32,9 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
  auto input_ptr = input->data<float>();
  Tensor *output = param.Out();
  auto output_ptr = output->data<float>();
+  auto channel = input->dims()[1];
+  uint32_t aligned_channels =
+      fpga::filter::calc_aligned_channel((int)channel);  // NOLINT

  fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};

@@ -41,7 +43,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
  args.input_layout_type = fpga::LAYOUT_CHW;
  args.output_layout_type = fpga::LAYOUT_HWC;
  args.image.address = reinterpret_cast<void *>(input_ptr);
-  args.image.channels = (uint32_t)input->dims()[1];
+  args.image.channels = aligned_channels;
  args.image.height = (uint32_t)input->dims()[2];
  args.image.width = (uint32_t)input->dims()[3];
  args.image.pad_height = 0;

--- a/src/operators/kernel/fpga/V2/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/softmax_kernel.cpp
@@ -25,7 +25,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  auto input_ptr = input->data<float>();
  auto float_input = new Tensor;
  float_input->mutable_data<float>({1, input->dims()[1]});
-  fpga::format_fp32_ofm(float_input, 8);
+  fpga::format_fp32_ofm(float_input, 1024);

  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
  args.input_layout_type = fpga::LAYOUT_HWC;