Fix op kernel bugs for FPGA tracks

46559e68 · zhangyang · 69e6edaf · 46559e68 · 46559e68 · 46559e68
7 changed files
--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -29,9 +29,7 @@ namespace fpga {

 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
-#ifdef PADDLE_MOBILE_OS_LINUX
 static std::map<void *, size_t> memory_map;
-#endif

 static inline int do_ioctl(int req, const void *arg) {
 #ifdef PADDLE_MOBILE_OS_LINUX
@@ -53,32 +51,38 @@ int open_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
  static uint64_t counter = 0;
-  counter += size;
-  DLOG << size << " bytes allocated. Total " << counter << " bytes";
+
 #ifdef PADDLE_MOBILE_OS_LINUX
  auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-  memory_map.insert(std::make_pair(ptr, size));
-  return ptr;
 #else
-  return malloc(size);
+  auto ptr = malloc(size);
 #endif
+  counter += size;
+  memory_map.insert(std::make_pair(ptr, size));
+  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
+       << counter << " bytes";
+  return ptr;
 }

 void fpga_free(void *ptr) {
-#ifdef PADDLE_MOBILE_OS_LINUX
  static uint64_t counter = 0;
  size_t size = 0;
+
  auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
  if (iter != memory_map.end()) {
    size = iter->second;
-    munmap(ptr, size);
    memory_map.erase(iter);
-  }
-  counter += size;
-  DLOG << size << " bytes freed. Total " << counter << " bytes";
+#ifdef PADDLE_MOBILE_OS_LINUX
+    munmap(ptr, size);
 #else
-  free(ptr);
+    free(ptr);
 #endif
+    counter += size;
+    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
+         << counter << " bytes";
+  } else {
+    DLOG << "Invalid pointer";
+  }
 }

 void fpga_copy(void *dest, const void *src, size_t num) {
@@ -211,7 +215,8 @@ int PerformBypass(const struct BypassArgs &args) {
 int ComputeFPGAConcat(const struct ConcatArgs &args) {
 #ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFpgaConcat===========";
-  DLOG << "   out_address:" << args.image_out
+  DLOG << "   Image_num: " << args.image_num
+       << "   out_address:" << args.image_out
       << "   out_scale_address:" << args.scale_out;
  DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
  for (int i = 0; i < args.image_num; i++) {
@@ -235,7 +240,7 @@ void format_image(framework::Tensor *image_tensor) {
  auto channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = image_tensor->data<float>();
  size_t memory_size = channel * height * width * sizeof(float);
-  float *new_data = (float *)fpga_malloc(memory_size);
+  auto new_data = (float *)fpga_malloc(memory_size);
  fpga_copy(new_data, data_ptr, memory_size);
  image::format_image(&new_data, channel, height, width);
  image_tensor->reset_data_ptr(new_data);
@@ -346,12 +351,12 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
  auto out_ptr = out->data<float>();

  arg->group_num = (uint32_t)group_num;
-  arg->split_num = (uint32_t)fpga::get_plit_num(filter);
+  // Either group_num or split_num = 1;
+  arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
  arg->filter_num = (uint32_t)filter->dims()[0];
  arg->output.address = out_ptr;
  arg->output.scale_address = out->scale;
-  arg->conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(arg->split_num *
-                                                       sizeof(fpga::ConvArgs));
+  arg->conv_args = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));

  arg->concat_arg.image_num = arg->split_num;
  arg->concat_arg.image_out = out_ptr;
@@ -360,15 +365,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
  arg->concat_arg.width = (uint32_t)filter->dims()[3];

  int n = arg->split_num;
-  arg->concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *));
-  arg->concat_arg.scales_in = (float **)fpga::fpga_malloc(n * sizeof(float *));
-  arg->concat_arg.channel_num =
-      (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
+  arg->concat_arg.images_in = (half **)fpga_malloc(n * sizeof(int *));
+  arg->concat_arg.scales_in = (float **)fpga_malloc(n * sizeof(float *));
+  arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t));
  arg->concat_arg.image_out = out_ptr;

  auto channel = (int)out->dims()[1];
-  int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num);
-  int element_num = fpga::get_aligned_filter_element_num(
+  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
+  int element_num = get_aligned_filter_element_num(
      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);

  for (int i = 0; i < n; i++) {
@@ -390,16 +394,17 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
        &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
    arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
    arg->conv_args[i].filter_num =
-        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
-                                    channel - (n - 1) * filter_num_per_div)
+        (uint32_t)(i == n - 1 ? channel - (n - 1) * filter_num_per_div
                              : filter_num_per_div);

    if (n > 1) {
      arg->conv_args[i].output.scale_address =
-          (float *)fpga::fpga_malloc(2 * sizeof(float));
-      arg->conv_args[i].output.address =
-          fpga::fpga_malloc(input->dims()[2] * input->dims()[3] *
-                            arg->conv_args[i].filter_num * sizeof(half));
+          (float *)fpga_malloc(2 * sizeof(float));
+      arg->conv_args[i].output.address = fpga_malloc(
+          input->dims()[2] *
+          align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
+                     IMAGE_ALIGNMENT) *
+          sizeof(half));
    }

    else {
@@ -408,7 +413,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
    }

    arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address;
-    arg->concat_arg.scales_in[i] = (float *)arg->conv_args[i].sb_address;
+    arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
    arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
  }
 }

--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -79,7 +79,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
    std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
    for (int j = 0; j < ops.size(); ++j) {
      std::shared_ptr<framework::OpDesc> op = ops[j];
-      DLOG << "create op: " << op->Type();
+      DLOG << "create op: " << j << "  " << op->Type();
      auto op_base = framework::OpRegistry<Dtype>::CreateOp(
          op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
          program_.scope);
@@ -103,7 +103,9 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
  std::shared_ptr<framework::BlockDesc> to_predict_block =
      to_predict_program_->Block(0);
  auto &ops = ops_of_block_[*to_predict_block.get()];
+  int i = 0;
  for (const auto &op : ops) {
+    DLOG << "Init op: " << i++ << "  " << op->Type();
    op->Init();
  }
 }
@@ -695,6 +697,7 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
    clock_gettime(CLOCK_MONOTONIC, &ts);
    profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
 #endif
+    DLOG << "Running op: " << i << "  " << ops[i]->Type();
    ops[i]->Run();

 #ifdef PADDLE_MOBILE_PROFILE

--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -53,7 +53,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
    auto input_ptr = input->data<float>();
    fpga::format_image(input);
    Tensor *output = param_.Out();
-    auto output_ptr = output->data<half>();
+    auto output_ptr = output->data<float>();

    fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};


--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -26,7 +26,8 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
  auto input = const_cast<Tensor *>(param->InputX());
  auto input_ptr = input->data<float>();
-  auto float_input = new Tensor(*input);
+  auto float_input = new Tensor;
+  float_input->mutable_data<float>(input->dims());
  fpga::format_fp32_ofm(float_input);

  fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};

--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -341,7 +341,6 @@ class OpParam {
  }
 };

-#ifdef CONV_OP
 template <typename Dtype>
 class ConvParam : public OpParam {
  typedef typename DtypeTensorTrait<Dtype>::gtype GType;
@@ -386,7 +385,6 @@ class ConvParam : public OpParam {
 };
 template <typename Dtype>
 Print &operator<<(Print &printer, const ConvParam<Dtype> &conv_param);
-#endif

 template <typename Dtype>
 class ElementwiseAddParam : OpParam {

--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -30,6 +30,27 @@ elseif("FPGAnets" IN_LIST NET)
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)

+    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet50 paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-EW paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-conv paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-pooling paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-bypass paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-softmax paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-concat paddle-mobile)
+
    ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-tensor-quant paddle-mobile)

@@ -66,6 +87,9 @@ else ()
    ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-resnet paddle-mobile)

+    ADD_EXECUTABLE(test-resnet50 net/test_resnet50.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet50 paddle-mobile)
+
    # gen test
    ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h  test_include.h executor_for_test.h)
    target_link_libraries(test-squeezenet paddle-mobile)
@@ -235,13 +259,4 @@ else ()

    #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)

-
-
-
-
 endif()
-
-# if(FPGA)
-#     ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
-#     target_link_libraries(test-tensor-quant paddle-mobile)
-# endif()
--- a/test/fpga/test_resnet50.cpp
+++ b/test/fpga/test_resnet50.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+static const char *g_resnet_combine = "../models/resnet50";
+
+// Smoke test for the FPGA backend: loads the model found under
+// g_resnet_combine, feeds one 1x3x224x224 tensor filled by SetupTensor
+// (bounds 0 and 1) and runs prediction.
+//
+// Returns 0 when the model loads and prediction completes, 1 when loading
+// the model fails.
+int main() {
+  DLOG << paddle_mobile::fpga::open_device();
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+  if (!paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
+                          std::string(g_resnet_combine) + "/params", true)) {
+    // A failed load must surface as a non-zero exit status; falling off the
+    // end of main would implicitly return 0 and report success.
+    DLOG << "Failed to load model from " << g_resnet_combine;
+    return 1;
+  }
+
+  Tensor input_tensor;
+  SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
+                     static_cast<float>(1));
+
+  paddle_mobile.FeedData(input_tensor);
+  paddle_mobile.Predict_To(-1);
+  //    paddle_mobile.Predict_From(73);
+  //    paddle_mobile.Predict_From_To(72, 73);
+
+  DLOG << "Computation done";
+  return 0;
+}