From 46559e6833cd1e08f52a763dea99815e56fe5fd5 Mon Sep 17 00:00:00 2001
From: zhangyang <zhangyang49@baidu.com>
Date: Mon, 24 Sep 2018 10:59:47 +0800
Subject: [PATCH] Fix op kernel bugs for FPGA tracks

---
 src/fpga/api.cpp                             | 67 +++++++++++---------
 src/io/executor.cpp                          |  5 +-
 src/operators/feed_op.h                      |  2 +-
 src/operators/kernel/fpga/softmax_kernel.cpp |  3 +-
 src/operators/op_param.h                     |  2 -
 test/CMakeLists.txt                          | 33 +++++++---
 test/fpga/test_resnet50.cpp                  | 39 ++++++++++++
 7 files changed, 106 insertions(+), 45 deletions(-)
 create mode 100644 test/fpga/test_resnet50.cpp
diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp
index 9d33c742e3..10787b9155 100644
--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -29,9 +29,7 @@ namespace fpga {
 
 static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
-#ifdef PADDLE_MOBILE_OS_LINUX
 static std::map<void *, size_t> memory_map;
-#endif
 
 static inline int do_ioctl(int req, const void *arg) {
 #ifdef PADDLE_MOBILE_OS_LINUX
@@ -53,32 +51,38 @@ int open_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
   static uint64_t counter = 0;
-  counter += size;
-  DLOG << size << " bytes allocated. Total " << counter << " bytes";
+  // Allocate `size` bytes (mmap64 if PADDLE_MOBILE_OS_LINUX, else malloc).
 #ifdef PADDLE_MOBILE_OS_LINUX
   auto ptr = mmap64(nullptr, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
-  memory_map.insert(std::make_pair(ptr, size));
-  return ptr;
 #else
-  return malloc(size);
+  auto ptr = malloc(size);
 #endif
+  counter += size;  // running total of bytes requested through this function
+  memory_map.insert(std::make_pair(ptr, size));  // lets fpga_free look up size
+  DLOG << "Address: " << ptr << ", " << size << " bytes allocated. Total "
+       << counter << " bytes";
+  return ptr;  // NOTE(review): ptr is not checked for MAP_FAILED/nullptr above
 }
 
 void fpga_free(void *ptr) {
-#ifdef PADDLE_MOBILE_OS_LINUX
   static uint64_t counter = 0;
   size_t size = 0;
+  // Release ptr only if memory_map has an entry for it; otherwise just log.
   auto iter = memory_map.find(ptr);  // std::map<void *, size_t>::iterator
   if (iter != memory_map.end()) {
     size = iter->second;
-    munmap(ptr, size);
     memory_map.erase(iter);
-  }
-  counter += size;
-  DLOG << size << " bytes freed. Total " << counter << " bytes";
+#ifdef PADDLE_MOBILE_OS_LINUX
+    munmap(ptr, size);  // size comes from the entry erased just above
 #else
-  free(ptr);
+    free(ptr);
 #endif
+    counter += size;  // running total of bytes released through this function
+    DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
+         << counter << " bytes";
+  } else {
+    DLOG << "Invalid pointer";  // ptr has no entry in memory_map
+  }
 }
 
 void fpga_copy(void *dest, const void *src, size_t num) {
@@ -211,7 +215,8 @@ int PerformBypass(const struct BypassArgs &args) {
 int ComputeFPGAConcat(const struct ConcatArgs &args) {
 #ifdef FPGA_TEST_MODE
   DLOG << "=============ComputeFpgaConcat===========";
-  DLOG << "   out_address:" << args.image_out
+  DLOG << "   Image_num: " << args.image_num
+       << "   out_address:" << args.image_out
        << "   out_scale_address:" << args.scale_out;
   DLOG << "   image_height:" << args.height << "   image_width:" << args.width;
   for (int i = 0; i < args.image_num; i++) {
@@ -235,7 +240,7 @@ void format_image(framework::Tensor *image_tensor) {
   auto channel = dims[1], height = dims[2], width = dims[3];
   auto data_ptr = image_tensor->data<float>();
   size_t memory_size = channel * height * width * sizeof(float);
-  float *new_data = (float *)fpga_malloc(memory_size);
+  auto new_data = (float *)fpga_malloc(memory_size);
   fpga_copy(new_data, data_ptr, memory_size);
   image::format_image(&new_data, channel, height, width);
   image_tensor->reset_data_ptr(new_data);
@@ -346,12 +351,12 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
   auto out_ptr = out->data<float>();
 
   arg->group_num = (uint32_t)group_num;
-  arg->split_num = (uint32_t)fpga::get_plit_num(filter);
+  // Either group_num or split_num = 1;
+  arg->split_num = group_num == 1 ? (uint32_t)get_plit_num(filter) : 1;
   arg->filter_num = (uint32_t)filter->dims()[0];
   arg->output.address = out_ptr;
   arg->output.scale_address = out->scale;
-  arg->conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(arg->split_num *
-                                                       sizeof(fpga::ConvArgs));
+  arg->conv_args = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs));
 
   arg->concat_arg.image_num = arg->split_num;
   arg->concat_arg.image_out = out_ptr;
@@ -360,15 +365,14 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
   arg->concat_arg.width = (uint32_t)filter->dims()[3];
 
   int n = arg->split_num;
-  arg->concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *));
-  arg->concat_arg.scales_in = (float **)fpga::fpga_malloc(n * sizeof(float *));
-  arg->concat_arg.channel_num =
-      (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
+  arg->concat_arg.images_in = (half **)fpga_malloc(n * sizeof(int *));
+  arg->concat_arg.scales_in = (float **)fpga_malloc(n * sizeof(float *));
+  arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t));
   arg->concat_arg.image_out = out_ptr;
 
   auto channel = (int)out->dims()[1];
-  int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num);
-  int element_num = fpga::get_aligned_filter_element_num(
+  int filter_num_per_div = get_filter_num_per_div(filter, group_num);
+  int element_num = get_aligned_filter_element_num(
       filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
 
   for (int i = 0; i < n; i++) {
@@ -390,16 +394,17 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
         &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
     arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
     arg->conv_args[i].filter_num =
-        (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
-                                    channel - (n - 1) * filter_num_per_div)
+        (uint32_t)(i == n - 1 ? channel - (n - 1) * filter_num_per_div
                               : filter_num_per_div);
 
     if (n > 1) {
       arg->conv_args[i].output.scale_address =
-          (float *)fpga::fpga_malloc(2 * sizeof(float));
-      arg->conv_args[i].output.address =
-          fpga::fpga_malloc(input->dims()[2] * input->dims()[3] *
-                            arg->conv_args[i].filter_num * sizeof(half));
+          (float *)fpga_malloc(2 * sizeof(float));
+      arg->conv_args[i].output.address = fpga_malloc(
+          input->dims()[2] *
+          align_to_x(input->dims()[3] * arg->conv_args[i].filter_num,
+                     IMAGE_ALIGNMENT) *
+          sizeof(half));
     }
 
     else {
@@ -408,7 +413,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
     }
 
     arg->concat_arg.images_in[i] = (half *)arg->conv_args[i].output.address;
-    arg->concat_arg.scales_in[i] = (float *)arg->conv_args[i].sb_address;
+    arg->concat_arg.scales_in[i] = arg->conv_args[i].output.scale_address;
     arg->concat_arg.channel_num[i] = arg->conv_args[i].filter_num;
   }
 }
diff --git a/src/io/executor.cpp b/src/io/executor.cpp
index c12f1ce02c..33a6ff3595 100644
--- a/src/io/executor.cpp
+++ b/src/io/executor.cpp
@@ -79,7 +79,7 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
     std::vector<std::shared_ptr<framework::OpDesc>> ops = block_desc->Ops();
     for (int j = 0; j < ops.size(); ++j) {
       std::shared_ptr<framework::OpDesc> op = ops[j];
-      DLOG << "create op: " << op->Type();
+      DLOG << "create op: " << j << "  " << op->Type();
       auto op_base = framework::OpRegistry<Dtype>::CreateOp(
           op->Type(), op->GetInputs(), op->GetOutputs(), op->GetAttrMap(),
           program_.scope);
@@ -103,7 +103,9 @@ Executor<Dtype, P>::Executor(const framework::Program<Dtype> p, int batch_size,
   std::shared_ptr<framework::BlockDesc> to_predict_block =
       to_predict_program_->Block(0);
   auto &ops = ops_of_block_[*to_predict_block.get()];
+  int i = 0;
   for (const auto &op : ops) {
+    DLOG << "Init op: " << i++ << "  " << op->Type();
     op->Init();
   }
 }
@@ -695,6 +697,7 @@ void Executor<Dtype, P>::Predict_From_To(int start, int end) {
     clock_gettime(CLOCK_MONOTONIC, &ts);
     profile[i].runBegin = (uint64_t)ts.tv_sec * 1e9 + ts.tv_nsec;
 #endif
+    DLOG << "Running op: " << i << "  " << ops[i]->Type();
     ops[i]->Run();
 
 #ifdef PADDLE_MOBILE_PROFILE
diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h
index 2cc7fda7f8..cccd4f52eb 100644
--- a/src/operators/feed_op.h
+++ b/src/operators/feed_op.h
@@ -53,7 +53,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
     auto input_ptr = input->data<float>();
     fpga::format_image(input);
     Tensor *output = param_.Out();
-    auto output_ptr = output->data<half>();
+    auto output_ptr = output->data<float>();
 
     fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
 
diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
index 79f1453fc8..7cfd0c7d76 100644
--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -26,7 +26,8 @@ template <>
 bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
   auto input = const_cast<Tensor *>(param->InputX());
   auto input_ptr = input->data<float>();
-  auto float_input = new Tensor(*input);
+  auto float_input = new Tensor;
+  float_input->mutable_data<float>(input->dims());
   fpga::format_fp32_ofm(float_input);
 
   fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index bb2355d80f..5b53743b75 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -341,7 +341,6 @@ class OpParam {
   }
 };
 
-#ifdef CONV_OP
 template <typename Dtype>
 class ConvParam : public OpParam {
   typedef typename DtypeTensorTrait<Dtype>::gtype GType;
@@ -386,7 +385,6 @@ class ConvParam : public OpParam {
 };
 template <typename Dtype>
 Print &operator<<(Print &printer, const ConvParam<Dtype> &conv_param);
-#endif
 
 template <typename Dtype>
 class ElementwiseAddParam : OpParam {
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ef03205ae5..a19df61fd1 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -30,6 +30,27 @@ elseif("FPGAnets" IN_LIST NET)
     ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h  test_include.h executor_for_test.h)
     target_link_libraries(test-resnet paddle-mobile)
 
+    ADD_EXECUTABLE(test-resnet50 fpga/test_resnet50.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet50 paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-EW fpga/test_fpga_EW.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-EW paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-conv fpga/test_fpga_conv.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-conv paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-pooling fpga/test_fpga_pooling.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-pooling paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-bypass fpga/test_fpga_bypass.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-bypass paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-softmax fpga/test_fpga_softmax.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-softmax paddle-mobile)
+
+    ADD_EXECUTABLE(test-fpga-concat fpga/test_fpga_concat.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-fpga-concat paddle-mobile)
+
     ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
     target_link_libraries(test-tensor-quant paddle-mobile)
 
@@ -66,6 +87,9 @@ else ()
     ADD_EXECUTABLE(test-resnet net/test_resnet.cpp test_helper.h  test_include.h executor_for_test.h)
     target_link_libraries(test-resnet paddle-mobile)
 
+    ADD_EXECUTABLE(test-resnet50 net/test_resnet50.cpp test_helper.h  test_include.h executor_for_test.h)
+    target_link_libraries(test-resnet50 paddle-mobile)
+
     # gen test
     ADD_EXECUTABLE(test-squeezenet net/test_squeezenet.cpp test_helper.h  test_include.h executor_for_test.h)
     target_link_libraries(test-squeezenet paddle-mobile)
@@ -235,13 +259,4 @@ else ()
 
     #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
 
-
-
-
-
 endif()
-
-# if(FPGA)
-#     ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h  test_include.h executor_for_test.h)
-#     target_link_libraries(test-tensor-quant paddle-mobile)
-# endif()
diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp
new file mode 100644
index 0000000000..cca6793f10
--- /dev/null
+++ b/test/fpga/test_resnet50.cpp
@@ -0,0 +1,39 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "../test_include.h"
+static const char *g_resnet_combine = "../models/resnet50";
+
+// Loads the ResNet-50 model in ../models/resnet50, feeds one 1x3x224x224 input
+// tensor and runs prediction on the FPGA target; exits non-zero if Load fails.
+int main() {
+  DLOG << paddle_mobile::fpga::open_device();
+  paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
+  if (!paddle_mobile.Load(std::string(g_resnet_combine) + "/model",
+                          std::string(g_resnet_combine) + "/params", true)) {
+    DLOG << "Failed to load model from " << g_resnet_combine;
+    return 1;  // make a missing/unreadable model fail the test
+  }
+
+  Tensor input_tensor;
+  SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
+                     static_cast<float>(1));
+  paddle_mobile.FeedData(input_tensor);
+  paddle_mobile.Predict_To(-1);
+  //    paddle_mobile.Predict_From(73);
+  //    paddle_mobile.Predict_From_To(72, 73);
+
+  DLOG << "Computation done";
+  return 0;
+}
-- 
GitLab