Unverified commit adf3e730 authored by Ray Liu, committed by GitHub

Merge branch 'develop' into develop

......@@ -30,9 +30,10 @@ void format_image(framework::Tensor *image_tensor) {
auto data_ptr = image_tensor->data<float>();
auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
float *old_p = p_data;
image::format_image(&p_data, channel, height, width);
if (old_p != p_data) {
if (p_data != data_ptr && external_ptr == nullptr) {
image_tensor->reset_data_ptr(p_data);
}
}
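
Taken together, the hunk above replaces the old old_p bookkeeping with an explicit ownership check: the tensor adopts the buffer that format_image may have reallocated only when that buffer differs from its own storage and the input was not externally owned. A consolidated sketch of the resulting logic (the wrapper name is illustrative; channel/height/width come from the elided context above):

void format_image_tensor(framework::Tensor *image_tensor, int channel,
                         int height, int width) {
  auto data_ptr = image_tensor->data<float>();
  auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
  float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
  // format_image may replace *p_data with a freshly aligned allocation.
  image::format_image(&p_data, channel, height, width);
  // Rebind only the tensor's own storage; never adopt a caller-owned buffer.
  if (p_data != data_ptr && external_ptr == nullptr) {
    image_tensor->reset_data_ptr(p_data);
  }
}
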
......@@ -48,9 +49,9 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto channel = dims[1], height = dims[2], width = dims[3];
memory_size =
height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
sizeof(half);
} else if (dims.size() == 2) {
memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
} else {
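
The new formula folds the batch dimension num into the fp16 output footprint. A hedged sketch of the arithmetic, assuming align_to_x rounds its first argument up to the next multiple of the second (its conventional definition in this API):

#include <cstddef>
#include <cstdint>

// Assumed round-up helper mirroring align_to_x semantics.
inline size_t align_to_x(size_t n, size_t x) { return (n + x - 1) / x * x; }

// fp16 output [num, channel, height, width]: each row of channel * width
// half-precision values is padded out to IMAGE_ALIGNMENT elements.
size_t ofm_bytes(size_t num, size_t channel, size_t height, size_t width,
                 size_t image_alignment) {
  return num * height * align_to_x(channel * width, image_alignment) *
         sizeof(uint16_t);  // a half is two bytes
}
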
......@@ -713,7 +714,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
}
for (int j = 0; j < split_num; ++j) {
// arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
activation_enable;
arg->split_conv_args[i]
......@@ -775,19 +775,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address,
filter_size);
/*{
static int cnt = 0;
std::string str = "deconv_filter";
if(cnt <= 1){
cnt++;
str += std::to_string(cnt);
int8_t result = 0;
fpga::savefile<int8_t>(str,
arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size, result);
}
}*/
size_t bs_align_num = align_to_x(
arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
size_t bs_size = 2 * bs_align_num * sizeof(float);
......@@ -803,20 +790,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size);
fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size);
/* {
static int cnt = 0;
std::string str = "deconv_sb";
if(cnt <= 1){
cnt++;
str += std::to_string(cnt);
float result = 0;
fpga::savefile<float>(str,
arg->split_conv_args[i]->conv_arg[j].sb_address, 2 * bs_align_num,
result);
}
}*/
if (split_num == 1) {
arg->split_conv_args[i]->conv_arg[j].output.address =
arg->split_conv_args[i]->output.address;
......@@ -863,10 +836,13 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int stride_h,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto deleter = [](void *p) { fpga_free(p); };
arg->vector_dwconv_space.push_back(
std::shared_ptr<char>(reinterpret_cast<char *>(bias_ptr), deleter));
auto filter_ptr = filter->data<int16_t>();
auto input_ptr = input->data<half>();
auto output_ptr = out->data<half>();
auto output_ptr = out->mutable_data<half>();
arg->sub_conv_num = 1;
// arg->relu_enabled = relu_enabled;
arg->output.activation.activation_type = activation_enable;
......
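
The first added lines hand ownership of bias_ptr to the args via a shared_ptr with a custom deleter, so the buffer is eventually released with fpga_free rather than delete; the companion switch from out->data<half>() to out->mutable_data<half>() ensures the output buffer is allocated before it is written. A self-contained sketch of the deleter pattern (the fpga_malloc/fpga_free declarations stand in for the real API):

#include <cstddef>
#include <memory>
#include <vector>

void *fpga_malloc(size_t size);  // assumed FPGA allocator
void fpga_free(void *ptr);       // assumed matching release

struct Args {
  // Shared pointers keep every tracked buffer alive with the args.
  std::vector<std::shared_ptr<char>> vector_dwconv_space;
};

void track_buffer(Args *arg, void *ptr) {
  auto deleter = [](void *p) { fpga_free(p); };
  arg->vector_dwconv_space.push_back(
      std::shared_ptr<char>(reinterpret_cast<char *>(ptr), deleter));
}
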
......@@ -41,10 +41,12 @@ bool PaddleMobilePredictor<Device, T>::Init(const PaddleMobileConfig &config) {
config.memory_pack.combined_params_buf);
} else if (!config.model_dir.empty()) {
paddle_mobile_->Load(config.model_dir, config.optimize,
config.quantification, config.batch_size);
config.quantification, config.batch_size,
config.lod_mode);
} else if (!config.prog_file.empty() && !config.param_file.empty()) {
paddle_mobile_->Load(config.prog_file, config.param_file, config.optimize,
config.quantification, config.batch_size);
config.quantification, config.batch_size,
config.lod_mode);
} else {
LOG(kLOG_ERROR) << "fail to load inference model!";
return false;
......
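
Both Load() call sites now forward the new config.lod_mode flag. A hypothetical configuration showing the fields this branch reads (the values are illustrative):

PaddleMobileConfig config;
// Either model_dir alone, or prog_file + param_file for a combined model.
config.model_dir = "../models/marker/model";
config.optimize = true;
config.quantification = false;
config.batch_size = 1;
config.lod_mode = false;  // newly threaded through to paddle_mobile_->Load()
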
......@@ -49,6 +49,9 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
fpga::format_image(input);
auto input_ptr = input->data<float>();
auto external_ptr = reinterpret_cast<float *>(input->external_data);
float *p_data = external_ptr == nullptr ? input_ptr : external_ptr;
auto output_ptr = output->data<half>();
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32};
......@@ -57,7 +60,7 @@ void FeedKernel<FPGA, float>::Compute(const FeedParam<FPGA> &param) {
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = input_ptr;
args.image.address = p_data;
args.image.channels = (uint32_t)input->dims()[1];
args.image.height = (uint32_t)input->dims()[2];
args.image.width = (uint32_t)input->dims()[3];
......
......@@ -56,8 +56,9 @@ void FetchKernel<FPGA, float>::Compute(const FetchParam<FPGA> &param) {
return;
}
fpga::BypassArgs args = param.fpga_bypass_args;
auto data = (input->mutable_data<half>());
args.image.address = static_cast<void *>(data);
auto input_address = (input->data<half>());
args.image.address = static_cast<void *>(input_address);
fpga::PerformBypass(args);
fpga::fpga_invalidate(param.fpga_bypass_args.output.address,
param.fpga_bypass_args.image.channels * sizeof(float));
......
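
Two things happen in this hunk: the input switches from mutable_data<half>() to data<half>(), so fetching no longer risks reallocating a buffer that must already hold the device results, and the CPU cache over the bypass output is invalidated before the host reads it. The read-back pattern, restated as a sketch:

// After the bypass engine finishes, drop stale CPU cache lines over the
// output span before dereferencing it on the host.
fpga::PerformBypass(args);
fpga::fpga_invalidate(args.output.address,
                      args.image.channels * sizeof(float));
// args.output.address now safely yields the fp32 results.
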
......@@ -77,6 +77,10 @@ if (CON GREATER -1)
ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-rfcn paddle-mobile)
ADD_EXECUTABLE(test-marker fpga/test_marker.cpp test_helper.h test_include.h executor_for_test.h)
target_link_libraries(test-marker paddle-mobile)
set(FOUND_MATCH ON)
endif ()
......
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include <fstream>
#include <iostream>
#include <string>
#include "../test_helper.h"
#include "../test_include.h"
#ifdef PADDLE_MOBILE_FPGA_V1
#include "fpga/V1/api.h"
#endif
#ifdef PADDLE_MOBILE_FPGA_V2
#include "fpga/V2/api.h"
#endif
void readStream(std::string filename, char *buf) {
std::ifstream in;
in.open(filename, std::ios::in | std::ios::binary);
if (!in.is_open()) {
std::cout << "open File Failed." << std::endl;
return;
}
in.seekg(0, std::ios::end); // go to the end
auto length = in.tellg(); // report location (this is the length)
in.seekg(0, std::ios::beg); // go back to the beginning
in.read(buf, length);
DLOG << length;
in.close();
}
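
readStream trusts the caller's buffer to be at least as large as the file. A bounded variant under that caveat (the capacity parameter is an addition for illustration, not part of the original API):

#include <algorithm>
#include <fstream>
#include <string>

// Bounded sketch of readStream(): never reads past `capacity` bytes.
void readStreamBounded(const std::string &filename, char *buf,
                       std::streamsize capacity) {
  std::ifstream in(filename, std::ios::in | std::ios::binary);
  if (!in.is_open()) return;
  in.seekg(0, std::ios::end);  // end position == file length
  std::streamsize length = in.tellg();
  in.seekg(0, std::ios::beg);
  in.read(buf, std::min(length, capacity));
}
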
void convert_to_chw(int16_t **data_in, int channel, int height, int width,
                    int num, int16_t *data_tmp) {
  // Repack NHWC input into NCHW; amount_per_side is one H*W channel plane.
  int64_t amount_per_side = width * height;
for (int n = 0; n < num; n++) {
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
for (int c = 0; c < channel; c++) {
*(data_tmp + n * amount_per_side * channel + c * amount_per_side +
width * h + w) = *((*data_in)++);
}
}
}
}
}
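
convert_to_chw walks the source in NHWC order while writing NCHW, advancing the caller's pointer past the consumed data (hence the int16_t ** parameter). A small usage sketch:

// Repack a 1x2x2x3 (N, H, W, C) buffer into NCHW order.
int16_t src[1 * 2 * 2 * 3] = {0};  // NHWC data would go here
int16_t dst[1 * 3 * 2 * 2];
int16_t *cursor = src;  // convert_to_chw leaves this past the last element
convert_to_chw(&cursor, /*channel=*/3, /*height=*/2, /*width=*/2,
               /*num=*/1, dst);
// dst[c * 4 + h * 2 + w] now holds the src value at (h, w, c).
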
void dump_stride_half(std::string filename, Tensor input_tensor,
const int dumpnum, bool use_chw) {
if (input_tensor.dims().size() != 4) return;
int c = (input_tensor.dims())[1];
int h = (input_tensor.dims())[2];
int w = (input_tensor.dims())[3];
int n = (input_tensor.dims())[0];
auto data_ptr = input_tensor.get_data();
auto *data_ptr_16 = reinterpret_cast<half *>(data_ptr);
auto data_tmp = data_ptr_16;
if (use_chw) {
data_tmp =
reinterpret_cast<half *>(malloc(n * c * h * w * sizeof(int16_t)));
convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp);
}
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < input_tensor.numel(); i += stride) {
result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]);
out << result << std::endl;
}
out.close();
if (data_tmp != data_ptr_16) {
free(data_tmp);
}
}
void dump_stride_float(std::string filename, Tensor input_tensor,
const int dumpnum) {
auto data_ptr = reinterpret_cast<float *>(input_tensor.get_data());
std::ofstream out(filename.c_str());
float result = 0;
int stride = input_tensor.numel() / dumpnum;
stride = stride > 0 ? stride : 1;
for (int i = 0; i < input_tensor.numel(); i += stride) {
result = data_ptr[i];
out << result << std::endl;
}
out.close();
}
void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum,
bool use_chw) {
static int i = 0;
if (input_tensor.numel() == 0) {
return;
}
if (input_tensor.type() == typeid(float)) {
DLOG << "op: " << i++ << ", float data " << input_tensor.numel();
dump_stride_float(filename, input_tensor, dumpnum);
} else {
DLOG << "op: " << i++ << ", half data " << input_tensor.numel();
dump_stride_half(filename, input_tensor, dumpnum, use_chw);
}
DLOG << "dump input address: " << input_tensor.get_data();
}
static const char *g_marker_combine = "../models/marker/model";
static const char *g_image_src_float = "../models/marker/model/input_0.bin";
int main() {
paddle_mobile::fpga::open_device();
paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
// if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model",
// std::string(g_rfcn_combine) + "/params", true, false,
// 1, true)) {
if (paddle_mobile.Load(std::string(g_marker_combine), true)) {
float img_info[3] = {720, 1280, 800.0f / 960.0f};  // {height, width, scale}; not fed in this test
auto img = reinterpret_cast<float *>(
fpga::fpga_malloc(720 * 1280 * 3 * sizeof(float)));
readStream(g_image_src_float, reinterpret_cast<char *>(img));
std::vector<void *> v(3, nullptr);  // reserved for the GetResults() call below (currently commented out)
paddle_mobile.FeedData({img});
paddle_mobile.Predict_To(-1);
for (int i = 47; i < 52; i++) {
auto tensor_ptr = paddle_mobile.FetchResult(i);
std::string saveName = "marker_" + std::to_string(i);
paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(),
                                     tensor_ptr->numel() * sizeof(float));
dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), true);
/* Debug dump of the softmax output, kept for reference:
float result = 0;
std::string str = "softmax_output_data";
float *data = static_cast<float *>(
    fpga::fpga_malloc(tensor_ptr->numel() * sizeof(float)));
auto output_ptr = static_cast<half *>((*tensor_ptr).get_data());
for (int idx = 0; idx < tensor_ptr->numel(); ++idx) {
  data[idx] = fpga::fp16_2_fp32(output_ptr[idx]);
}
fpga::savefile<float>(str, data, tensor_ptr->numel(), result); */
}
// paddle_mobile.GetResults(&v);
DLOG << "Computation done";
fpga::fpga_free(img);
}
return 0;
}