Commit 85d6c449 authored by qnqinan

update

Parent b19dee42
@@ -30,9 +30,9 @@ void format_image(framework::Tensor *image_tensor) {
   auto data_ptr = image_tensor->data<float>();
   auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
   float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
+  float *old_p = p_data;
   image::format_image(&p_data, channel, height, width);
-  if (p_data != data_ptr) {
+  if (old_p != p_data) {
     image_tensor->reset_data_ptr(p_data);
   }
 }
@@ -48,9 +48,9 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
   if (dims.size() == 4) {
-    auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
-    memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
-                  sizeof(half);
+    auto channel = dims[1], height = dims[2], width = dims[3];
+    memory_size =
+        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
   } else if (dims.size() == 2) {
     memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
   } else {
@@ -162,7 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   fpga_copy(new_data, data_ptr, memory_size);
   filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
   filter_tensor->reset_data_ptr(new_data);
-  filter_tensor->set_type(typeid(int8_t));
+  filter_tensor->set_type(typeid(int16_t));
 }
 void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
@@ -396,8 +396,8 @@ void expand_conv_arg(ConvArgs *arg) {
   // auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
   auto cmd = 0UL | USE_BIAS;
-  auto deconv_param = ((args.deconv_tx_param.deconv_en) << 24) |
-                      ((args.deconv_tx_param.sub_conv_num) << 16) |
+  auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) |
+                      ((args.deconv_tx_param.sub_conv_num) << 8) |
                       ((args.deconv_tx_param.omit_size) << 0);
   (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
   (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
@@ -623,7 +623,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
   fpga::format_fp16_ofm(out, dims_out_new);
   auto out_ptr = out->data<half>();
   arg->output.address =
-      out_ptr +
+      (half *)out_ptr +  // NOLINT
       omit_size * sizeof(half) *
           (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
   arg->output.scale_address = out->scale;
@@ -713,6 +713,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
       }
       for (int j = 0; j < split_num; ++j) {
+        // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled;
         arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type =
             activation_enable;
         arg->split_conv_args[i]
@@ -758,9 +759,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
           align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num,
                      FILTER_NUM_ALIGNMENT) *
           sizeof(int8_t);
-      auto filter_head =
-          &filter_ptr[j * element_num * filter_num_per_div +  // NOLINT
+      auto filter_head = &((
+          int8_t *)filter_ptr)[j * element_num * filter_num_per_div +  // NOLINT
                       i * filter_sub_conv_offset];
       arg->split_conv_args[i]->conv_arg[j].filter_address =
           fpga_malloc(filter_size);
       arg->split_conv_args[i]->vector_conv_space.push_back(
@@ -774,6 +775,19 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
       fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address,
                  filter_size);
+      /*{
+        static int cnt = 0;
+        std::string str = "deconv_filter";
+        if(cnt <= 1){
+          cnt++;
+          str += std::to_string(cnt);
+          int8_t result = 0;
+          fpga::savefile<int8_t>(str,
+      arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size, result);
+        }
+      }*/
       size_t bs_align_num = align_to_x(
           arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT);
       size_t bs_size = 2 * bs_align_num * sizeof(float);
@@ -789,6 +803,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
       memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size);
       fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size);
+      /* {
+        static int cnt = 0;
+        std::string str = "deconv_sb";
+        if(cnt <= 1){
+          cnt++;
+          str += std::to_string(cnt);
+          float result = 0;
+          fpga::savefile<float>(str,
+      arg->split_conv_args[i]->conv_arg[j].sb_address, 2 * bs_align_num,
+      result);
+        }
+      }*/
       if (split_num == 1) {
         arg->split_conv_args[i]->conv_arg[j].output.address =
             arg->split_conv_args[i]->output.address;
@@ -835,13 +863,10 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
                      int16_t leaky_relu_negative_slope, int stride_h,
                      int stride_w, int padding_h, int padding_w,
                      float *bias_ptr) {
-  auto deleter = [](void *p) { fpga_free(p); };
-  arg->vector_dwconv_space.push_back(
-      std::shared_ptr<char>(reinterpret_cast<char *>(bias_ptr), deleter));
-  auto filter_ptr = filter->data<uint8_t>();
+  auto filter_ptr = filter->data<int16_t>();
   auto input_ptr = input->data<half>();
-  auto output_ptr = out->mutable_data<half>();
+  auto output_ptr = out->data<half>();
   arg->sub_conv_num = 1;
   // arg->relu_enabled = relu_enabled;
   arg->output.activation.activation_type = activation_enable;
@@ -960,10 +985,10 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
                sizeof(int16_t));
     arg->dw_conv_args[i]->output.scale_address =
         static_cast<float *>(fpga_malloc(2 * sizeof(float)));
-    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(  // NOLINT
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
         reinterpret_cast<char *>(arg->dw_conv_args[i]->output.address),
         deleter));
-    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(  // NOLINT
+    arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
         reinterpret_cast<char *>(arg->dw_conv_args[i]->output.scale_address),
         deleter));
   }
...
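The deconv_param word written by expand_conv_arg above packs three fields into one register value: deconv_en at bit 16, sub_conv_num at bit 8, and omit_size at bit 0. A minimal standalone sketch of that packing; the struct name and field widths here are assumptions for illustration, not the real paddle-mobile definitions.

#include <cstdint>
#include <iostream>

// Hypothetical stand-in for args.deconv_tx_param; the real struct lives in
// the paddle-mobile FPGA driver headers.
struct DeconvTxParam {
  uint32_t deconv_en;     // assumed 1-bit flag, placed at bit 16
  uint32_t sub_conv_num;  // assumed to fit in bits 8..15
  uint32_t omit_size;     // assumed to fit in bits 0..7
};

// Packs the fields with the same shifts used in expand_conv_arg above.
uint32_t pack_deconv_param(const DeconvTxParam &p) {
  return (p.deconv_en << 16) | (p.sub_conv_num << 8) | (p.omit_size << 0);
}

int main() {
  DeconvTxParam p{1, 2, 3};
  std::cout << std::hex << pack_deconv_param(p) << std::endl;  // prints 10203
  return 0;
}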
@@ -43,7 +43,7 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
   for (int i = 0; i < channel * sub_conv_n; i++) {
     bs_ptr[i + sub_conv_n * channel] = 1;
-    // bs_ptr[i] = bias_ptr[i % (channel)];
+    bs_ptr[i] = 0;  // bias_ptr[i % (channel)];
   }
   PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
...
/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
Licensed under the Apache License, Version 2.0 (the "License");
you may not use this file except in compliance with the License.
You may obtain a copy of the License at
http://www.apache.org/licenses/LICENSE-2.0
Unless required by applicable law or agreed to in writing, software
distributed under the License is distributed on an "AS IS" BASIS,
WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and
limitations under the License. */
#include "operators/kernel/pad2d_kernel.h"
namespace paddle_mobile {
namespace operators {
template <>
bool Pad2dKernel<FPGA, float>::Init(Pad2dParam<FPGA> *param) {
Tensor *output = param->Out();
fpga::format_fp16_ofm(output);
return true;
}
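// Copies each input row into the output buffer, whose rows are laid out with
// an aligned stride of align_to_x(output_c * output_w, IMAGE_ALIGNMENT).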
void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) {
auto input_data = (input->data<half>());
auto output_data = (output->data<half>());
auto input_c = input->dims()[1];
auto input_h = input->dims()[2];
auto input_w = input->dims()[3];
auto output_c = output->dims()[1];
auto output_w = output->dims()[3];
auto copysize = input_c * input_w;
for (int h = 0; h < input_h; ++h) {
auto input_offset = h * input_c * input_w;
auto output_offset = h * paddle_mobile::fpga::align_to_x(
output_c * output_w, IMAGE_ALIGNMENT);
memcpy((output_data + output_offset), (input_data + input_offset),
copysize * sizeof(half));
}
}
template <>
void Pad2dKernel<FPGA, float>::Compute(const Pad2dParam<FPGA> &param) {
auto in_x = param.InputX();
auto out = param.Out();
fpga::fpga_invalidate((void *)in_x->data<half>(), // NOLINT
in_x->numel() * sizeof(half));
pad2dFunc(in_x, out);
(out->scale)[0] = (in_x->scale)[0];
(out->scale)[1] = (in_x->scale)[1];
DLOG << (out->scale)[0];
DLOG << (out->scale)[1];
size_t outputSize =
out->dims()[2] *
paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]),
IMAGE_ALIGNMENT) *
sizeof(half);
fpga::fpga_flush(out->data<half>(), outputSize);
}
} // namespace operators
} // namespace paddle_mobile
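Both pad2dFunc and the flush size above rely on align_to_x rounding an element count up to the next multiple of IMAGE_ALIGNMENT. A minimal sketch of that rounding, assuming the usual round-up-to-multiple behaviour; the real helper is provided by the fpga API, and 16 is only a placeholder alignment value.

#include <cassert>
#include <cstddef>

// Assumed behaviour of fpga::align_to_x: round n up to a multiple of x.
inline std::size_t align_to_x(std::size_t n, std::size_t x) {
  return (n + x - 1) / x * x;
}

int main() {
  const std::size_t kImageAlignment = 16;  // placeholder for IMAGE_ALIGNMENT
  // A row of 3 channels * 100 columns occupies 300 halves, padded out to 304.
  assert(align_to_x(3 * 100, kImageAlignment) == 304);
  return 0;
}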