Merge branch 'develop' into develop

550198aa · zhangyang0701 · GitHub · af5f3e05 · 47fc35ac · 550198aa
13 changed files
--- a/src/fpga/api/fpga_api.cpp
+++ b/src/fpga/api/fpga_api.cpp
@@ -36,7 +36,11 @@ static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
 static inline int do_ioctl(int req, const void *arg) {
+#ifdef PADDLE_MOBILE_OS_LINUX
  return ioctl(req, (unsigned int64_t)arg);
+#else
+  return -1;
+#endif
 }
 int open_device() {
@@ -48,8 +52,12 @@ int open_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
+#ifdef PADDLE_MOBILE_OS_LINUX
  return reinterpret_cast<void *>(
      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+#else
+  return NULL;
+#endif
 }
 void fpga_free(void *ptr) { munmap(ptr, 0); }

--- a/src/fpga/fpga_quantilization.cpp
+++ b/src/fpga/fpga_quantilization.cpp
@@ -19,15 +19,13 @@ namespace paddle_mobile {
 namespace fpga {
 template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
-                       int height, int width) {
+                       int64_t channel, int64_t height, int64_t width) {
-  int offset_height = 0;
  for (int n = 0; n < num; n++) {
-    int amount_per_row = width * channel;
+    int64_t amount_per_row = width * channel;
    for (int c = 0; c < channel; c++) {
      for (int h = 0; h < height; h++) {
-        int offset_height = h * amount_per_row;
+        int64_t offset_height = h * amount_per_row;
        for (int w = 0; w < width; w++) {
          *(data_out + offset_height + w * channel + c) = *(data_in++);
        }
@@ -38,10 +36,12 @@ static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
 }
 template <typename Dtype>
-static Dtype find_max(Dtype* data, int num) {
+static Dtype find_max(Dtype* data, int64_t num) {
  Dtype max = 0;
  for (int i = 0; i < num; ++i) {
-    max = std::max(max, data[i]);
+    Dtype value = data[i];
+    Dtype abs = value > 0 ? value : -value;
+    max = std::max(max, abs);
  }
  return max;
 }
@@ -51,40 +51,36 @@ void quantify_filter(framework::Tensor* filter) {
  DLOG << "quantilize_filter........";
  float scale = 0;
-  float fix_range = static_cast<float>((1 << (8 - 1)) - 1);
+  auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);
-  const int batch_size = filter->dims()[0];
+  const auto batch_size = filter->dims()[0];
-  const int channel = filter->dims()[1];
+  const auto channel = filter->dims()[1];
-  const int height = filter->dims()[2];
+  const auto height = filter->dims()[2];
-  const int width = filter->dims()[3];
+  const auto width = filter->dims()[3];
-  int8_t* int_data = nullptr;
+  auto* tmp_data = new int8_t[filter->numel()];
-  int8_t* tmp_data = new int8_t[filter->numel()];
  // 32bit filter -> 8bit filter;
  if (filter->type() == typeid(float)) {
-    float* float_data = filter->data<float>();
+    auto* float_data = filter->data<float>();
-    float max = find_max<float>(float_data, filter->numel());
+    auto max = find_max<float>(float_data, filter->numel());
-    scale = (max / fix_range);
+    scale = (fix_range / max);
+    DLOG << "scale:" << scale;
    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = (int8_t)float_data[i] * scale;
+      tmp_data[i] = (int8_t)(float_data[i] * scale);
    }
-    int_data = filter->mutable_data<int8_t>();
  } else {
-    int8_t max = find_max<int8_t>(filter->data<int8_t>(), filter->numel());
+    auto max = find_max<int8_t>(filter->data<int8_t>(), filter->numel());
-    scale = (max / fix_range);
+    scale = (fix_range / max);
+    std::memcpy(tmp_data, filter->data<int8_t>(), (size_t)filter->numel());
-    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = filter->data<int8_t>()[i];
-    }
-    int_data = filter->mutable_data<int8_t>();
  }
  // NCHW -> NHWC;
-  chw_to_hwc<int8_t>(tmp_data, int_data, batch_size, channel, height, width);
+  chw_to_hwc<int8_t>(tmp_data, filter->mutable_data<int8_t>(), batch_size,
+                     channel, height, width);
  delete tmp_data;
-  *(filter->fpga_args().scale_pointer()) = scale;
+  filter->SetFpgaScale(scale);
 }
 }  // namespace fpga

--- a/src/fpga/fpga_quantilization.h
+++ b/src/fpga/fpga_quantilization.h
@@ -21,10 +21,9 @@ namespace paddle_mobile {
 namespace fpga {
 template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
-                       int height, int width);
+                       int64_t channel, int64_t height, int64_t width);
-// template <typename Dtype>
 void quantify_filter(framework::Tensor* filter);
 }  // namespace fpga

--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -64,7 +64,8 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };
 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+  SizeOfTypeFunctor<int8_t, int, half, float, double, int16_t, int64_t, bool,
+                    size_t>
      functor;
  size_t size = functor(type);
@@ -255,14 +256,26 @@ class Tensor {
 #ifdef PADDLE_MOBILE_FPGA
  struct FPGAArgs {
-    float scale;
+    friend class Tensor;
-    inline float *scale_pointer() { return &scale; }
+    inline float *scale_pointer() { return scale_; }
+    inline float scale() { return *scale_; }
+   private:
+    float *scale_;
  };
  struct FPGAArgs fpga_args() const {
-    return fpgaArgs_;
+    FPGAArgs args;
+    args.scale_ = scale.get();
+    return args;
  }
+  void SetFpgaScale(float s) { *(scale.get()) = s; }
+ private:
+  std::shared_ptr<float> scale = std::make_shared<float>(0);
 #endif
 private:
@@ -331,10 +344,6 @@ class Tensor {
   * begins.
   */
  size_t offset_;
-#ifdef PADDLE_MOBILE_FPGA
-  FPGAArgs fpgaArgs_;
-#endif
 };
 #ifdef PADDLE_MOBILE_DEBUG

--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -61,7 +61,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
  param->SetNewBias(new_bias);
  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
  fpga::ConvArgs convArgs;
  convArgs.relu_enabled = relu_enabled;

--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -57,7 +57,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(FusionConvAddBNReluParam *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);
  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
  fpga::ConvArgs convArgs;
  convArgs.relu_enabled = relu_enabled;

--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -41,7 +41,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam *param) {
  }
  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
  fpga::ConvArgs convArgs;
  convArgs.relu_enabled = relu_enabled;

--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
@@ -56,7 +56,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);
  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
  fpga::ConvArgs convArgs;
  convArgs.relu_enabled = relu_enabled;

--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
@@ -53,7 +53,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam *param) {
  param->SetNewScale(new_scale);
  param->SetNewBias(new_bias);
  fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
  fpga::ConvArgs convArgs;
  convArgs.relu_enabled = relu_enabled;

--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+    http://www.apache.org/licenses/LICENSE-2.0
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+#ifdef SOFTMAX_OP
+#include "../softmax_kernel.h"
+#include "../central-arm-func/softmax_arm_func.h"
+#include "common/types.h"
+#include "fpga/api/fpga_api.h"
+#include "operators/math/softmax.h"
+namespace paddle_mobile {
+namespace operators {
+template <>
+bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam *param) {
+  const Tensor *input = param->InputX();
+  if (input->type() == typeid(half)) {
+    auto input_ptr = input->data<half>();
+    auto output_ptr = param->Out();
+    fpga::BypassArgs args;
+    args.convert_type = fpga::DATA_FP16_TO_FP32;
+    args.layout_type = fpga::LAYOUT_HWC_TO_CHW;
+    args.image.address = (void *)(input_ptr);
+    args.image.height = input->dims()[1];
+    args.image.width = input->dims()[2];
+    args.image.channels = input->dims()[3];
+    args.output.address = output_ptr;
+    param->SetFpgaArgs(args);
+  }
+  return true;
+}
+template <>
+void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam &param) const {
+  // SoftmaxCompute<float>(param);
+}
+template class SoftmaxKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+#endif
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -580,6 +580,21 @@ class SoftmaxParam : public OpParam {
 private:
  Tensor *input_x_;
  Tensor *out_;
+#ifdef PADDLE_MOBILE_FPGA
+ private:
+  std::shared_ptr<Tensor> float_input_x_;
+  fpga::BypassArgs fpga_bypass_args;
+ public:
+  Tensor *FloatInput() {
+    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
+  }
+  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
+  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
+  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+#endif
 };
 #endif

--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -34,6 +34,7 @@ REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp);
 #endif
 #endif
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
@@ -55,6 +55,7 @@ USE_OP_CPU(softmax);
 USE_OP_MALI_GPU(softmax);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(softmax);
 #endif
 #endif