added fpga softmax kernel

75c3b905 · hanbuhe · 48ea7282 · 75c3b905 · 75c3b905 · 75c3b905
8 changed files
--- a/src/fpga/api/fpga_api.cpp
+++ b/src/fpga/api/fpga_api.cpp
@@ -36,7 +36,11 @@ static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";

 static inline int do_ioctl(int req, const void *arg) {
-  return ioctl(req, (unsigned int64_t)arg);
+#ifdef PADDLE_MOBILE_OS_LINUX
+  return ioctl(fd, req, arg);  // ioctl(2) takes the device descriptor first
+#else
+  return -1;
+#endif
 }

 int open_device() {
@@ -48,8 +52,12 @@ int open_device() {

 // memory management;
 void *fpga_malloc(size_t size) {
-  return reinterpret_cast<void *>(
-      mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+#ifdef PADDLE_MOBILE_OS_LINUX
+  void *p = mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0);
+  return p == MAP_FAILED ? NULL : p;  // mmap64 fails with MAP_FAILED, not NULL
+#else
+  return NULL;
+#endif
 }

 void fpga_free(void *ptr) { munmap(ptr, 0); }

--- a/src/fpga/fpga_quantilization.cpp
+++ b/src/fpga/fpga_quantilization.cpp
@@ -19,15 +19,13 @@ namespace paddle_mobile {
 namespace fpga {

 template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
-                       int height, int width) {
-  int offset_height = 0;
-
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
+                       int64_t channel, int64_t height, int64_t width) {
  for (int n = 0; n < num; n++) {
-    int amount_per_row = width * channel;
+    int64_t amount_per_row = width * channel;
    for (int c = 0; c < channel; c++) {
      for (int h = 0; h < height; h++) {
-        int offset_height = h * amount_per_row;
+        int64_t offset_height = h * amount_per_row;
        for (int w = 0; w < width; w++) {
          *(data_out + offset_height + w * channel + c) = *(data_in++);
        }
@@ -38,10 +36,12 @@ static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
 }

 template <typename Dtype>
-static Dtype find_max(Dtype* data, int num) {
+static Dtype find_max(Dtype* data, int64_t num) {  // returns max |data[i]|
  Dtype max = 0;
-  for (int i = 0; i < num; ++i) {
+  for (int64_t i = 0; i < num; ++i) {  // index must be as wide as num
-    max = std::max(max, data[i]);
+    Dtype value = data[i];
+    Dtype abs = value > 0 ? value : -value;
+    max = std::max(max, abs);
  }
  return max;
 }
@@ -51,40 +51,36 @@ void quantify_filter(framework::Tensor* filter) {
  DLOG << "quantilize_filter........";

  float scale = 0;
-  float fix_range = static_cast<float>((1 << (8 - 1)) - 1);
+  auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);

-  const int batch_size = filter->dims()[0];
-  const int channel = filter->dims()[1];
-  const int height = filter->dims()[2];
-  const int width = filter->dims()[3];
+  const auto batch_size = filter->dims()[0];
+  const auto channel = filter->dims()[1];
+  const auto height = filter->dims()[2];
+  const auto width = filter->dims()[3];

-  int8_t* int_data = nullptr;
-  int8_t* tmp_data = new int8_t[filter->numel()];
+  auto* tmp_data = new int8_t[filter->numel()];

  // 32bit filter -> 8bit filter;
  if (filter->type() == typeid(float)) {
-    float* float_data = filter->data<float>();
-    float max = find_max<float>(float_data, filter->numel());
+    auto* float_data = filter->data<float>();
+    auto max = find_max<float>(float_data, filter->numel());

-    scale = (max / fix_range);
+    scale = (fix_range / max);
+    DLOG << "scale:" << scale;

    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = (int8_t)float_data[i] * scale;
+      tmp_data[i] = (int8_t)(float_data[i] * scale);
    }
-    int_data = filter->mutable_data<int8_t>();
  } else {
-    int8_t max = find_max<int8_t>(filter->data<int8_t>(), filter->numel());
-    scale = (max / fix_range);
-
-    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = filter->data<int8_t>()[i];
-    }
-    int_data = filter->mutable_data<int8_t>();
+    auto max = find_max<int8_t>(filter->data<int8_t>(), filter->numel());
+    scale = (fix_range / max);
+    std::memcpy(tmp_data, filter->data<int8_t>(), (size_t)filter->numel());
  }
  // NCHW -> NHWC;
-  chw_to_hwc<int8_t>(tmp_data, int_data, batch_size, channel, height, width);
+  chw_to_hwc<int8_t>(tmp_data, filter->mutable_data<int8_t>(), batch_size,
+                     channel, height, width);
-  delete tmp_data;
+  delete[] tmp_data;  // tmp_data comes from new[]; scalar delete would be UB
-  *(filter->fpga_args().scale_pointer()) = scale;
+  filter->SetFpgaScale(scale);
 }

 }  // namespace fpga

--- a/src/fpga/fpga_quantilization.h
+++ b/src/fpga/fpga_quantilization.h
@@ -21,10 +21,9 @@ namespace paddle_mobile {
 namespace fpga {

 template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
-                       int height, int width);
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
+                       int64_t channel, int64_t height, int64_t width);

-// template <typename Dtype>
 void quantify_filter(framework::Tensor* filter);

 }  // namespace fpga

--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -64,7 +64,8 @@ struct SizeOfTypeFunctor<HEAD, TAIL...> {
 };

 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor<int, half, float, double, int16_t, int64_t, bool, size_t>
+  SizeOfTypeFunctor<int8_t, int, half, float, double, int16_t, int64_t, bool,
+                    size_t>
      functor;
  size_t size = functor(type);

@@ -255,14 +256,26 @@ class Tensor {

 #ifdef PADDLE_MOBILE_FPGA
  struct FPGAArgs {
-    float scale;
+    friend class Tensor;

-    inline float *scale_pointer() { return &scale; }
+    inline float *scale_pointer() { return scale_; }
+    inline float scale() { return *scale_; }
+
+   private:
+    float *scale_;
  };

  struct FPGAArgs fpga_args() const {
-    return fpgaArgs_;
+    FPGAArgs args;
+    args.scale_ = scale.get();
+    return args;
  }
+
+  void SetFpgaScale(float s) { *(scale.get()) = s; }
+
+ private:
+  std::shared_ptr<float> scale = std::make_shared<float>(0);
+
 #endif

 private:
@@ -331,10 +344,6 @@ class Tensor {
   * begins.
   */
  size_t offset_;
-
-#ifdef PADDLE_MOBILE_FPGA
-  FPGAArgs fpgaArgs_;
-#endif
 };

 #ifdef PADDLE_MOBILE_DEBUG

--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SOFTMAX_OP
+
+#include "../softmax_kernel.h"
+#include "../central-arm-func/softmax_arm_func.h"
+#include "common/types.h"
+#include "fpga/api/fpga_api.h"
+#include "operators/math/softmax.h"
+namespace paddle_mobile {
+namespace operators {
+
+// FPGA specialization of SoftmaxKernel::Init.
+// When the input tensor's element type is half, fills a fpga::BypassArgs
+// (FP16 -> FP32 conversion, HWC -> CHW layout, input address and dims 1..3)
+// and stores it on the param through SetFpgaArgs. For any other element type
+// nothing is configured. Always returns true.
+template <>
+bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam *param) {
+  const Tensor *input = param->InputX();
+  if (input->type() == typeid(half)) {
+    auto input_ptr = input->data<half>();
+    auto output_ptr = param->Out();
+    fpga::BypassArgs args;
+    args.convert_type = fpga::DATA_FP16_TO_FP32;
+    args.layout_type = fpga::LAYOUT_HWC_TO_CHW;
+    args.image.address = reinterpret_cast<void *>(input_ptr);
+    // dims()[1..3] are read as height, width, channels — presumably the input
+    // is laid out NHWC; TODO confirm against the producer of this tensor.
+    args.image.height = input->dims()[1];
+    args.image.width = input->dims()[2];
+    args.image.channels = input->dims()[3];
+    // NOTE(review): output_ptr is whatever param->Out() returns, which looks
+    // like the output Tensor object rather than its data buffer — confirm
+    // that this is the address the bypass engine should write to.
+    args.output.address = output_ptr;
+    param->SetFpgaArgs(args);
+  }
+
+  return true;
+}
+
+// FPGA specialization of SoftmaxKernel::Compute.
+// Currently a no-op: the only statement in the body is commented out, so
+// calling this leaves param untouched.
+template <>
+void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam &param) const {
+  // SoftmaxCompute<float>(param);
+}
+
+template class SoftmaxKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -580,6 +580,21 @@ class SoftmaxParam : public OpParam {
 private:
  Tensor *input_x_;
  Tensor *out_;
+
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  std::shared_ptr<Tensor> float_input_x_;
+  fpga::BypassArgs fpga_bypass_args;
+
+ public:
+  Tensor *FloatInput() {
+    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
+  }
+  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
+  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
+  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+#endif
 };
 #endif


--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -34,6 +34,7 @@ REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp);
 #endif

 #endif
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
@@ -55,6 +55,7 @@ USE_OP_CPU(softmax);
 USE_OP_MALI_GPU(softmax);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(softmax);
 #endif

 #endif