diff --git a/src/fpga/api/fpga_api.cpp b/src/fpga/api/fpga_api.cpp
index 779c846d1f3c465e5113f805b2b3856a1a7894c5..1a0fb3839e753d77aa13e24b900be893e7ab52c9 100644
--- a/src/fpga/api/fpga_api.cpp
+++ b/src/fpga/api/fpga_api.cpp
@@ -36,7 +36,11 @@ static int fd = -1;
 static const char *device_path = "/dev/fpgadrv0";
 
 static inline int do_ioctl(int req, const void *arg) {
+#ifdef PADDLE_MOBILE_OS_LINUX
   return ioctl(req, (unsigned int64_t)arg);
+#else
+  return -1;
+#endif
 }
 
 int open_device() {
@@ -48,8 +52,12 @@ int open_device() {
 
 // memory management;
 void *fpga_malloc(size_t size) {
+#ifdef PADDLE_MOBILE_OS_LINUX
   return reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
+#else
+  return NULL;
+#endif
 }
 
 void fpga_free(void *ptr) { munmap(ptr, 0); }
diff --git a/src/fpga/fpga_quantilization.cpp b/src/fpga/fpga_quantilization.cpp
index 8b351f1a81e0a92f0e2f12a3f61dd2a7d3948c85..e8faf792b9b3050ff2d5b82978154004c1d78bfa 100644
--- a/src/fpga/fpga_quantilization.cpp
+++ b/src/fpga/fpga_quantilization.cpp
@@ -19,15 +19,13 @@ namespace paddle_mobile {
 namespace fpga {
 
 template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
-                       int height, int width) {
-  int offset_height = 0;
-
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
+                       int64_t channel, int64_t height, int64_t width) {
   for (int n = 0; n < num; n++) {
-    int amount_per_row = width * channel;
+    int64_t amount_per_row = width * channel;
     for (int c = 0; c < channel; c++) {
       for (int h = 0; h < height; h++) {
-        int offset_height = h * amount_per_row;
+        int64_t offset_height = h * amount_per_row;
         for (int w = 0; w < width; w++) {
           *(data_out + offset_height + w * channel + c) = *(data_in++);
         }
@@ -38,10 +36,12 @@ static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
 
 template <typename Dtype>
-static Dtype find_max(Dtype* data, int num) {
+static Dtype find_max(Dtype* data, int64_t num) {
   Dtype max = 0;
   for (int i = 0; i < num; ++i) {
-    max = std::max(max, data[i]);
+    Dtype value = data[i];
+    Dtype abs = value > 0 ? value : -value;
+    max = std::max(max, abs);
   }
   return max;
 }
 
@@ -51,40 +51,36 @@
 void quantify_filter(framework::Tensor* filter) {
   DLOG << "quantilize_filter........";
   float scale = 0;
-  float fix_range = static_cast<float>((1 << (8 - 1)) - 1);
+  auto fix_range = static_cast<float>(std::pow(2, 8 - 1) - 1);
 
-  const int batch_size = filter->dims()[0];
-  const int channel = filter->dims()[1];
-  const int height = filter->dims()[2];
-  const int width = filter->dims()[3];
+  const auto batch_size = filter->dims()[0];
+  const auto channel = filter->dims()[1];
+  const auto height = filter->dims()[2];
+  const auto width = filter->dims()[3];
 
-  int8_t* int_data = nullptr;
-  int8_t* tmp_data = new int8_t[filter->numel()];
+  auto* tmp_data = new int8_t[filter->numel()];
 
   // 32bit filter -> 8bit filter;
   if (filter->type() == typeid(float)) {
-    float* float_data = filter->data<float>();
-    float max = find_max(float_data, filter->numel());
+    auto* float_data = filter->data<float>();
+    auto max = find_max(float_data, filter->numel());
 
-    scale = (max / fix_range);
+    scale = (fix_range / max);
+    DLOG << "scale:" << scale;
 
     for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = (int8_t)float_data[i] * scale;
+      tmp_data[i] = (int8_t)(float_data[i] * scale);
     }
-    int_data = filter->mutable_data<int8_t>();
   } else {
-    int8_t max = find_max(filter->data<int8_t>(), filter->numel());
-    scale = (max / fix_range);
-
-    for (int i = 0; i < filter->numel(); ++i) {
-      tmp_data[i] = filter->data<int8_t>()[i];
-    }
-    int_data = filter->mutable_data<int8_t>();
+    auto max = find_max(filter->data<int8_t>(), filter->numel());
+    scale = (fix_range / max);
+    std::memcpy(tmp_data, filter->data<int8_t>(), (size_t)filter->numel());
   }
 
   // NCHW -> NHWC;
-  chw_to_hwc(tmp_data, int_data, batch_size, channel, height, width);
+  chw_to_hwc(tmp_data, filter->mutable_data<int8_t>(), batch_size,
+             channel, height, width);
 
   delete tmp_data;
-  *(filter->fpga_args().scale_pointer()) = scale;
+  filter->SetFpgaScale(scale);
 }
 
 }  // namespace fpga
diff --git a/src/fpga/fpga_quantilization.h b/src/fpga/fpga_quantilization.h
index 4f1f6ad402a3ff4df773ecbd2121820f4c7dc265..04cb2ce7c0a6df0df2c49431e49d2c5e73d44209 100644
--- a/src/fpga/fpga_quantilization.h
+++ b/src/fpga/fpga_quantilization.h
@@ -21,10 +21,9 @@ namespace paddle_mobile {
 namespace fpga {
 
 template <typename Dtype>
-static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
-                       int height, int width);
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num,
+                       int64_t channel, int64_t height, int64_t width);
 
-// template <typename Dtype>
 void quantify_filter(framework::Tensor* filter);
 
 }  // namespace fpga
diff --git a/src/framework/tensor.h b/src/framework/tensor.h
index 797fcf5bffbe5e738fe352d1ca84602f0e5d86a0..721d4ea5e93cf305880ea124a58769f4fa99db62 100644
--- a/src/framework/tensor.h
+++ b/src/framework/tensor.h
@@ -64,7 +64,8 @@ struct SizeOfTypeFunctor {
 };
 
 static inline size_t SizeOfType(std::type_index type) {
-  SizeOfTypeFunctor
+  SizeOfTypeFunctor
       functor;
   size_t size = functor(type);
@@ -255,14 +256,26 @@ class Tensor {
 
 #ifdef PADDLE_MOBILE_FPGA
   struct FPGAArgs {
-    float scale;
+    friend class Tensor;
 
-    inline float *scale_pointer() { return &scale; }
+    inline float *scale_pointer() { return scale_; }
+    inline float scale() { return *scale_; }
+
+   private:
+    float *scale_;
   };
 
   struct FPGAArgs fpga_args() const {
-    return fpgaArgs_;
+    FPGAArgs args;
+    args.scale_ = scale.get();
+    return args;
   }
+
+  void SetFpgaScale(float s) { *(scale.get()) = s; }
+
+ private:
+  std::shared_ptr<float> scale = std::make_shared<float>(0);
+
 #endif
 
  private:
@@ -331,10 +344,6 @@ class Tensor {
    * begins.
    */
   size_t offset_;
-
-#ifdef PADDLE_MOBILE_FPGA
-  FPGAArgs fpgaArgs_;
-#endif
 };
 
 #ifdef PADDLE_MOBILE_DEBUG
diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
index 93bbfe9c1a8ae3d9930c759ba0efcef04e5e572f..e624104acf0561470e8aac827c233d0d2d1d9f66 100644
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -61,7 +61,7 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) {
   param->SetNewBias(new_bias);
 
   fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
index d5e79a39b79494d543e6e9485497a540a15152aa..d6fee838390f0efe38a539c3a9e8fc09d07a68d0 100644
--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -57,7 +57,7 @@ bool ConvAddBNReluKernel::Init(FusionConvAddBNReluParam *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
   fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
index 3b44506f65cc6700323c3d5f7d0765c9e52f7e0a..fd6379d8f3021d9d859d81f75aaba9ad761dd6ca 100644
--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -41,7 +41,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) {
   }
 
   fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp
index fd95f47a1fcb8c444172909abc67ad7f5e0de632..559b948b7b268181dcf75a4eaa40cfd9c78ef0d6 100644
--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
@@ -56,7 +56,7 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
   fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
index fbb3ca512ea863c49ca4da3f9a133f8c91897b53..cfdc85b091017aebaf99d806e6e9104cbcbe05bd 100644
--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
@@ -53,7 +53,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
   fpga::quantify_filter(filter);
-  auto filter_ptr = filter->data<float>();
+  auto filter_ptr = filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..95bcb28f9c30481bd234d83ab44b415d59388475
--- /dev/null
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -0,0 +1,54 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef SOFTMAX_OP
+
+#include "../softmax_kernel.h"
+#include "../central-arm-func/softmax_arm_func.h"
+#include "common/types.h"
+#include "fpga/api/fpga_api.h"
+#include "operators/math/softmax.h"
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam *param) {
+  const Tensor *input = param->InputX();
+  if (input->type() == typeid(half)) {
+    auto input_ptr = input->data<half>();
+    auto output_ptr = param->Out();
+    fpga::BypassArgs args;
+    args.convert_type = fpga::DATA_FP16_TO_FP32;
+    args.layout_type = fpga::LAYOUT_HWC_TO_CHW;
+    args.image.address = (void *)(input_ptr);
+    args.image.height = input->dims()[1];
+    args.image.width = input->dims()[2];
+    args.image.channels = input->dims()[3];
+    args.output.address = output_ptr;
+    param->SetFpgaArgs(args);
+  }
+
+  return true;
+}
+
+template <>
+void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam &param) const {
+  // SoftmaxCompute(param);
+}
+
+template class SoftmaxKernel<FPGA, float>;
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index af8d35c9ecfb217c71fc024722608d8df28b5090..c39d9657bc50c6dd708f0cd9fd5573642d417f21 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -580,6 +580,21 @@ class SoftmaxParam : public OpParam {
  private:
   Tensor *input_x_;
   Tensor *out_;
+
+#ifdef PADDLE_MOBILE_FPGA
+
+ private:
+  std::shared_ptr<Tensor> float_input_x_;
+  fpga::BypassArgs fpga_bypass_args;
+
+ public:
+  Tensor *FloatInput() {
+    return float_input_x_ == nullptr ? input_x_ : float_input_x_.get();
+  }
+  void SetFloatInput(Tensor *input) { float_input_x_.reset(input); }
+  const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; }
+  void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; }
+#endif
 };
 
 #endif
diff --git a/src/operators/softmax_op.cpp b/src/operators/softmax_op.cpp
index c9edfccf4ff08e5a12d735526c3d63c689711357..e85edc69c3291c794f2eeb8119b91b2926c4d870 100644
--- a/src/operators/softmax_op.cpp
+++ b/src/operators/softmax_op.cpp
@@ -34,6 +34,7 @@ REGISTER_OPERATOR_CPU(softmax, ops::SoftmaxOp);
 REGISTER_OPERATOR_MALI_GPU(softmax, ops::SoftmaxOp);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+REGISTER_OPERATOR_FPGA(softmax, ops::SoftmaxOp);
 #endif
 
 #endif
diff --git a/src/operators/softmax_op.h b/src/operators/softmax_op.h
index f645d7edf7a3b9f7a92cf286feec58e960a5e3b7..bacae23b522daf1cc689a2d7af6b14cd2bc794bb 100644
--- a/src/operators/softmax_op.h
+++ b/src/operators/softmax_op.h
@@ -55,6 +55,7 @@ USE_OP_CPU(softmax);
 USE_OP_MALI_GPU(softmax);
 #endif
 #ifdef PADDLE_MOBILE_FPGA
+USE_OP_FPGA(softmax);
 #endif
 
 #endif
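
For reference, the filter quantization that quantify_filter performs in this patch boils down to a symmetric int8 scheme plus an NCHW -> NHWC reorder: the scale stored on the tensor via SetFpgaScale is 127 / max|w|, each weight is truncated to (int8_t)(w * scale), and the conv kernels then read the quantized filter directly. The sketch below is illustrative only; it is not part of the patch or of the paddle-mobile API, and the names find_abs_max and quantize_nchw_to_nhwc are hypothetical.

// Standalone sketch of symmetric int8 filter quantization with an
// NCHW -> NHWC reorder, mirroring quantify_filter above (illustrative names).
#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

// Largest absolute value in the filter (same role as find_max above).
static float find_abs_max(const float* data, int64_t num) {
  float max = 0;
  for (int64_t i = 0; i < num; ++i) {
    max = std::max(max, std::fabs(data[i]));
  }
  return max;
}

// scale = 127 / max|w|, q = (int8_t)(w * scale), then reorder so that the
// channel becomes the innermost (fastest-varying) dimension.
static std::vector<int8_t> quantize_nchw_to_nhwc(const std::vector<float>& w,
                                                 int64_t n, int64_t c,
                                                 int64_t h, int64_t wd,
                                                 float* scale_out) {
  const float fix_range = 127.0f;  // (1 << (8 - 1)) - 1
  const float scale = fix_range / find_abs_max(w.data(), (int64_t)w.size());
  *scale_out = scale;

  std::vector<int8_t> out(w.size());
  for (int64_t in = 0; in < n; ++in)
    for (int64_t ic = 0; ic < c; ++ic)
      for (int64_t ih = 0; ih < h; ++ih)
        for (int64_t iw = 0; iw < wd; ++iw) {
          const int64_t src = ((in * c + ic) * h + ih) * wd + iw;  // NCHW
          const int64_t dst = ((in * h + ih) * wd + iw) * c + ic;  // NHWC
          out[dst] = static_cast<int8_t>(w[src] * scale);
        }
  return out;
}

int main() {
  std::vector<float> filter = {0.5f, -1.0f, 0.25f, 0.75f};  // 1x1x2x2 toy filter
  float scale = 0;
  auto q = quantize_nchw_to_nhwc(filter, 1, 1, 2, 2, &scale);
  std::printf("scale=%f q=[%d %d %d %d]\n", scale, q[0], q[1], q[2], q[3]);
  return 0;
}

Dequantization is the inverse, w roughly equals q / scale, which is why the patch flips the stored value from max / fix_range to fix_range / max and keeps it per tensor through SetFpgaScale.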