diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index 4a1617b4024b38b420b4d9983477970ef30a6081..9d8a1c7f6780b74481d51d2c9b3097df86f6817d 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -174,30 +174,43 @@ void format_ofm(framework::Tensor *ofm_tensor) { ofm_tensor->reset_data_ptr(fpga_malloc(memory_size)); } -void format_filter(framework::Tensor *filter_tensor, int group_num) { +float filter_find_max(framework::Tensor *filter_tensor) { + auto filter_ptr = filter_tensor->data(); + return filter::find_max(filter_ptr, filter_tensor->numel()); +} +int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) { + auto dims = filter_tensor->dims(); + PADDLE_MOBILE_ENFORCE(dims.size() == 4 || dims.size() == 2, + "Filter order should be 4 or 2"); + int chw = dims.size() == 4 ? dims[1] * dims[2] * dims[3] : dims[1]; + int num = dims.size() == 4 ? dims[0] : dims[1]; + int div_capacity = filter::calc_division_capacity(chw); + return filter::calc_num_per_div(num, group_num, div_capacity); +} + +void format_filter(framework::Tensor *filter_tensor, float max_value, + int group_num) { auto dims = filter_tensor->dims(); int num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; auto data_ptr = filter_tensor->mutable_data(); size_t memory_size = num * channel * height * width * sizeof(float); float *new_data = (float *)fpga_malloc(memory_size); fpga_copy(new_data, data_ptr, memory_size); - float max_value = filter::find_max(new_data, num * channel * height * width); filter::format_filter(&new_data, num, channel, height, width, group_num, max_value); filter_tensor->reset_data_ptr(new_data); } -void format_fc_matrix(framework::Tensor *filter_tensor, int group_num, - int height, int width) { +void format_fc_matrix(framework::Tensor *filter_tensor, float max_value, + int group_num, int height, int width) { auto dims = filter_tensor->dims(); - PADDLE_MOBILE_ENFORCE(dims[0] % (height * width) == 0, - "Filter number should be divisible by group 
number"); + PADDLE_MOBILE_ENFORCE(height == 1 && width == 1, + "IFM should be flattened for FC"); int num = dims[1], channel = dims[0] / height / width; auto data_ptr = filter_tensor->mutable_data(); size_t memory_size = num * channel * height * width * sizeof(float); float *new_data = (float *)fpga_malloc(memory_size); fpga_copy(new_data, data_ptr, memory_size); - float max_value = filter::find_max(new_data, num * channel * height * width); filter::format_filter(&new_data, num, channel, height, width, group_num, max_value); filter_tensor->reset_data_ptr(new_data); diff --git a/src/fpga/api.h b/src/fpga/api.h index 968e5db356823a8951d8c2de9031e25597a7e998..f7010e56ad4caf7bc49f5d37301009e226780da5 100644 --- a/src/fpga/api.h +++ b/src/fpga/api.h @@ -54,12 +54,6 @@ struct MemoryCopyArgs { size_t size; }; -struct BNArgs { - bool enabled; - void* bias_address; - void* scale_address; -}; - /** Conv and Pooling kernel */ @@ -178,9 +172,12 @@ int ComputeFpgaEWAdd(const struct EWAddArgs& args); static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } void format_image(framework::Tensor* image_tensor); void format_ofm(framework::Tensor* ofm_tensor); // only allocate memory -void format_filter(framework::Tensor* filter_tensor, int group_num); -void format_fc_matrix(framework::Tensor* filter_tensor, int group_num, - int height = 1, int width = 1); +float filter_find_max(framework::Tensor* filter_tensor); +int get_element_num_per_div(framework::Tensor* filter_tensor, int group_num); +void format_filter(framework::Tensor* filter_tensor, float max_value, + int group_num); +void format_fc_matrix(framework::Tensor* filter_tensor, float max_value, + int group_num, int height = 1, int width = 1); void format_bias_scale_array(float** bias_scale_array, int element_num_per_division, int num); diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp index c37d07d40e7789ed1f7012abe556bd5bc3e04f28..5f1a16d2339f3859f4cd85408c965d8d2634a55f 100644 --- 
a/src/fpga/filter.cpp +++ b/src/fpga/filter.cpp @@ -35,6 +35,11 @@ int calc_division_number(int num, int group_num, int division_capacity) { } int calc_num_per_div(int num, int group_num, int division_capacity) { + PADDLE_MOBILE_ENFORCE(num % group_num == 0, + "Filter number should be divisible by group number"); + int split_num = calc_split_num(num, division_capacity); + PADDLE_MOBILE_ENFORCE(group_num == 1 || split_num == 1, + "Split number or group number should be 1"); if (group_num == 1) { if (num > division_capacity) { return division_capacity; diff --git a/src/fpga/filter.h b/src/fpga/filter.h index 23e6d60ac1e82c4cde9e533f201aa2f2e46dc2c0..89132fabc4abee15ba8aa5e7cae8a14042cb3ad4 100644 --- a/src/fpga/filter.h +++ b/src/fpga/filter.h @@ -20,6 +20,11 @@ limitations under the License. */ namespace paddle_mobile { namespace fpga { namespace filter { + +int calc_division_capacity(int chw); +int calc_split_num(int num, int division_capacity); +int calc_division_number(int num, int group_num, int division_capacity); +int calc_num_per_div(int num, int group_num, int division_capacity); void convert_to_hwc(float** data_in, int num, int channel, int height, int width); float find_max(float* data_in, int data_size); diff --git a/src/fpga/quantization.cpp b/src/fpga/quantization.cpp deleted file mode 100644 index 44994d4c353490b533110d0965fb63b4fb5c7aa2..0000000000000000000000000000000000000000 --- a/src/fpga/quantization.cpp +++ /dev/null @@ -1,92 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
-See the License for the specific language governing permissions and -limitations under the License. */ - -#include "fpga/quantization.h" -#include - -namespace paddle_mobile { -namespace fpga { - -template -static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num, - int64_t channel, int64_t height, int64_t width) { - for (int n = 0; n < num; n++) { - int64_t amount_per_row = width * channel; - for (int c = 0; c < channel; c++) { - for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; - for (int w = 0; w < width; w++) { - *(data_out + offset_height + w * channel + c) = *(data_in++); - } - } - } - data_out += num; - } -} - -template -static Dtype find_max(Dtype* data, int64_t num) { - Dtype max = 0; - for (int i = 0; i < num; ++i) { - Dtype value = data[i]; - Dtype abs = value > 0 ? value : -value; - max = std::max(max, abs); - } - return max; -} - -// template -void quantize_filter(framework::Tensor* filter) { - DLOG << "quantilize_filter........" << filter->dims(); - - float scale = 0; - auto fix_range = static_cast(std::pow(2, 8 - 1) - 1); - - auto* tmp_data = new int8_t[filter->numel()]; - - // 32bit filter -> 8bit filter; - if (filter->type() == typeid(float)) { - auto* float_data = filter->data(); - auto max = find_max(float_data, filter->numel()); - - scale = (fix_range / max); - DLOG << "scale:" << scale; - - for (int i = 0; i < filter->numel(); ++i) { - tmp_data[i] = (int8_t)(float_data[i] * scale); - } - } else { - auto max = find_max(filter->data(), filter->numel()); - scale = (fix_range / max); - std::memcpy(tmp_data, filter->data(), (size_t)filter->numel()); - } - - if (filter->dims().size() == 4) { - const auto batch_size = filter->dims()[0]; - const auto channel = filter->dims()[1]; - const auto height = filter->dims()[2]; - const auto width = filter->dims()[3]; - chw_to_hwc(tmp_data, filter->mutable_data(), batch_size, - channel, height, width); - } else if (filter->dims().size() == 2) { - 
std::memcpy(filter->mutable_data(), tmp_data, - (size_t)filter->numel()); - } - - delete tmp_data; - filter->SetFpgaScale(scale); -} - -} // namespace fpga -} // namespace paddle_mobile diff --git a/src/fpga/quantization.h b/src/fpga/quantization.h deleted file mode 100644 index 0d6c2405fccd814f73d44eef20b6735dc0ad0eab..0000000000000000000000000000000000000000 --- a/src/fpga/quantization.h +++ /dev/null @@ -1,30 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ -#pragma once - -#include "common/types.h" -#include "framework/lod_tensor.h" -#include "framework/tensor.h" - -namespace paddle_mobile { -namespace fpga { - -template -static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int64_t num, - int64_t channel, int64_t height, int64_t width); - -void quantize_filter(framework::Tensor* filter); - -} // namespace fpga -} // namespace paddle_mobile diff --git a/src/framework/tensor.h b/src/framework/tensor.h index ea2ae9b991248144b90a44a165cb322b9c21716e..8d743c0f63eb9f3107603baeab59c41a0f95f1c2 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -328,28 +328,7 @@ class Tensor { inline void reset_data_ptr(void *p) { ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); } - - struct FPGAArgs { - friend class Tensor; - - inline float *scale_pointer() { return scale_; } - inline float scale() { return *scale_; } - - private: - float *scale_; - }; - - struct FPGAArgs fpga_args() const { - FPGAArgs args; - args.scale_ = scale.get(); - return args; - } - - void SetFpgaScale(float s) { *(scale.get()) = s; } - - private: - std::shared_ptr scale = std::make_shared(0); - + float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX #endif }; diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 1edcb8406360becba25d522df0f67836519fd1de..dad0880ea69be8449239657af66553065db05321 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -41,18 +41,19 @@ class FeedOp : public framework::OperatorBase { void Init() { Tensor *output = param_.Out(); - output->mutable_data(); + fpga::format_ofm(output); } void RunImpl() const { - const Tensor *input = param_.InputX(); + Tensor *input = const_cast(param_.InputX()); auto input_ptr = input->data(); + fpga::format_image(input); Tensor *output = param_.Out(); auto output_ptr = output->mutable_data(); - auto output_scale_address = output->fpga_args().scale_pointer(); + fpga::BypassArgs args; args.convert_type = fpga::DATA_FP32_TO_FP16; - 
args.layout_type = fpga::LAYOUT_CHW_TO_HWC; + args.layout_type = fpga::LAYOUT_NO_CONVERT; args.image.address = (void *)input_ptr; args.image.channels = input->dims()[1]; args.image.height = input->dims()[2]; @@ -60,7 +61,7 @@ class FeedOp : public framework::OperatorBase { args.image.pad_height = 0; args.image.pad_width = 0; args.output.address = output_ptr; - args.output.scale_address = output_scale_address; + args.output.scale_address = output->scale; fpga::PerformBypass(args); } diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp index 7f120fa930334194600103b7310e3e8b50adbe31..089fec9aeaee198e6dbc0bf732b061fe014ed66b 100644 --- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp @@ -16,7 +16,6 @@ limitations under the License. */ #include "operators/kernel/conv_add_bn_kernel.h" #include "fpga/api.h" -#include "fpga/quantization.h" namespace paddle_mobile { namespace operators { @@ -24,14 +23,14 @@ namespace operators { template <> bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { bool relu_enabled = false; - const Tensor *input = param->Input(); + Tensor *input = const_cast(param->Input()); auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); Tensor *filter = param->Filter(); Tensor *out = param->Output(); - auto out_ptr = out->mutable_data(); + auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -54,15 +53,23 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); new_bias_ptr[i] = bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i * 2] = new_scale_ptr[i]; - bs_ptr[i * 2 + 1] = new_bias_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; } param->SetNewScale(new_scale); 
param->SetNewBias(new_bias); - fpga::quantize_filter(filter); + float max_value = fpga::filter_find_max(filter); + fpga::format_filter(filter, max_value, param->Groups()); auto filter_ptr = filter->data(); + int element_num_per_div = + fpga::get_element_num_per_div(filter, param->Groups()); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + + fpga::format_ofm(out); + auto out_ptr = out->mutable_data(); + fpga::ConvArgs convArgs; convArgs.relu_enabled = relu_enabled; convArgs.filter_address = (void *)filter_ptr; @@ -79,9 +86,9 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { convArgs.image.width = input->dims()[3]; convArgs.image.pad_height = param->Paddings()[0]; convArgs.image.pad_width = param->Paddings()[1]; - convArgs.image.scale_address = input->fpga_args().scale_pointer(); + convArgs.image.scale_address = input->scale; convArgs.output.address = (void *)out_ptr; - convArgs.output.scale_address = out->fpga_args().scale_pointer(); + convArgs.output.scale_address = out->scale; param->SetFpgaArgs(convArgs); return true; diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp index 7bf5cd3a66c2149079ac213342e8ed7b046cfa99..030dfcad9ac0a1cb93f7be626e59adbfc1630052 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -15,7 +15,6 @@ limitations under the License. 
*/ #ifdef FUSION_CONVADDBNRELU_OP #include "operators/kernel/conv_add_bn_relu_kernel.h" -#include "fpga/quantization.h" namespace paddle_mobile { namespace operators { @@ -24,13 +23,12 @@ template <> bool ConvAddBNReluKernel::Init( FusionConvAddBNReluParam *param) { bool relu_enabled = true; - const Tensor *input = param->Input(); + Tensor *input = const_cast(param->Input()); auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); Tensor *filter = param->Filter(); Tensor *out = param->Output(); - auto out_ptr = out->mutable_data(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -52,14 +50,23 @@ bool ConvAddBNReluKernel::Init( static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); new_bias_ptr[i] = bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i * 2] = new_scale_ptr[i]; - bs_ptr[i * 2 + 1] = new_bias_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; } param->SetNewScale(new_scale); param->SetNewBias(new_bias); - fpga::quantize_filter(filter); + + float max_value = fpga::filter_find_max(filter); + fpga::format_filter(filter, max_value, param->Groups()); auto filter_ptr = filter->data(); + int element_num_per_div = + fpga::get_element_num_per_div(filter, param->Groups()); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + + fpga::format_ofm(out); + auto out_ptr = out->mutable_data(); + fpga::ConvArgs convArgs; convArgs.relu_enabled = relu_enabled; convArgs.filter_address = (void *)filter_ptr; @@ -76,9 +83,9 @@ bool ConvAddBNReluKernel::Init( convArgs.image.width = input->dims()[3]; convArgs.image.pad_height = param->Paddings()[0]; convArgs.image.pad_width = param->Paddings()[1]; - convArgs.image.scale_address = input->fpga_args().scale_pointer(); + convArgs.image.scale_address = input->scale; convArgs.output.address = (void *)out_ptr; - 
convArgs.output.scale_address = out->fpga_args().scale_pointer(); + convArgs.output.scale_address = out->scale; param->SetFpgaArgs(convArgs); return true; } diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp index 48007e4cb8e90c500d53455d4dd8095827c92831..81fc42980c17acea0f051d9dc548fd5e4c602bd5 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -15,7 +15,6 @@ limitations under the License. */ #ifdef FUSION_CONVADDRELU_OP #include "operators/kernel/conv_add_relu_kernel.h" -#include "fpga/quantization.h" namespace paddle_mobile { namespace operators { @@ -23,26 +22,33 @@ namespace operators { template <> bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { bool relu_enabled = true; - const Tensor *input = param->Input(); + Tensor *input = const_cast(param->Input()); auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); Tensor *filter = param->Filter(); Tensor *out = param->Output(); - auto out_ptr = out->mutable_data(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); for (int i = 0; i < channel; i++) { - bs_ptr[i * 2] = 1; - bs_ptr[i * 2 + 1] = bias_ptr[i]; + bs_ptr[i + channel] = 1; + bs_ptr[i] = bias_ptr[i]; } - fpga::quantize_filter(filter); + float max_value = fpga::filter_find_max(filter); + fpga::format_filter(filter, max_value, param->Groups()); auto filter_ptr = filter->data(); + int element_num_per_div = + fpga::get_element_num_per_div(filter, param->Groups()); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + + fpga::format_ofm(out); + auto out_ptr = out->mutable_data(); + fpga::ConvArgs convArgs; convArgs.relu_enabled = relu_enabled; convArgs.filter_address = (void *)filter_ptr; @@ 
-60,9 +66,9 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { convArgs.image.pad_height = param->Paddings()[0]; convArgs.image.pad_width = param->Paddings()[1]; - convArgs.image.scale_address = input->fpga_args().scale_pointer(); + convArgs.image.scale_address = input->scale; convArgs.output.address = (void *)out_ptr; - convArgs.output.scale_address = out->fpga_args().scale_pointer(); + convArgs.output.scale_address = out->scale; param->SetFpgaArgs(convArgs); return true; } diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp index 49c5f92beb1f12f2b73d31b22c27cd4dd38e115f..997c0a754404bcda6334e8f9d068243bfd7102b0 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -16,7 +16,6 @@ limitations under the License. */ #include "operators/kernel/conv_bn_kernel.h" #include "fpga/api.h" -#include "fpga/quantization.h" namespace paddle_mobile { namespace operators { @@ -24,12 +23,11 @@ namespace operators { template <> bool ConvBNKernel::Init(FusionConvBNParam *param) { bool relu_enabled = false; - const Tensor *input = param->Input(); + Tensor *input = const_cast(param->Input()); auto input_ptr = input->data(); Tensor *filter = param->Filter(); Tensor *out = param->Output(); - auto out_ptr = out->mutable_data(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -50,14 +48,23 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { new_scale_ptr[i] = bn_scale_ptr[i] / static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i * 2] = new_scale_ptr[i]; - bs_ptr[i * 2 + 1] = new_bias_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; } param->SetNewScale(new_scale); param->SetNewBias(new_bias); - fpga::quantize_filter(filter); + + float max_value = 
fpga::filter_find_max(filter); + fpga::format_filter(filter, max_value, param->Groups()); auto filter_ptr = filter->data(); + int element_num_per_div = + fpga::get_element_num_per_div(filter, param->Groups()); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + + fpga::format_ofm(out); + auto out_ptr = out->mutable_data(); + fpga::ConvArgs convArgs; convArgs.relu_enabled = relu_enabled; convArgs.filter_address = (void *)filter_ptr; @@ -74,9 +81,9 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { convArgs.image.width = input->dims()[3]; convArgs.image.pad_height = param->Paddings()[0]; convArgs.image.pad_width = param->Paddings()[1]; - convArgs.image.scale_address = input->fpga_args().scale_pointer(); + convArgs.image.scale_address = input->scale; convArgs.output.address = (void *)out_ptr; - convArgs.output.scale_address = out->fpga_args().scale_pointer(); + convArgs.output.scale_address = out->scale; param->SetFpgaArgs(convArgs); return true; diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp index 6bbe1b8763160993da5edb96162e54c8ab688d14..e0865b4a7cfcf0422393ee56fb4f4d370eb9abcf 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -15,7 +15,6 @@ limitations under the License. 
*/ #ifdef FUSION_CONVBNRELU_OP #include "operators/kernel/conv_bn_relu_kernel.h" -#include "fpga/quantization.h" namespace paddle_mobile { namespace operators { @@ -23,11 +22,10 @@ namespace operators { template <> bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { bool relu_enabled = true; - const Tensor *input = param->Input(); + Tensor *input = const_cast(param->Input()); auto input_ptr = input->data(); Tensor *filter = param->Filter(); Tensor *out = param->Output(); - auto out_ptr = out->mutable_data(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -47,14 +45,23 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { new_scale_ptr[i] = bn_scale_ptr[i] / static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i]; - bs_ptr[i * 2] = new_scale_ptr[i]; - bs_ptr[i * 2 + 1] = new_bias_ptr[i]; + bs_ptr[i + channel] = new_scale_ptr[i]; + bs_ptr[i] = new_bias_ptr[i]; } param->SetNewScale(new_scale); param->SetNewBias(new_bias); - fpga::quantize_filter(filter); + + float max_value = fpga::filter_find_max(filter); + fpga::format_filter(filter, max_value, param->Groups()); auto filter_ptr = filter->data(); + int element_num_per_div = + fpga::get_element_num_per_div(filter, param->Groups()); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + + fpga::format_ofm(out); + auto out_ptr = out->mutable_data(); + fpga::ConvArgs convArgs; convArgs.relu_enabled = relu_enabled; convArgs.filter_address = (void *)filter_ptr; @@ -71,9 +78,9 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { convArgs.image.width = input->dims()[3]; convArgs.image.pad_height = param->Paddings()[0]; convArgs.image.pad_width = param->Paddings()[1]; - convArgs.image.scale_address = input->fpga_args().scale_pointer(); + convArgs.image.scale_address = input->scale; convArgs.output.address = (void 
*)out_ptr; - convArgs.output.scale_address = out->fpga_args().scale_pointer(); + convArgs.output.scale_address = out->scale; param->SetFpgaArgs(convArgs); return true; } diff --git a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp index a71801680a25ae0b21ed5b2a2738c4b593e7c0ac..573e15a80bda57ebf5217d4f47ead5d2d5165688 100644 --- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp @@ -22,11 +22,12 @@ template <> bool ElementwiseAddReluKernel::Init( ElementwiseAddReluParam *param) { bool relu_enabled = true; - const Tensor *input_x = param->InputX(); - const Tensor *input_y = param->InputY(); + Tensor *input_x = const_cast(param->InputX()); + Tensor *input_y = const_cast(param->InputY()); Tensor *out = param->Out(); auto input_x_ptr = input_x->data(); auto input_y_ptr = input_y->data(); + fpga::format_ofm(out); auto out_ptr = out->mutable_data(); fpga::EWAddArgs ewaddArgs; @@ -35,21 +36,19 @@ bool ElementwiseAddReluKernel::Init( ewaddArgs.const1 = 1; ewaddArgs.image0.address = (void *)input_x_ptr; ewaddArgs.image0.channels = input_x->dims()[1]; - ewaddArgs.image0.scale_address = - input_x->fpga_args().scale_pointer(); // ew has scale attribute?? + ewaddArgs.image0.scale_address = input_x->scale; ewaddArgs.image0.height = input_x->dims()[2]; ewaddArgs.image0.width = input_x->dims()[3]; ewaddArgs.image0.pad_height = 0; ewaddArgs.image0.pad_width = 0; ewaddArgs.image1.address = (void *)input_y_ptr; ewaddArgs.image1.channels = input_y->dims()[1]; - ewaddArgs.image1.scale_address = - input_y->fpga_args().scale_pointer(); // ew has scale attribute?? 
+ ewaddArgs.image1.scale_address = input_y->scale; ewaddArgs.image1.height = input_y->dims()[2]; ewaddArgs.image1.width = input_y->dims()[3]; ewaddArgs.image1.pad_height = 0; ewaddArgs.image1.pad_width = 0; - ewaddArgs.output.scale_address = out->fpga_args().scale_pointer(); + ewaddArgs.output.scale_address = out->scale; ewaddArgs.output.address = (void *)out_ptr; param->SetFpgaArgs(ewaddArgs); return true; diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 52c8c71537cd3d11eb3bfab43c78a5ad79d0db37..1065eea0160066cbe9f1efcc0131cc45f08378d6 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -15,7 +15,6 @@ limitations under the License. */ #include "operators/kernel/fc_relu_kernel.h" #include "fpga/api.h" -#include "fpga/quantization.h" namespace paddle_mobile { namespace operators { @@ -23,26 +22,42 @@ namespace operators { template <> bool FusionFcReluKernel::Init(FusionFcReluParam *param) { bool relu_enabled = true; - const Tensor *input_x = param->InputX(); + Tensor *input_x = const_cast(param->InputX()); auto input_x_ptr = input_x->data(); Tensor *input_y = param->InputY(); const Tensor *input_z = param->InputZ(); auto input_z_ptr = input_z->data(); Tensor *out = param->Out(); - auto out_ptr = out->mutable_data(); PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0], "Image channel should be equal to weight number"); int channel = out->dims()[1]; float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); for (int i = 0; i < channel; i++) { - bs_ptr[i * 2] = 1; - bs_ptr[i * 2 + 1] = input_z_ptr[i]; + bs_ptr[i + channel] = 1; + bs_ptr[i] = input_z_ptr[i]; } - fpga::quantize_filter(input_y); + int num = input_y->dims()[1]; + int chw = input_y->dims()[0]; + PADDLE_MOBILE_ENFORCE( + chw == input_x->numel(), + "Filter element num should be equal to IFM element num"); + int height = input_x->dims()[2]; + int width = 
input_x->dims()[3]; + int filter_channel = chw / height / width; + + input_y->Resize(framework::make_ddim({num, filter_channel, height, width})); + float max_value = fpga::filter_find_max(input_y); + fpga::format_filter(input_y, max_value, 1); auto input_y_ptr = input_y->data(); + int element_num_per_div = fpga::get_element_num_per_div(input_y, 1); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + + fpga::format_ofm(out); + auto out_ptr = out->mutable_data(); + fpga::ConvArgs convArgs; convArgs.relu_enabled = relu_enabled; convArgs.filter_address = (void *)input_y_ptr; @@ -59,11 +74,9 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { convArgs.image.width = input_x->dims()[3]; convArgs.image.pad_height = 0; convArgs.image.pad_width = 0; - convArgs.image.scale_address = - input_x->fpga_args().scale_pointer(); // fc input has scale attribute?? + convArgs.image.scale_address = input_x->scale; convArgs.output.address = (void *)out_ptr; - convArgs.output.scale_address = - out->fpga_args().scale_pointer(); // fc output has scale attribute?? + convArgs.output.scale_address = out->scale; param->SetFpgaArgs(convArgs); return true; diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index 199b6b6878ad2c838e3b3d12d8e92a70ea541dad..7a83fa65980a3055ea8b80caa8ca83caf750e8cc 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -14,7 +14,6 @@ limitations under the License. 
*/ #ifdef FUSION_FC_OP #include "operators/kernel/fusion_fc_kernel.h" -#include "fpga/quantization.h" namespace paddle_mobile { namespace operators { @@ -22,26 +21,42 @@ namespace operators { template <> bool FusionFcKernel::Init(FusionFcParam *param) { bool relu_enabled = false; - const Tensor *input_x = param->InputX(); + Tensor *input_x = const_cast(param->InputX()); auto input_x_ptr = input_x->data(); Tensor *input_y = param->InputY(); const Tensor *input_z = param->InputZ(); auto input_z_ptr = input_z->data(); Tensor *out = param->Out(); - auto out_ptr = out->mutable_data(); PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0], "Image channel should be equal to weight number"); int channel = out->dims()[1]; float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); for (int i = 0; i < channel; i++) { - bs_ptr[i * 2] = 1; - bs_ptr[i * 2 + 1] = input_z_ptr[i]; + bs_ptr[i + channel] = 1; + bs_ptr[i] = input_z_ptr[i]; } - fpga::quantize_filter(input_y); + int num = input_y->dims()[1]; + int chw = input_y->dims()[0]; + PADDLE_MOBILE_ENFORCE( + chw == input_x->numel(), + "Filter element num should be equal to IFM element num"); + int height = input_x->dims()[2]; + int width = input_x->dims()[3]; + int filter_channel = chw / height / width; + + input_y->Resize(framework::make_ddim({num, filter_channel, height, width})); + float max_value = fpga::filter_find_max(input_y); + fpga::format_filter(input_y, max_value, 1); auto input_y_ptr = input_y->data(); + int element_num_per_div = fpga::get_element_num_per_div(input_y, 1); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + + fpga::format_ofm(out); + auto out_ptr = out->mutable_data(); + fpga::ConvArgs convArgs; convArgs.relu_enabled = relu_enabled; convArgs.filter_address = (void *)input_y_ptr; @@ -58,9 +73,9 @@ bool FusionFcKernel::Init(FusionFcParam *param) { convArgs.image.width = input_x->dims()[3]; convArgs.image.pad_height = 0; convArgs.image.pad_width = 0; - 
convArgs.image.scale_address = input->fpga_args().scale_pointer(); + convArgs.image.scale_address = input->scale; convArgs.output.address = (void *)out_ptr; - convArgs.output.scale_address = out->fpga_args().scale_pointer(); + convArgs.output.scale_address = out->scale; param->SetFpgaArgs(convArgs); return true; } diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/pool_kernel.cpp index 1565c1519d0048d027fb860f80f313179188c4d5..98c8da84d7875e1ffe5d6127b6bbc49ac907f923 100644 --- a/src/operators/kernel/fpga/pool_kernel.cpp +++ b/src/operators/kernel/fpga/pool_kernel.cpp @@ -21,9 +21,10 @@ namespace operators { template <> bool PoolKernel::Init(PoolParam *param) { - const Tensor *input = param->Input(); + Tensor *input = const_cast(param->Input()); auto input_ptr = input->data(); Tensor *output = param->Output(); + fpga::format_ofm(output); auto output_ptr = output->mutable_data(); vector ksize = param->Ksize(); vector strides = param->Strides(); @@ -36,7 +37,9 @@ bool PoolKernel::Init(PoolParam *param) { poolArgs.image.width = input->dims()[3]; poolArgs.image.pad_height = paddings[0]; poolArgs.image.pad_width = paddings[1]; + poolArgs.image.scale_address = input->scale; poolArgs.output.address = output_ptr; + poolArgs.output.scale_address = output->scale; poolArgs.kernel.height = ksize[0]; poolArgs.kernel.width = ksize[1]; poolArgs.kernel.stride_h = strides[0];