diff --git a/src/fpga/fpga_quantilization.cpp b/src/fpga/fpga_quantilization.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..34033a60a683183695a79bfafbaf14223e2eebf2
--- /dev/null
+++ b/src/fpga/fpga_quantilization.cpp
@@ -0,0 +1,95 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#include "fpga/fpga_quantilization.h"
+#include <algorithm>
+#include <cmath>
+#include <cstdlib>
+
+namespace paddle_mobile {
+namespace fpga {
+
+template <typename Dtype>
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
+                       int height, int width) {
+  for (int n = 0; n < num; n++) {
+    int amount_per_row = width * channel;
+    for (int c = 0; c < channel; c++) {
+      for (int h = 0; h < height; h++) {
+        int offset_height = h * amount_per_row;
+        for (int w = 0; w < width; w++) {
+          *(data_out + offset_height + w * channel + c) = *(data_in++);
+        }
+      }
+    }
+    // advance to the next image of the batch in the output
+    data_out += channel * height * width;
+  }
+}
+
+template <typename Dtype>
+static Dtype find_max(Dtype* data, int num) {
+  Dtype max = 0;
+  for (int i = 0; i < num; ++i) {
+    // track the largest absolute value so negative weights stay in range
+    max = std::max(max, static_cast<Dtype>(std::abs(data[i])));
+  }
+  return max;
+}
+
+framework::Tensor* quantilize_filter(framework::Tensor* filter) {
+  float scale = 0;
+  float fix_range = static_cast<float>((1 << (8 - 1)) - 1);
+
+  const int batch_size = filter->dims()[0];
+  const int channel = filter->dims()[1];
+  const int height = filter->dims()[2];
+  const int width = filter->dims()[3];
+
+  int8_t* int_data = nullptr;
+  int8_t* tmp_data = new int8_t[filter->numel()];
+
+  // 32bit filter -> 8bit filter;
+  if (filter->type() == typeid(float)) {
+    float* float_data = filter->data<float>();
+    float max = find_max(float_data, filter->numel());
+
+    scale = (max / fix_range);
+
+    framework::Tensor* quant_filter = new framework::Tensor();
+    int_data = quant_filter->mutable_data<int8_t>(filter->dims());
+    for (int i = 0; i < filter->numel(); ++i) {
+      // quantize: w / scale, truncated toward zero, so max |w| maps to 127
+      tmp_data[i] = static_cast<int8_t>(float_data[i] / scale);
+    }
+    filter = quant_filter;
+  } else {
+    int8_t max = find_max(filter->data<int8_t>(), filter->numel());
+    scale = (max / fix_range);
+
+    int_data = filter->data<int8_t>();
+    for (int i = 0; i < filter->numel(); ++i) {
+      tmp_data[i] = int_data[i];
+    }
+    int_data = filter->mutable_data<int8_t>();
+  }
+  // NCHW -> NHWC;
+  chw_to_hwc(tmp_data, int_data, batch_size, channel, height, width);
+  delete[] tmp_data;
+  *(filter->fpga_args().scale_pointer()) = scale;
+  return filter;
+}
+
+}  // namespace fpga
+}  // namespace paddle_mobile
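Note on the scheme above: quantilize_filter keeps a single scale per tensor, scale = max|w| / 127, stores w / scale as int8, and leaves the scale where the FPGA can multiply results back. A minimal, framework-free sketch of that round trip (all names below are illustrative, not part of the patch):

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>
#include <vector>

int main() {
  std::vector<float> w = {0.50f, -1.27f, 0.02f, 1.00f};

  // Per-tensor scale: the largest absolute weight maps to +/-127.
  float max_abs = 0;
  for (float v : w) max_abs = std::max(max_abs, std::fabs(v));
  const float scale = max_abs / 127.0f;

  for (float v : w) {
    int8_t q = static_cast<int8_t>(v / scale);  // quantize (truncates toward zero)
    float back = q * scale;                     // dequantize
    std::printf("%+.4f -> %+4d -> %+.4f\n", v, q, back);
  }
  return 0;
}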
diff --git a/src/fpga/fpga_quantilization.h b/src/fpga/fpga_quantilization.h
index 7a1df04732580c7225423cedeb277beca3edc154..8dacd20abdc85da05a451ec763fd01f03f8f4516 100644
--- a/src/fpga/fpga_quantilization.h
+++ b/src/fpga/fpga_quantilization.h
@@ -18,35 +18,13 @@ limitations under the License. */
 #include "framework/tensor.h"
 
 namespace paddle_mobile {
+namespace fpga {
 
 template <typename Dtype>
-framework::Tensor* quantilize_filter(framework::Tensor* filter) {
-  float scale = 0;
-  // 32bit filter -> 8bit filter;
-  float min = 0f;
-  float max = 0f;
-  if (filter->type() == typeid(float)) {
-    float* floatData = originalFilter->data<float>();
-    for (int i = 0; i < filter->numel(); ++i) {
-      min = std::min(min, floatData[i]);
-      max = std::max(max, floatData[i]);
-    }
-
-    float fix_range = (float)((1 << (8 - 1)) - 1);
-    float float_range = max;
-    scale = (float_range / fix_range);
-
-    framework::Tensor* originalFilter = filter;
-    framework::Tensor* quantFilter = new framework::Tensor();
-    int8_t* intData = quantFilter->mutable_data<int8_t>();
-    for (int i = 0; i < filter->numel(); ++i) {
-      intData[i] = (int8_t)floatData[i] * scale;
-    }
-    quantFilter.scale = scale;
-    // NCHW -> NHWC;
-    return quantFilter;
-  }
-  return filter;
-}
+static void chw_to_hwc(Dtype* data_in, Dtype* data_out, int num, int channel,
+                       int height, int width);
+
+framework::Tensor* quantilize_filter(framework::Tensor* filter);
+
+}  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp
index ae1270b146373587287140116114970963dcca7c..c6e04787a58bc437bf0738cf67072426f1cbaa57 100644
--- a/src/operators/kernel/fpga/concat_kernel.cpp
+++ b/src/operators/kernel/fpga/concat_kernel.cpp
@@ -36,18 +36,18 @@ void ConcatKernel<FPGA, float>::Compute(const ConcatParam &param) const {
   auto out_channel = out_dim[3];
   auto out_offset = 0;
-
   for (int i = 0; i < inputs.size(); ++i) {
     auto input = inputs[i];
     auto channels = input->dims()[3];
     out_offset += channels;
     auto src = input->data<half>();
     for (int j = 0; j < pixels; ++j) {
-      auto dst = out->data<half>() + out_offset;
+      auto dst = out->mutable_data<half>() + out_offset;
       memory::Copy(dst, src, sizeof(half));
     }
   }
 }
+template class ConcatKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
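For reference, the intended result of the FPGA concat kernel above: with NHWC tensors, concatenation along the channel axis places each input's channels side by side within every pixel. A standalone sketch of that layout arithmetic (array names are illustrative, not the kernel's):

#include <cstdio>
#include <vector>

// Concatenate two NHWC tensors along the channel axis: for every pixel,
// copy input A's channels, then input B's channels.
int main() {
  const int pixels = 2, ca = 2, cb = 1;
  std::vector<float> a = {1, 2, 3, 4};  // pixels x ca
  std::vector<float> b = {9, 8};        // pixels x cb
  std::vector<float> out(pixels * (ca + cb));

  for (int p = 0; p < pixels; ++p) {
    for (int c = 0; c < ca; ++c) out[p * (ca + cb) + c] = a[p * ca + c];
    for (int c = 0; c < cb; ++c) out[p * (ca + cb) + ca + c] = b[p * cb + c];
  }

  for (float v : out) std::printf("%g ", v);  // prints: 1 2 9 3 4 8
  std::printf("\n");
  return 0;
}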
diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
index 6719db3a80cb3c3a2ee603096b2659fa5489497d..3240a8d6b9604d0876691b641c072bc596312dbd 100644
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -16,6 +16,7 @@ limitations under the License. */
 
 #include "operators/kernel/conv_add_bn_kernel.h"
 #include "fpga/api/fpga_api.h"
+#include "fpga/fpga_quantilization.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -28,7 +29,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
   const Tensor *filter = param->Filter();
-  auto filter_ptr = filter->data<float>();
+
   Tensor *out = param->Output();
   auto out_ptr = out->mutable_data<float>();
   auto bn_mean_ptr = param->InputMean()->data<float>();
@@ -41,7 +42,8 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
                     "Image channel should be equal to bias number");
 
   const int channel = input->dims()[1];
-  float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
+  float *bs_ptr =
+      reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
   Tensor *new_scale = new Tensor();
   Tensor *new_bias = new Tensor();
   auto new_scale_ptr = new_scale->mutable_data<float>({channel});
@@ -58,26 +60,33 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
 
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
 
+  Tensor *quant_filter = quantilize_filter(const_cast<Tensor *>(filter));
+
+  // delete original filter?
+  filter = quant_filter;
+
+  auto filter_ptr = quant_filter->data<int8_t>();
 
   fpga::ConvArgs convArgs;
   convArgs.relu_enabled = relu_enabled;
-  convArgs.filter_address = (void *)filter_ptr;
+  convArgs.filter_address = reinterpret_cast<void *>(filter_ptr);
   convArgs.filter_num = filter->dims()[0];
   convArgs.group_num = param->Groups();
-  convArgs.sb_address = (void *)bs_ptr;
+  convArgs.sb_address = reinterpret_cast<void *>(bs_ptr);
   convArgs.kernel.stride_h = param->Strides()[0];
   convArgs.kernel.stride_w = param->Strides()[1];
   convArgs.kernel.height = filter->dims()[2];
   convArgs.kernel.width = filter->dims()[3];
-  convArgs.image.address = (void *)input_ptr;
+  convArgs.image.address = reinterpret_cast<void *>(input_ptr);
   convArgs.image.channels = input->dims()[1];
   convArgs.image.height = input->dims()[2];
   convArgs.image.width = input->dims()[3];
   convArgs.image.pad_height = param->Paddings()[0];
   convArgs.image.pad_width = param->Paddings()[1];
   convArgs.image.scale_address = input->fpga_args().scale_pointer();
-  convArgs.output.address = (void *)out_ptr;
+  convArgs.output.address = reinterpret_cast<void *>(out_ptr);
   convArgs.output.scale_address = out->fpga_args().scale_pointer();
   param->SetFpgaArgs(convArgs);
+
   return true;
 }
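The elided Init lines above compute new_scale/new_bias before they are packed into bs_ptr. Assuming the usual folding of batch norm into a per-channel affine (variable names here are placeholders, not necessarily the kernel's):

#include <cmath>
#include <cstdio>

int main() {
  const int channel = 3;
  float bn_scale[channel] = {1.0f, 0.5f, 2.0f};
  float bn_bias[channel] = {0.1f, 0.0f, -0.2f};
  float mean[channel] = {0.0f, 1.0f, -1.0f};
  float variance[channel] = {1.0f, 4.0f, 0.25f};
  float conv_bias[channel] = {0.0f, 0.1f, 0.2f};
  const float epsilon = 1e-5f;

  // Fold y = bn_scale * (conv + conv_bias - mean) / sqrt(var + eps) + bn_bias
  // into y = new_scale * conv + new_bias, one pair per output channel.
  for (int c = 0; c < channel; ++c) {
    float inv_std = 1.0f / std::sqrt(variance[c] + epsilon);
    float new_scale = bn_scale[c] * inv_std;
    float new_bias = bn_bias[c] + (conv_bias[c] - mean[c]) * new_scale;
    std::printf("c=%d new_scale=%f new_bias=%f\n", c, new_scale, new_bias);
  }
  return 0;
}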
diff --git a/src/operators/kernel/fpga/conv_kernel.cpp b/src/operators/kernel/fpga/conv_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..91d0f393fcc1018bacd507c5f7975f7b3a2a56ca
--- /dev/null
+++ b/src/operators/kernel/fpga/conv_kernel.cpp
@@ -0,0 +1,38 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef CONV_OP
+
+#include "operators/kernel/conv_kernel.h"
+#include "operators/kernel/central-arm-func/conv_arm_func.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool ConvKernel<FPGA, float>::Init(ConvParam *param) {
+  return true;
+}
+
+template <>
+void ConvKernel<FPGA, float>::Compute(const ConvParam &param) const {
+  // ConvCompute<float>(param);
+}
+
+template class ConvKernel<FPGA, float>;
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt
index ee1d12bfd6be13d67fd8360be2ab5c8d7f86e662..f4a14f1bc4197051594a0f8609b4662ad4c7cefb 100644
--- a/test/CMakeLists.txt
+++ b/test/CMakeLists.txt
@@ -160,4 +160,12 @@ else ()
 
     #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp)
+
+
+endif()
+
+if(FPGA)
+    ADD_EXECUTABLE(test-tensor-quant fpga/test_tensor_quant.cpp test_helper.h test_include.h executor_for_test.h)
+    target_link_libraries(test-tensor-quant paddle-mobile)
+
 
 endif()
diff --git a/test/fpga/test_tensor_quant.cpp b/test/fpga/test_tensor_quant.cpp
index 1e30b9be551c608c5200460ebb80526270da5aed..3835c395a4764c3c978b6bba9c1af48305be1d58 100644
--- a/test/fpga/test_tensor_quant.cpp
+++ b/test/fpga/test_tensor_quant.cpp
@@ -20,7 +20,7 @@ int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
   bool optimize = false;
   if (paddle_mobile.Load(g_googlenet, optimize)) {
-    auto time2 = time();
+    auto time1 = time();
     DLOG << "load cost: " << time_diff(time1, time1) << "ms";
     std::vector<float> input;
     std::vector<int64_t> dims{1, 3, 224, 224};
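On the test fix above: time_diff(time1, time1) still measures nothing, since both stamps are taken after Load returns; the pattern needs one timestamp on each side of the call. A self-contained sketch of the intended measurement, using std::chrono in place of the test_helper time()/time_diff() helpers:

#include <chrono>
#include <cstdio>
#include <thread>

int main() {
  auto time1 = std::chrono::steady_clock::now();
  // stand-in for the expensive call being timed, e.g. paddle_mobile.Load(...)
  std::this_thread::sleep_for(std::chrono::milliseconds(50));
  auto time2 = std::chrono::steady_clock::now();

  double ms = std::chrono::duration<double, std::milli>(time2 - time1).count();
  std::printf("load cost: %.1fms\n", ms);
  return 0;
}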