diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index 1fe62128fed08fe935fadeb98a55fc22acd4b21b..138906c790574a4a0201180b5d18cd67960a7e1d 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -104,7 +104,7 @@ int fpga_invalidate(void *address, size_t size) { } half fp32_2_fp16(float fp32_num) { - unsigned long tmp = *(unsigned long *)(&fp32_num); + unsigned long tmp = *(unsigned long *)(&fp32_num); // NOLINT half t = ((tmp & 0x007fffff) >> 13) | ((tmp & 0x80000000) >> 16) | (((tmp & 0x7f800000) >> 13) - (112 << 10)); if (tmp & 0x1000) { @@ -120,7 +120,7 @@ float fp16_2_fp32(half fp16_num) { int tmp = 0; float fp32_num; tmp = s << 16 | exp << 23 | frac << 13; - fp32_num = *(float *)&tmp; + fp32_num = *(float *)&tmp; // NOLINT return fp32_num; } @@ -347,6 +347,20 @@ void format_filter(framework::Tensor *filter_tensor, float max_value, filter_tensor->reset_data_ptr(new_data); } +void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { + filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT + filter_tensor->scale[1] = float(127.0 / max_value); // NOLINT + auto dims = filter_tensor->dims(); + auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; + auto data_ptr = filter_tensor->data(); + size_t memory_size = num * channel * height * width * sizeof(float); + auto new_data = (float *)fpga_malloc(memory_size); // NOLINT + fpga_copy(new_data, data_ptr, memory_size); + filter::format_fc_filter(&new_data, num, channel, height, width, 1, + max_value); + filter_tensor->reset_data_ptr(new_data); +} + void format_bias_scale_array(float **bias_scale_array, int element_num_per_division, int num) { bias_scale::format_bias_scale_array(bias_scale_array, diff --git a/src/fpga/api.h b/src/fpga/api.h index d1809596239ee28671e266055c78f157c02beed6..a4f71e119c83de40771f321abfc8bb2821e4523a 100644 --- a/src/fpga/api.h +++ b/src/fpga/api.h @@ -109,8 +109,8 @@ struct PoolingArgs { struct EWAddArgs { bool relu_enabled; - half const0; // output0 = const0 x input0 + const1 x input1; - half const1; + uint32_t const0; // output0 = const0 x input0 + const1 x input1; + uint32_t const1; struct ImageInputArgs image0; struct ImageInputArgs image1; struct ImageOutputArgs output; @@ -214,6 +214,7 @@ int get_aligned_filter_element_num(int chw); int get_aligned_filter_num(int num); void format_filter(framework::Tensor* filter_tensor, float max_value, int group_num); +void format_fc_filter(framework::Tensor* filter_tensor, float max_value); void format_bias_scale_array(float** bias_scale_array, int element_num_per_division, int num); void format_concat_output(framework::Tensor* out, int height, int width, diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp index a4266ff9f5e30b47f7e9118b8ec722445423714a..34e0ad6f18f8e80d636e42630e03650c018a8825 100644 --- a/src/fpga/filter.cpp +++ b/src/fpga/filter.cpp @@ -225,6 +225,45 @@ void format_filter(float **data_in, int num, int channel, int height, int width, num_after_alignment * sizeof(char)); } +void convert_fc_filter(char **data_in, int num, int chw) { + char *tmp = *data_in; + char *data_tmp = (char *)fpga_malloc(chw * num * sizeof(char)); // NOLINT + for (int n = 0; n < num; n++) { + for (int c = 0; c < chw; c++) { + data_tmp[n * chw + c] = (*data_in)[num * c + n]; + } + } + *data_in = data_tmp; + fpga_free(tmp); +} + +void format_fc_filter(float **data_in, int num, int channel, int height, + int width, int group_num, float max) { + int data_size = channel * height * width * num; + int chw = channel * height * width; + + int division_capacity = calc_division_capacity(chw); + int num_per_div_before_alignment = + calc_num_per_div(num, group_num, division_capacity); + int num_per_div_after_alignment = + align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); + int div_num = + (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; + int num_after_alignment = num_per_div_after_alignment * div_num; + + quantize(data_in, data_size, max); + + char **quantize_data = (char **)data_in; // NOLINT + + convert_fc_filter(quantize_data, num, chw); + align_element(quantize_data, num, chw); + align_num(quantize_data, num_per_div_before_alignment, num, chw); + reorder(quantize_data, num_after_alignment, chw); + interleave(quantize_data, num_after_alignment, chw); + fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * + num_after_alignment * sizeof(char)); +} + } // namespace filter } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/filter.h b/src/fpga/filter.h index 89132fabc4abee15ba8aa5e7cae8a14042cb3ad4..5d03ee9b4a0b1455b27f7c978678bd1dfaa5a698 100644 --- a/src/fpga/filter.h +++ b/src/fpga/filter.h @@ -25,7 +25,7 @@ int calc_division_capacity(int chw); int calc_split_num(int num, int division_capacity); int calc_division_number(int num, int group_num, int division_capacity); int calc_num_per_div(int num, int group_num, int division_capacity); -void convert_to_hwc(float** data_in, int num, int channel, int height, +void convert_to_hwc(char** data_in, int num, int channel, int height, int width); float find_max(float* data_in, int data_size); void quantize(float** data_in, int data_size, float max); @@ -36,6 +36,11 @@ void reorder(float** data_in, int num_after_alignment, int chw); void interleave(float** data_in, int num_after_alignment, int chw); void format_filter(float** data_in, int num, int channel, int height, int width, int group_num, float max); + +void convert_fc_filter(char** data_in, int num, int chw); +void format_fc_filter(float** data_in, int num, int channel, int height, + int width, int group_num, float max); + } // namespace filter } // namespace fpga } // namespace paddle_mobile diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index 4609438ec9fbdb5b5030b56a4bf18b9437bf7c2e..b07232867c0c66a9d064469f279dffe55b4b75bb 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -101,6 +101,11 @@ bool PaddleMobilePredictor::Run( return true; } +template +PaddleMobilePredictor::~PaddleMobilePredictor() { + paddle_mobile_->Clear(); +} + // A factory to help create difference predictor. template <> std::unique_ptr diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h index 66c6a4d5d9f8fc81b96642c6d5b62757dd581bc3..bdeb7e18653843ec9547f027068768532ba04fb2 100644 --- a/src/io/api_paddle_mobile.h +++ b/src/io/api_paddle_mobile.h @@ -32,7 +32,7 @@ namespace paddle_mobile { template class PaddleMobilePredictor : public PaddlePredictor { public: - PaddleMobilePredictor() {} + PaddleMobilePredictor() = delete; explicit PaddleMobilePredictor(const PaddleMobileConfig& config); @@ -40,7 +40,7 @@ class PaddleMobilePredictor : public PaddlePredictor { std::vector* output_data, int batch_size = -1) override; - ~PaddleMobilePredictor() override{}; + ~PaddleMobilePredictor() override; private: std::unique_ptr> paddle_mobile_; diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index 97564f4132d2e43cf736c2eb4a95d437584be24f..104ba11153cdb9b3bb5e249a771a2cd27ad7dbac 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -87,7 +87,6 @@ enum class PaddleEngineKind { class PaddlePredictor { public: struct Config; - PaddlePredictor() = default; PaddlePredictor(const PaddlePredictor&) = delete; PaddlePredictor& operator=(const PaddlePredictor&) = delete; @@ -107,6 +106,9 @@ class PaddlePredictor { struct Config { std::string model_dir; // path to the model directory. }; + + protected: + PaddlePredictor() = default; }; struct PaddleMobileConfig : public PaddlePredictor::Config { diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 052607aae7f3211da211f8aaaff5bb75a36138ce..904dd8a1da9e67d0c1283806e766d3a25dc27309 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -46,7 +46,7 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); - fpga::format_filter(filter, max_value, 1); + fpga::format_fc_filter(filter, max_value); int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index 6536f796ef2b27d33080c79cf36ac462604782be..46dae1b2a076add9f17e4e5bc6d3a99ad583fb50 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -47,7 +47,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); - fpga::format_filter(filter, max_value, 1); + fpga::format_fc_filter(filter, max_value); int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); diff --git a/src/operators/kernel/fpga/mul_kernel.cpp b/src/operators/kernel/fpga/mul_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..07aa4bcc43d28805ab0660bf89149c5ec5f1c732 --- /dev/null +++ b/src/operators/kernel/fpga/mul_kernel.cpp @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef MUL_OP + +#include "operators/kernel/mul_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool MulKernel::Init(MulParam *param) { + bool relu_enabled = false; + auto input_x = const_cast(param->InputX()); + auto filter = const_cast(param->InputY()); + auto out = param->Out(); + + PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], + "Image channel should be equal to weight number"); + int channel = (uint32_t)out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = 0; + } + int num = (uint32_t)filter->dims()[1]; + int chw = (uint32_t)filter->dims()[0]; + PADDLE_MOBILE_ENFORCE( + chw == input_x->numel(), + "Filter element num should be equal to IFM element num"); + int height = (uint32_t)input_x->dims()[2]; + int width = (uint32_t)input_x->dims()[3]; + int filter_channel = chw / height / width; + + filter->Resize(framework::make_ddim({num, filter_channel, height, width})); + float max_value = fpga::filter_find_max(filter); + fpga::format_fc_filter(filter, max_value); + + int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + fpga::format_fp16_ofm(out); + + fpga::WrapperConvArgs conv_arg = {0}; + fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, + 0, bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void MulKernel::Compute(const MulParam ¶m) const { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/mul_op.cpp b/src/operators/mul_op.cpp index a6b055b62fa25fbca2a85dfa386fa406e207b2e9..69e3bb300d741e74ab8d6eea6c62052b4d0d8f1d 100644 --- a/src/operators/mul_op.cpp +++ b/src/operators/mul_op.cpp @@ -61,5 +61,7 @@ REGISTER_OPERATOR_CPU(mul, ops::MulOp); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(mul, ops::MulOp); #endif - +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(mul, ops::MulOp); +#endif #endif diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 0c3e544bda9832888016ee304b946d53823d5324..72b39e727ccd5dad98c005e3e01034bef5582d71 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -441,6 +441,15 @@ class MulParam : OpParam { GType *out_; int x_num_col_dims_; int y_num_col_dims_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::WrapperConvArgs fpga_conv_args; + + public: + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } +#endif }; #endif diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index cca6793f10da5a0784cf8a3ba2d0104f3508028d..f850eb3e5ea3a03fe90d82c1eca2af6c9f8e9106 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -18,8 +18,9 @@ static const char *g_resnet_combine = "../models/resnet50"; int main() { DLOG << paddle_mobile::fpga::open_device(); paddle_mobile::PaddleMobile paddle_mobile; - if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model", - std::string(g_resnet_combine) + "/params", true)) { + // if (paddle_mobile.Load(std::string(g_resnet_combine) + "/model", + // std::string(g_resnet_combine) + "/params", true)) { + if (paddle_mobile.Load(std::string(g_resnet_combine), true)) { std::vector dims{1, 3, 224, 224}; Tensor input_tensor; SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0), diff --git a/test/framework/test_inference_api.cpp b/test/framework/test_inference_api.cpp index 7dec2fe29753c75ee70f31428d104450acce9404..e1713bb203dc011f0fd7c48ff3b736f48d56eb44 100644 --- a/test/framework/test_inference_api.cpp +++ b/test/framework/test_inference_api.cpp @@ -46,7 +46,12 @@ int main() { tensor_out.dtype = PaddleDType::FLOAT32; std::vector outputs(1, tensor_out); - assert(predictor->Run(paddle_tensor_feeds, &outputs)); + std::cout << " before predict " << std::endl; + + predictor->Run(paddle_tensor_feeds, &outputs); + + std::cout << " after predict " << std::endl; + // assert(); float* data_o = static_cast(outputs[0].data.data()); for (size_t j = 0; j < outputs[0].data.length() / sizeof(float); ++j) { diff --git a/test/net/test_resnet.cpp b/test/net/test_resnet.cpp index d2a4abbbfd2c023f1e8220e74f815eda44acb6db..528942456485e1abe1ff7fa833cc6b90c9a6fe86 100644 --- a/test/net/test_resnet.cpp +++ b/test/net/test_resnet.cpp @@ -52,8 +52,8 @@ int main() { #else auto time3 = time(); paddle_mobile.FeedData(input_tensor); - paddle_mobile.Predict_To(10); - paddle_mobile.Predict_From(10); + paddle_mobile.Predict_To(-1); + /*paddle_mobile.Predict_From(10); auto tensor_ptr = paddle_mobile.FetchResult(9); std::cout << "Tensor element number for op[9]: " << tensor_ptr->numel() << std::endl; @@ -63,7 +63,7 @@ int main() { auto time4 = time(); std::cout << "predict cost :" << time_diff(time3, time4) << "ms" - << std::endl; + << std::endl;*/ #endif } return 0; diff --git a/tools/op.cmake b/tools/op.cmake index 9a6ec0a147b564296d89113a2838cc6bd73975a1..898f66a634d70a5def7c7ce328a7a291d9b55c70 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -121,6 +121,7 @@ if (CON GREATER -1) set(FUSION_CONVBNRELU_OP ON) set(FUSION_CONVBN_OP ON) set(FUSION_CONVADD_OP ON) + set(MUL_OP ON) set(FOUND_MATCH ON) endif()