diff --git a/src/operators/activation_op.cpp b/src/operators/activation_op.cpp index 76c9e1a014bc0e51b032d8516ba9448fa25b2aa5..d98fcb92297dd485fc3e59cfe592e00672f4ffca 100644 --- a/src/operators/activation_op.cpp +++ b/src/operators/activation_op.cpp @@ -58,6 +58,7 @@ REGISTER_OPERATOR_CPU(relu6, ops::Relu6Op); REGISTER_OPERATOR_MALI_GPU(relu, ops::ReluOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(relu, ops::ReluOp); #endif #ifdef PADDLE_MOBILE_CL REGISTER_OPERATOR_CL(relu, ops::ReluOp); diff --git a/src/operators/elementwise_mul_op.cpp b/src/operators/elementwise_mul_op.cpp index 335a908ace54664f0bcbca37bdcde30047edee5d..3417fedbb2b8717355e1a7492321ecd5d7c6a9c3 100644 --- a/src/operators/elementwise_mul_op.cpp +++ b/src/operators/elementwise_mul_op.cpp @@ -36,6 +36,7 @@ REGISTER_OPERATOR_CPU(elementwise_mul, ops::ElementwiseMulOp); REGISTER_OPERATOR_MALI_GPU(elementwise_mul, ops::ElementwiseMulOp); #endif #ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(elementwise_mul, ops::ElementwiseMulOp); #endif #endif diff --git a/src/operators/fusion_fc_relu_op.cpp b/src/operators/fusion_fc_relu_op.cpp index b19e94cf9a8255b7e9d860cdd17fcfa76274aa02..e11da8814b3a5ef3b128be944965fb97d6142da8 100644 --- a/src/operators/fusion_fc_relu_op.cpp +++ b/src/operators/fusion_fc_relu_op.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef FUSION_FC_RELU_OP +#ifdef FUSION_FCRELU_OP #include "operators/fusion_fc_relu_op.h" namespace paddle_mobile { diff --git a/src/operators/kernel/elementwise_mul_kernel.h b/src/operators/kernel/elementwise_mul_kernel.h index 54baa50fcafb8ddbbefecb635ea85f120f16250d..f71b6257d5c978735198f1b42e15f3f454eb8787 100644 --- a/src/operators/kernel/elementwise_mul_kernel.h +++ b/src/operators/kernel/elementwise_mul_kernel.h @@ -17,7 +17,6 @@ limitations under the License. */ #pragma once #include "framework/operator.h" -#include "operators/math/elementwise_op_function.h" #include "operators/op_param.h" namespace paddle_mobile { diff --git a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp index a830996524cba9ff05259bf7ccf3a55c99749a87..c549e5a6eee98f38f1806367054b925440e3ebf1 100644 --- a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp @@ -15,55 +15,176 @@ limitations under the License. */ #include "operators/kernel/elementwise_add_kernel.h" +#include +#include "fpga/V1/api.h" + namespace paddle_mobile { namespace operators { template <> bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { - // bool relu_enabled = false; - paddle_mobile::fpga::ActivationType activation_enable = - paddle_mobile::fpga::NONE; - int16_t leaky_relu_negative_slope = 0; - auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); - fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); - - fpga::EWAddArgs ewaddArgs = {0}; - // ewaddArgs.relu_enabled = relu_enabled; - ewaddArgs.output.activation.activation_type = activation_enable; - ewaddArgs.output.activation.leaky_relu_negative_slope = - leaky_relu_negative_slope; - ewaddArgs.const0 = 0x3c00; // =1 - ewaddArgs.const1 = 0x3c00; // =1 - ewaddArgs.image0.address = input_x_ptr; - ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; - ewaddArgs.image0.scale_address = input_x->scale; - ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; - ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; - ewaddArgs.image0.pad_height = 0; - ewaddArgs.image0.pad_width = 0; - ewaddArgs.image1.address = input_y_ptr; - ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; - ewaddArgs.image1.scale_address = input_y->scale; - ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; - ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; - ewaddArgs.image1.pad_height = 0; - ewaddArgs.image1.pad_width = 0; - ewaddArgs.output.scale_address = out->scale; - ewaddArgs.output.address = out_ptr; - fpga::expand_EW_arg(&ewaddArgs); - param->SetFpgaArgs(ewaddArgs); + if (input_y->type() != typeid(float)) { + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::NONE; + int16_t leaky_relu_negative_slope = 0; + auto *input_x = const_cast(param->InputX()); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); + fpga::format_fp16_ofm(out); + auto out_ptr = out->mutable_data(); + + fpga::EWAddArgs ewaddArgs = {0}; + // ewaddArgs.relu_enabled = relu_enabled; + ewaddArgs.output.activation.activation_type = activation_enable; + ewaddArgs.output.activation.leaky_relu_negative_slope = + leaky_relu_negative_slope; + ewaddArgs.const0 = 0x3c00; // =1 + ewaddArgs.const1 = 0x3c00; // =1 + ewaddArgs.image0.address = input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; + ewaddArgs.image0.scale_address = input_x->scale; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; + ewaddArgs.image0.pad_height = 0; + ewaddArgs.image0.pad_width = 0; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; + ewaddArgs.image1.scale_address = input_y->scale; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; + ewaddArgs.image1.pad_height = 0; + ewaddArgs.image1.pad_width = 0; + ewaddArgs.output.scale_address = out->scale; + ewaddArgs.output.address = out_ptr; + fpga::expand_EW_arg(&ewaddArgs); + param->SetFpgaArgs(ewaddArgs); + } else { + param->float_input_x.Resize(param->InputX()->dims()); + param->float_input_x.init(typeid(float)); + fpga::format_fp32_ofm(&(param->float_input_x)); + + param->float_out.Resize(param->InputX()->dims()); + // param->float_out.init(typeid(float)); + param->float_out.mutable_data(param->InputX()->dims()); + fpga::format_fp32_ofm(&(param->float_out)); + + fpga::format_fp16_ofm(out); + } return true; } +inline void ElementwiseAddCompute(const ElementwiseAddParam ¶m) { + auto input_x = param.float_input_x; + auto input_y = param.InputY(); + auto Out = param.float_out; + int axis = param.Axis(); + + const auto &x_dims = input_x.dims(); + const auto &y_dims = input_y->dims(); + /// axis = -1 represent the last dimensions. + axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis); + size_t batch = 1; + size_t channels = 1; + size_t elementwise_num = 1; + for (int i = 0; i < axis; ++i) { + batch *= x_dims[i]; + } + for (int i = 0; i < y_dims.size(); ++i) { + channels *= y_dims[i]; + } + for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) { + elementwise_num *= x_dims[i]; + } + const float *bias_data = input_y->data(); + const float *input_data = input_x.data(); + float *output_data = Out.mutable_data(); + for (int i = 0; i < batch; ++i) { + for (int j = 0; j < channels; ++j) { + size_t offset = (i * channels + j) * elementwise_num; + const float *input = input_data + offset; + const float bias = bias_data[j]; + float *output = output_data + offset; + // DLOG << "output address: "<< output; + for (int k = 0; k < elementwise_num; ++k) { + output[k] = input[k] + bias; + // DLOG << "output[" << k << "]= " << output[k] ; + } + } + } +} template <> void ElementwiseAddKernel::Compute( const ElementwiseAddParam ¶m) { - fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + auto input_y = const_cast(param.InputY()); + if (input_y->type() != typeid(float)) { + fpga::ComputeFpgaEWAdd(param.FpgaArgs()); + } else { + auto input_x = const_cast(param.InputX()); + auto intput_x_float = const_cast(&(param.float_input_x)); + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = input_x->data(); + args.image.channels = (uint32_t)(input_x->fpga_data_num); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = intput_x_float->data(); + args.output.scale_address = intput_x_float->scale; + + // fpga::fpga_flush(input_x->data(),input_x->fpga_data_num * + // sizeof(half)); + fpga::PerformBypass(args); + fpga::fpga_invalidate(args.output.address, + input_x->fpga_data_num * sizeof(float)); + + // just for test + /* { + static int cnt = 0; + if(cnt == 0){ + std::string str= "first_bypass_data"; + float rslt = 0.0f; + fpga::savefile(str, args.output.address, input_x->fpga_data_num, + rslt); cnt++; + } + }*/ + ElementwiseAddCompute(param); + + auto out_float = const_cast(&(param.float_out)); + DLOG << "out float: " << out_float->data(); + fpga::fpga_flush(out_float->data(), + input_x->fpga_data_num * sizeof(float)); + // just for test + /*{ + static int cnt = 0; + if(cnt == 0){ + std::string str= "ew_output_data"; + float rslt = 0.0f; + + fpga::savefile(str, out_float->data(), input_x->fpga_data_num, + rslt); cnt++; + } + }*/ + auto Out = param.Out(); + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = out_float->data(); + args.image.channels = (uint32_t)(input_x->fpga_data_num); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = Out->data(); + args.output.scale_address = Out->scale; + fpga::PerformBypass(args); + } } } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e421ddb78ff4f1a0f0c51c985db9c26666001d03 --- /dev/null +++ b/src/operators/kernel/fpga/V1/elementwise_mul_kernel.cpp @@ -0,0 +1,93 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ELEMENTWISEMUL_OP + +#include "operators/kernel/elementwise_mul_kernel.h" +#include "operators/math/elementwise_op_function.h" + +namespace paddle_mobile { +namespace operators { + +template +struct MulFunctor { + inline T operator()(T a, T b) const { return a * b; } +}; +template <> +bool ElementwiseMulKernel::Init(ElementwiseMulParam *param) { + param->float_input_x.Resize(param->InputX()->dims()); + param->float_input_x.init(typeid(float)); + fpga::format_fp32_ofm(&(param->float_input_x)); + + param->float_out.Resize(param->InputX()->dims()); + param->float_out.init(typeid(float)); + fpga::format_fp32_ofm(&(param->float_out)); + + auto *out = param->Out(); + fpga::format_fp16_ofm(out); + return true; +} + +template <> +void ElementwiseMulKernel::Compute( + const ElementwiseMulParam ¶m) { + auto input_x = const_cast(param.InputX()); + auto intput_x_float = const_cast(&(param.float_input_x)); + // auto intput_x_32_ptr = + // const_cast(param.float_input_x.data()); + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = input_x->data(); + args.image.channels = (uint32_t)(input_x->fpga_data_num); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = intput_x_float->data(); + args.output.scale_address = intput_x_float->scale; + fpga::PerformBypass(args); + fpga::fpga_invalidate(args.output.address, + input_x->fpga_data_num * sizeof(float)); + + auto input_y = param.InputY(); + int axis = param.Axis(); + auto out_float = const_cast(&(param.float_out)); + ElementwiseComputeEx, float>( + intput_x_float, input_y, axis, MulFunctor(), out_float); + fpga::fpga_flush(out_float->data(), + input_x->fpga_data_num * sizeof(float)); + + Tensor *Out = param.Out(); + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.input_layout_type = fpga::LAYOUT_CHW; + args.output_layout_type = fpga::LAYOUT_HWC; + args.image.address = out_float->data(); + args.image.channels = (uint32_t)(Out->fpga_data_num); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = Out->data(); + args.output.scale_address = Out->scale; + fpga::PerformBypass(args); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp index d32375f1c66b8db5c3ae933ec5a1b00cdb508d5f..5010ac7ad41850e41b58a897ae2e969b7831e90d 100644 --- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -62,15 +62,27 @@ void FetchKernel::Compute(const FetchParam ¶m) { output->ShareDataWith(*input); return; } + fpga::BypassArgs args = param.fpga_bypass_args; auto input_address = (input->data()); args.image.address = static_cast(input_address); + float *outdata_ptr = + reinterpret_cast(param.fpga_bypass_args.output.address); + const int num_th = 32; + if ((param.Out()->fpga_data_num) < num_th) { + fpga::fpga_invalidate(input_address, (input->fpga_data_num) * sizeof(half)); + + for (int idx = 0; idx < product(input->dims()); ++idx) { + outdata_ptr[idx] = fpga::fp16_2_fp32(input_address[idx]); + } + return; + } + fpga::PerformBypass(args); auto outC = param.Out()->dims()[1]; auto outH = param.Out()->dims()[2]; auto outW = param.Out()->dims()[3]; - float *outdata_ptr = - reinterpret_cast(param.fpga_bypass_args.output.address); + fpga::fpga_invalidate(param.fpga_bypass_args.output.address, param.Out()->fpga_data_num * sizeof(float)); diff --git a/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp index fadeae324ff8f5160bc5ff410c2e02b09539a01e..6669ff2ccdcea028000a2e12b82721cc442d9271 100644 --- a/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp @@ -30,8 +30,8 @@ bool FusionFcKernel::Init(FusionFcParam *param) { auto input_z_ptr = input_z->data(); auto out = param->Out(); - PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], - "Image channel should be equal to weight number"); + // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], + // "Image channel should be equal to weight number"); int channel = (uint32_t)out->dims()[1]; auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT diff --git a/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp b/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fbeb63fe606aac014f76088210c74a4118e6c78 --- /dev/null +++ b/src/operators/kernel/fpga/V1/fusion_fc_relu_kernel.cpp @@ -0,0 +1,75 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#ifdef FUSION_FCRELU_OP + +#include "operators/kernel/fc_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool FusionFcReluKernel::Init(FusionFcReluParam *param) { + // bool relu_enabled = false; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::LEAKYRELU; + int16_t leaky_relu_negative_slope = 0; + auto input_x = const_cast(param->InputX()); + auto filter = const_cast(param->InputY()); + const Tensor *input_z = param->InputZ(); + auto input_z_ptr = input_z->data(); + auto out = param->Out(); + + // PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], + // "Image channel should be equal to weight number"); + int channel = (uint32_t)out->dims()[1]; + auto bs_ptr = + (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT + for (int i = 0; i < channel; i++) { + bs_ptr[i + channel] = 1; + bs_ptr[i] = input_z_ptr[i]; + } + int num = (uint32_t)filter->dims()[1]; + int chw = (uint32_t)filter->dims()[0]; + PADDLE_MOBILE_ENFORCE( + chw == input_x->numel(), + "Filter element num should be equal to IFM element num"); + int height = (uint32_t)input_x->dims()[2]; + int width = (uint32_t)input_x->dims()[3]; + int filter_channel = chw / height / width; + + out->Resize(framework::make_ddim({1, channel, 1, 1})); + filter->Resize(framework::make_ddim({num, filter_channel, height, width})); + float max_value = fpga::filter_find_max(filter); + fpga::format_fc_filter(filter, max_value); + + int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); + fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + fpga::format_fp16_ofm(out); + + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable, + leaky_relu_negative_slope, 1, 1, 1, 0, 0, bs_ptr); + param->SetFpgaArgs(conv_arg); + return true; +} + +template <> +void FusionFcReluKernel::Compute( + const FusionFcReluParam ¶m) { + fpga::ComputeFpgaConv(param.FpgaArgs()); +} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp index 0489d86da5335b9abbc487f115875307b5d95990..a86e011b61d6dab79159cbb4d34cabab747a1d3b 100644 --- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp +++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp @@ -503,8 +503,10 @@ void ProposalKernel::Compute(const ProposalParam ¶m) { auto score_index = *(param.score_index_.get()); int pre_nms_top_n = param.pre_nms_topn_; - int post_nms_top_n = 100; // param.post_nms_topn_; - float nms_thresh = param.nms_thresh_; + int post_nms_top_n = param.post_nms_topn_; + // DLOG << " param.post_nms_topn_ : " << param.post_nms_topn_; + + float nms_thresh = param.nms_thresh_ / 2.0f; float min_size = param.min_size_; float eta = param.eta_; diff --git a/src/operators/kernel/fpga/V1/relu_kernel.cpp b/src/operators/kernel/fpga/V1/relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..6fff10f6206212379e865f2041e3d35cca955bfd --- /dev/null +++ b/src/operators/kernel/fpga/V1/relu_kernel.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef RELU_OP + +#include "operators/kernel/activation_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ReluKernel::Init(ReluParam *param) { + param->Out()->ShareDataWith(*param->InputX()); + return true; +} + +template <> +void ReluKernel::Compute(const ReluParam ¶m) {} +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/op_param.cpp b/src/operators/op_param.cpp index 4d1689911686198612eb4df4dfe8f99450ba503d..be12ecbe73c9b3c1fe7ea6b7380288cbe42838b1 100644 --- a/src/operators/op_param.cpp +++ b/src/operators/op_param.cpp @@ -12,7 +12,7 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#include "op_param.h" +#include "operators/op_param.h" namespace paddle_mobile { namespace operators { @@ -47,6 +47,9 @@ template class ConvParam; template class ElementwiseAddParam; template class ElementwiseAddParam; template class ElementwiseAddParam; +template class ElementwiseMulParam; +template class ElementwiseMulParam; +template class ElementwiseMulParam; #ifdef MUL_OP template class MulParam; diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 4492c4ba6d56d41ee253fde0ac96933fa851dc77..5b9a2ea8f2a4b1841d583e602e4746b87d71cea6 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -563,6 +563,10 @@ class ElementwiseAddParam : public OpParam { public: const fpga::EWAddArgs &FpgaArgs() const { return fpga_EW_add_args; } void SetFpgaArgs(const fpga::EWAddArgs &args) { fpga_EW_add_args = args; } + + public: + Tensor float_input_x, float_out; + #endif }; @@ -596,6 +600,12 @@ class ElementwiseMulParam : public OpParam { GType *input_y_; GType *out_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + public: + Tensor float_input_x, float_out; + +#endif }; #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 138362f20892cb1b5db9bf0a2c83baec79f5f0f4..167b374de91ee4224578e4fc30c18fb8e2e5ea9d 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -82,7 +82,9 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-rfcn-api fpga/test_rfcn_api.cpp) target_link_libraries(test-rfcn-api paddle-mobile) - + + ADD_EXECUTABLE(test-marker2 fpga/test_marker2.cpp test_helper.h test_include.h executor_for_test.h ) + target_link_libraries(test-marker2 paddle-mobile) set(FOUND_MATCH ON) endif () diff --git a/test/fpga/test_marker2.cpp b/test/fpga/test_marker2.cpp new file mode 100644 index 0000000000000000000000000000000000000000..b4af515c7313325bffa7ba0ec465b985d6eb75b8 --- /dev/null +++ b/test/fpga/test_marker2.cpp @@ -0,0 +1,181 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif +#include +#ifdef COST_TIME_PRINT +#include +#include +#include +#endif +void readStream(std::string filename, char *buf) { + std::ifstream in; + in.open(filename, std::ios::in | std::ios::binary); + if (!in.is_open()) { + std::cout << "open File Failed." << std::endl; + return; + } + + in.seekg(0, std::ios::end); // go to the end + auto length = in.tellg(); // report location (this is the length) + in.seekg(0, std::ios::beg); // go back to the beginning + in.read(buf, length); + DLOG << length; + in.close(); +} + +void convert_to_chw(int16_t **data_in, int channel, int height, int width, + int num, int16_t *data_tmp) { + int64_t amount_per_side = width * height; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + n * amount_per_side * channel + c * amount_per_side + + width * h + w) = *((*data_in)++); + } + } + } + } +} + +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum, bool use_chw) { + // bool use_chw = true; + if (input_tensor.dims().size() != 4) return; + int c = (input_tensor.dims())[1]; + int h = (input_tensor.dims())[2]; + int w = (input_tensor.dims())[3]; + int n = (input_tensor.dims())[0]; + auto data_ptr = input_tensor.get_data(); + auto *data_ptr_16 = reinterpret_cast(data_ptr); + auto data_tmp = data_ptr_16; + if (use_chw) { + data_tmp = + reinterpret_cast(malloc(n * c * h * w * sizeof(int16_t))); + convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp); + } + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); + out << result << std::endl; + } + out.close(); + if (data_tmp != data_ptr_16) { + free(data_tmp); + } +} + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} + +void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum, + bool use_chw) { + static int i = 0; + if (input_tensor.numel() == 0) { + return; + } + if (input_tensor.type() == typeid(float)) { + DLOG << "op: " << i++ << ", float data " << input_tensor.numel(); + dump_stride_float(filename, input_tensor, dumpnum); + } else { + DLOG << "op: " << i++ << ", half data " << input_tensor.numel(); + dump_stride_half(filename, input_tensor, dumpnum, use_chw); + } + DLOG << "dump input address: " << input_tensor.get_data(); +} + +static const char *g_marker_combine = "../models/marker/marker_2segment"; +// static const char *g_marker_combine = "../models/marker/model2"; +static const char *g_image_src_float = + "../models/marker/marker_2segment/marker_2.bin"; +// static const char *g_image_src_float = "../models/marker/model2/data.bin"; +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + + if (paddle_mobile.Load(std::string(g_marker_combine) + "/model", + std::string(g_marker_combine) + "/params", true, false, + 1, true)) { + // if (paddle_mobile.Load(std::string(g_marker_combine), true)) { + float img_info[3] = {432, 1280, 1.0f}; + auto img = reinterpret_cast( + fpga::fpga_malloc(144 * 14 * 14 * sizeof(float))); + readStream(g_image_src_float, reinterpret_cast(img)); + + std::vector v(3, nullptr); + paddle_mobile.FeedData({img}); + // paddle_mobile.Predict_To(-1); +#ifdef COST_TIME_PRINT + timeval start11, end11; + long dif_sec, dif_usec; // NOLINT +#endif + +#ifdef COST_TIME_PRINT + gettimeofday(&start11, NULL); +#endif + + paddle_mobile.Predict_To(-1); + +#ifdef COST_TIME_PRINT + gettimeofday(&end11, NULL); + dif_sec = end11.tv_sec - start11.tv_sec; + dif_usec = end11.tv_usec - start11.tv_usec; + std::cout << "total: " + << " cost time: " << (dif_sec * 1000000 + dif_usec) << " us" + << std::endl; +#endif + + for (int i = 0; i < 8; i++) { + auto tensor_ptr = paddle_mobile.FetchResult(i); + std::string saveName = "marker_" + std::to_string(i); + // if(i != 58) + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), + tensor_ptr->numel() * sizeof(float)); + // tensor_ptr->numel() * sizeof(float)); + + dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), + true); // 20);//tensor_ptr->numel()); + } + + // paddle_mobile.GetResults(&v); + DLOG << "Computation done"; + fpga::fpga_free(img); + } + + return 0; +} diff --git a/tools/op.cmake b/tools/op.cmake index 83d972d3b208fe680fa1bee311be41dd316b82c6..aed5ce3260a293c23d05c9b85e73174dc2860857 100755 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -138,6 +138,9 @@ if (CON GREATER -1) set(CONV_TRANSPOSE_OP ON) set(FUSION_DECONVADDBNRELU_OP ON) set(FUSION_DECONVADDBN_OP ON) + set(ELEMENTWISEMUL_OP ON) + set(FUSION_FCRELU_OP ON) + set(RELU_OP ON) set(FOUND_MATCH ON) endif()