diff --git a/src/common/types.cpp b/src/common/types.cpp old mode 100644 new mode 100755 index 93e3ee516a59a1615b738793d06f3c35557243dc..8b996fa5511a6d8e1b10b5a0aa13e820ee643c26 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -105,12 +105,14 @@ const char *G_OP_TYPE_FUSION_DECONV_ADD_RELU = "fusion_deconv_add_relu"; const char *G_OP_TYPE_SEQUENCE_EXPAND = "sequence_expand"; const char *G_OP_TYPE_SEQUENCE_POOL = "sequence_pool"; const char *G_OP_TYPE_SEQUENCE_SOFTMAX = "sequence_softmax"; - const char *G_OP_TYPE_SLICE = "slice"; const char *G_OP_TYPE_ANCHOR_GENERATOR = "anchor_generator"; const char *G_OP_TYPE_GENERATE_PROPOSALS = "generate_proposals"; const char *G_OP_TYPE_PSROI_POOL = "psroi_pool"; const char *G_OP_TYPE_ROI_PERSPECTIVE = "roi_perspective_transform"; +const char *G_OP_TYPE_PAD2D = "pad2d"; +const char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU = "fusion_deconv_add_bn_relu"; +const char *G_OP_TYPE_FUSION_DECONV_ADD_BN = "fusion_deconv_add_bn"; std::unordered_map< std::string, std::pair, std::vector>> @@ -210,5 +212,8 @@ std::unordered_map< {{"Scores", "BboxDeltas", "ImInfo", "Anchors", "Variances"}, {"RpnRois", "RpnRoiProbs"}}}, {G_OP_TYPE_PSROI_POOL, {{"X", "ROIs"}, {"Out"}}}, - {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}}}; + {G_OP_TYPE_ROI_PERSPECTIVE, {{"X", "ROIs"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DECONV_ADD_BN, {{"Input"}, {"Out"}}}, + {G_OP_TYPE_PAD2D, {{"X"}, {"Out"}}}}; } // namespace paddle_mobile diff --git a/src/common/types.h b/src/common/types.h old mode 100644 new mode 100755 index 9c189d5921546ebaaf3d058a47858157864e13ae..12f5253a74043a8609004520d68f1137c387f37d --- a/src/common/types.h +++ b/src/common/types.h @@ -199,6 +199,9 @@ extern const char *G_OP_TYPE_ANCHOR_GENERATOR; extern const char *G_OP_TYPE_GENERATE_PROPOSALS; extern const char *G_OP_TYPE_PSROI_POOL; extern const char *G_OP_TYPE_ROI_PERSPECTIVE; +extern const char *G_OP_TYPE_PAD2D; +extern const 
char *G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; +extern const char *G_OP_TYPE_FUSION_DECONV_ADD_BN; extern std::unordered_map< std::string, std::pair, std::vector>> diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 5c960bbea7f8e65053998a29cd72d7b78f2fb97a..9607961c4785f631afb4b5e207ebff2c8e33623e 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -162,7 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { fpga_copy(new_data, data_ptr, memory_size); filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr); filter_tensor->reset_data_ptr(new_data); - filter_tensor->set_type(typeid(int8_t)); + filter_tensor->set_type(typeid(int16_t)); } void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr, @@ -396,8 +396,8 @@ void expand_conv_arg(ConvArgs *arg) { // auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; auto cmd = 0UL | USE_BIAS; - auto deconv_param = ((args.deconv_tx_param.deconv_en) << 24) | - ((args.deconv_tx_param.sub_conv_num) << 16) | + auto deconv_param = ((args.deconv_tx_param.deconv_en) << 16) | + ((args.deconv_tx_param.sub_conv_num) << 8) | ((args.deconv_tx_param.omit_size) << 0); (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); @@ -623,7 +623,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga::format_fp16_ofm(out, dims_out_new); auto out_ptr = out->data(); arg->output.address = - out_ptr + + (half *)out_ptr + // NOLINT omit_size * sizeof(half) * (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); arg->output.scale_address = out->scale; @@ -713,6 +713,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, } for (int j = 0; j < split_num; ++j) { + // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled; arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type = activation_enable; 
arg->split_conv_args[i] @@ -758,9 +759,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num, FILTER_NUM_ALIGNMENT) * sizeof(int8_t); - auto filter_head = - &filter_ptr[j * element_num * filter_num_per_div + // NOLINT - i * filter_sub_conv_offset]; + auto filter_head = &(( + int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT + i * filter_sub_conv_offset]; arg->split_conv_args[i]->conv_arg[j].filter_address = fpga_malloc(filter_size); arg->split_conv_args[i]->vector_conv_space.push_back( @@ -774,6 +775,19 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga_flush(arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size); + /*{ + static int cnt = 0; + std::string str = "deconv_filter"; + if(cnt <= 1){ + cnt++; + str += std::to_string(cnt); + int8_t result = 0; + fpga::savefile(str, + arg->split_conv_args[i]->conv_arg[j].filter_address, filter_size, result); + } + + }*/ + size_t bs_align_num = align_to_x( arg->split_conv_args[i]->conv_arg[j].filter_num, BS_NUM_ALIGNMENT); size_t bs_size = 2 * bs_align_num * sizeof(float); @@ -789,6 +803,20 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, memcpy(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_head, bs_size); fpga_flush(arg->split_conv_args[i]->conv_arg[j].sb_address, bs_size); + /* { + static int cnt = 0; + std::string str = "deconv_sb"; + if(cnt <= 1){ + cnt++; + str += std::to_string(cnt); + float result = 0; + fpga::savefile(str, + arg->split_conv_args[i]->conv_arg[j].sb_address, 2 * bs_align_num, + result); + } + + }*/ + if (split_num == 1) { arg->split_conv_args[i]->conv_arg[j].output.address = arg->split_conv_args[i]->output.address; @@ -835,13 +863,10 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int stride_h, int stride_w, int padding_h, int padding_w, float *bias_ptr) { - auto deleter = 
[](void *p) { fpga_free(p); }; - arg->vector_dwconv_space.push_back( - std::shared_ptr(reinterpret_cast(bias_ptr), deleter)); - - auto filter_ptr = filter->data(); + auto filter_ptr = filter->data(); auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); + auto output_ptr = out->data(); + arg->sub_conv_num = 1; // arg->relu_enabled = relu_enabled; arg->output.activation.activation_type = activation_enable; diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index ebba4f3eaf7ff822bae240f8565b4b5f86f1a796..833decef5808e3a1fe9f63a6d1008ea890247c73 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -21,15 +21,37 @@ namespace paddle_mobile { namespace fpga { namespace image { -void convert_to_hwc(float **data_in, int channel, int height, int width) { +void convert_to_hwc(float **data_in, int channel, int height, int width, + int num) { + float *data_tmp = reinterpret_cast( + fpga_malloc(num * channel * height * width * sizeof(float))); + int64_t amount_per_row = width * channel; + for (int n = 0; n < num; n++) { + for (int c = 0; c < channel; c++) { + for (int h = 0; h < height; h++) { + int64_t offset_height = h * amount_per_row; + for (int w = 0; w < width; w++) { + *(data_tmp + n * channel * height * width + offset_height + + w * channel + c) = *((*data_in)++); + } + } + } + } + *data_in = data_tmp; +} + +void convert_to_chw(float **data_in, int channel, int height, int width, + int num) { float *data_tmp = (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT - int64_t amount_per_row = width * channel; - for (int c = 0; c < channel; c++) { + int64_t amount_per_side = width * height; + for (int n = 0; n < num; n++) { for (int h = 0; h < height; h++) { - int64_t offset_height = h * amount_per_row; for (int w = 0; w < width; w++) { - *(data_tmp + offset_height + w * channel + c) = *((*data_in)++); + for (int c = 0; c < channel; c++) { + *(data_tmp + n * height * width * channel + c * amount_per_side + + width 
* h + w) = *((*data_in)++); + } } } } @@ -55,7 +77,7 @@ void align_element_conv(float **data_in, int height, int cw) { } void format_image(float **data_in, int channel, int height, int width) { - convert_to_hwc(data_in, channel, height, width); + // convert_to_hwc(data_in, channel, height, width); int cw = channel * width; int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); if (align_cw != cw) { @@ -132,8 +154,8 @@ void split_image(int16_t *image_in, const float *scale_in, void **images_out, for (int i = 0; i < image_num; i++) { des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) + w * channel_nums[i]; - memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, - channel_nums[i] * sizeof(int16_t)); + memcpy(reinterpret_cast(images_out[i]) + des_offset, + image_in + src_offset, channel_nums[i] * sizeof(int16_t)); src_offset += channel_nums[i]; } } diff --git a/src/fpga/V1/image.h b/src/fpga/V1/image.h index f3c7b2731cb555c0c8871f6cd1d9f9df3e6429f2..c81de8f4554d9d2a9396bf587ec7ab10806e856a 100644 --- a/src/fpga/V1/image.h +++ b/src/fpga/V1/image.h @@ -20,7 +20,11 @@ namespace paddle_mobile { namespace fpga { namespace image { -void convert_to_hwc(float** data_in, int channel, int height, int width); +void convert_to_hwc(float** data_in, int channel, int height, int width, + int num = 1); +void convert_to_chw(float** data_in, int channel, int height, int width, + int num = 1); + void align_element_conv(float** data_in, int height, int cw); void format_image(float** data_in, int channel, int height, int width); diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index bc01c37751ef0e2acee1cf469c015b321d9c9680..49507dc75dbcbd1bac9385ed6fab14b694c8f7be 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -13,8 +13,8 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "framework/operator.h" +#include #include "operators/op_param.h" - namespace paddle_mobile { namespace framework { @@ -70,7 +70,12 @@ void OperatorBase::Run() { auto vari = this->scope_->FindVar(var_vec_in[i]); if (vari->IsInitialized()) { const Tensor *tensor = vari->template Get(); - if (tensor) DLOG << type_ << " input- " << key << "=" << *tensor; + if (tensor) { + DLOG << type_ << " input- " << key << "=" << *tensor; +#ifdef PADDLE_MOBILE_FPGA + DLOG << var_vec_in[i]; +#endif + } } } } @@ -80,7 +85,12 @@ void OperatorBase::Run() { auto vari = scope_->FindVar(var_vec_out[i]); if (vari->IsInitialized()) { const Tensor *tensor = vari->template Get(); - if (tensor) DLOG << type_ << " output- " << key << "=" << *tensor; + if (tensor) { + DLOG << type_ << " output- " << key << "=" << *tensor; +#ifdef PADDLE_MOBILE_FPGA + DLOG << var_vec_out[i]; +#endif + } } } } diff --git a/src/framework/operator.h b/src/framework/operator.h index 1c7605944a77e4f8d6d4ea033e3d460030653217..ae51280f0afc8135836dbe76350ee130944708e8 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -15,6 +15,7 @@ limitations under the License. 
*/ #pragma once #include +#include #include #include #include @@ -80,7 +81,9 @@ class OperatorBase { } #ifdef PADDLE_MOBILE_FPGA void InsertTensors(); + void ChangeNameMap(string key, std::vector value); #endif + protected: std::shared_ptr scope_; std::string type_; @@ -95,6 +98,7 @@ class OperatorBase { template class OperatorWithKernel : public OperatorBase { public: +#ifndef PADDLE_MOBILE_FPGA1 OperatorWithKernel(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope) @@ -104,6 +108,25 @@ class OperatorWithKernel : public OperatorBase { kernel_.InitCLHelper(scope->GetCLScpoe()); #endif } +#else + OperatorWithKernel(const std::string &type, const VariableNameMap inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + std::shared_ptr scope) + : OperatorBase(type, inputs, outputs, attrs, scope) { + static int feed_num = 0; + static int fetch_num = 0; + if (type == "feed") { + auto new_name = string("feed") + std::to_string(feed_num++); + auto var = scope->Var(new_name); + (const_cast(inputs)).at("X") = {string(new_name)}; + } else if (type == "fetch") { + auto new_name = string("fetch") + std::to_string(fetch_num++); + auto var = scope->Var(new_name); + (const_cast(outputs)).at("Out") = {string(new_name)}; + } + param_ = ParamType(inputs, outputs, attrs, *scope); + } +#endif virtual void RunImpl() { this->kernel_.Compute(this->param_); } virtual void InferShape() const = 0; diff --git a/src/framework/scope.cpp b/src/framework/scope.cpp index 5ddb71aaf700b96b0630c1d0a4a8779f3ac1ddcb..db263081446f9804e5352588063a23f72a8bf163 100644 --- a/src/framework/scope.cpp +++ b/src/framework/scope.cpp @@ -126,6 +126,8 @@ std::vector Scope::VarContain(const std::string substring) { return v; } +void Scope::InsertVar(const std::string str, Variable *var) {} + void Scope::print_vars() { DLOG << "====================start to print variables================="; for (auto pair : vars_) { 
diff --git a/src/framework/scope.h b/src/framework/scope.h index c85a09979607316149de711440b3228a655e49b7..d9e3a179e0aae9f93947df60cea410d3eb5cb128 100644 --- a/src/framework/scope.h +++ b/src/framework/scope.h @@ -86,6 +86,7 @@ class Scope { #ifdef PADDLE_MOBILE_FPGA Variable *Var(const std::string &name, const int id); std::vector VarContain(const std::string substring); + void InsertVar(const std::string str, Variable *var); void print_vars(); #endif diff --git a/src/operators/fusion_deconv_add_bn_op.cpp b/src/operators/fusion_deconv_add_bn_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..cb22e29f0903259d7bcf46271fb2a8bd70ba8eb7 --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_op.cpp @@ -0,0 +1,33 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBN_OP + +#include "operators/fusion_deconv_add_bn_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_deconv_add_bn, ops::FusionDeconvAddBNMatcher); +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn, ops::FusionDeconvAddBNOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_add_bn_op.h b/src/operators/fusion_deconv_add_bn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..f7f9b9e2094a7228c944b70b88ae3105ae9f37e8 --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_op.h @@ -0,0 +1,116 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_DECONVADDBN_OP +#pragma once +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/deconv_add_bn_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionDeconvAddBNMatcher : public framework::FusionOpMatcher { + public: + FusionDeconvAddBNMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_BATCHNORM); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}, + {"Y", "BNY"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN; } +}; + +template +class FusionDeconvAddBNOp : public framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNParam, + operators::DeconvAddBNKernel> { + public: + FusionDeconvAddBNOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNParam, + operators::DeconvAddBNKernel>(type, inputs, outputs, + attrs, scope) {} + + void InferShape() const { + auto input = this->param_.Input(); + auto in_dims = input->dims(); + + auto filter = this->param_.Filter(); + auto filter_dims = filter->dims(); + + std::vector strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + std::vector dilations = this->param_.Dilations(); + + int groups = this->param_.Groups(); + + PADDLE_MOBILE_ENFORCE( + in_dims.size() == 4 || in_dims.size() == 5, + "ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_MOBILE_ENFORCE( + 
in_dims.size() == filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims[1] == filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - + 2 * paddings[i] + filter_extent); + } + this->param_.Output()->Resize(framework::make_ddim(output_shape)); + } + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // FUSION_DECONV_ADD_BN_OP diff --git a/src/operators/fusion_deconv_add_bn_relu_op.cpp b/src/operators/fusion_deconv_add_bn_relu_op.cpp new file mode 100755 index 0000000000000000000000000000000000000000..b7e9abe660b350e9d3ccc89aef685505a7449a9f --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_relu_op.cpp @@ -0,0 +1,34 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVADDBNRELU_OP + +#include "operators/fusion_deconv_add_bn_relu_op.h" + +namespace paddle_mobile { +namespace operators {} +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_deconv_add_bn_relu, + ops::FusionDeconvAddBNReluMatcher); +#ifdef PADDLE_MOBILE_CPU +#endif +#ifdef PADDLE_MOBILE_MALI_GPU +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(fusion_deconv_add_bn_relu, ops::FusionDeconvAddBNReluOp); +#endif + +#endif diff --git a/src/operators/fusion_deconv_add_bn_relu_op.h b/src/operators/fusion_deconv_add_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..97070ef01e544839be8eab6ddba21c43dfa9a26e --- /dev/null +++ b/src/operators/fusion_deconv_add_bn_relu_op.h @@ -0,0 +1,118 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ +#ifdef FUSION_DECONVADDBNRELU_OP +#pragma once +#include +#include + +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/deconv_add_bn_relu_kernel.h" + +namespace paddle_mobile { +namespace operators { +using std::string; +using std::vector; +class FusionDeconvAddBNReluMatcher : public framework::FusionOpMatcher { + public: + FusionDeconvAddBNReluMatcher() { + node_ = framework::Node(G_OP_TYPE_CONV_TRANSPOSE); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_BATCHNORM) > + std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}, {"X", "X"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "Scale"}, + {"Mean", "Mean"}, + {"Bias", "Bias"}, + {"Variance", "Variance"}, + {"Y", "BNY"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_DECONV_ADD_BN_RELU; } +}; + +template +class FusionDeconvAddBNReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNReluParam, + operators::DeconvAddBNReluKernel> { + public: + FusionDeconvAddBNReluOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDeconvAddBNReluParam, + operators::DeconvAddBNReluKernel>( + type, inputs, outputs, attrs, scope) {} + + void InferShape() const { + auto input = this->param_.Input(); + auto in_dims = input->dims(); + + auto filter = this->param_.Filter(); + auto filter_dims = filter->dims(); + + std::vector strides = this->param_.Strides(); + std::vector paddings = this->param_.Paddings(); + std::vector dilations = this->param_.Dilations(); + + int groups = this->param_.Groups(); + + PADDLE_MOBILE_ENFORCE( + in_dims.size() == 4 || in_dims.size() == 5, + 
"ConvTransposeOp intput should be 4-D or 5-D tensor."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() == filter_dims.size(), + "ConvTransposeOp input dimension and filter dimension " + "should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims.size() - strides.size() == 2U, + "ConvTransposeOp input dimension and strides dimension should " + "be consistent."); + PADDLE_MOBILE_ENFORCE(paddings.size() == strides.size(), + "ConvTransposeOp paddings dimension and strides " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE(paddings.size() == dilations.size(), + "ConvTransposeOp paddings dimension and dilations " + "dimension should be the same."); + PADDLE_MOBILE_ENFORCE( + in_dims[1] == filter_dims[0], + "In ConvTransposeOp, The number of input channels should " + "be equal to the number of filter's channels."); + + std::vector output_shape({in_dims[0], filter_dims[1] * groups}); + for (size_t i = 0; i < strides.size(); ++i) { + auto filter_extent = dilations[i] * (filter_dims[i + 2] - 1) + 1; + output_shape.push_back((in_dims[i + 2] - 1) * strides[i] - + 2 * paddings[i] + filter_extent); + } + this->param_.Output()->Resize(framework::make_ddim(output_shape)); + } + + protected: +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif // FUSION_DECONV_ADD_BN_RELU_OP diff --git a/src/operators/kernel/deconv_add_bn_kernel.h b/src/operators/kernel/deconv_add_bn_kernel.h new file mode 100755 index 0000000000000000000000000000000000000000..181367031c0be48666efeda3df4426da38c67d4f --- /dev/null +++ b/src/operators/kernel/deconv_add_bn_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DECONVADDBN_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class DeconvAddBNKernel + : public OpKernelBase> { + public: + void Compute(const FusionDeconvAddBNParam ¶m); + + bool Init(FusionDeconvAddBNParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/deconv_add_bn_relu_kernel.h b/src/operators/kernel/deconv_add_bn_relu_kernel.h new file mode 100755 index 0000000000000000000000000000000000000000..c63b4db050ade64903ff817b40900faaef65924d --- /dev/null +++ b/src/operators/kernel/deconv_add_bn_relu_kernel.h @@ -0,0 +1,39 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBNRELU_OP + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +using framework::OpKernelBase; + +template +class DeconvAddBNReluKernel + : public OpKernelBase> { + public: + void Compute(const FusionDeconvAddBNReluParam ¶m); + + bool Init(FusionDeconvAddBNReluParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp index 4e68b5e30ccc53ae84deb0866f982d70e175d8eb..359c34b0cefa20ee13789402c87c8f13ca31cc50 100644 --- a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp +++ b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp @@ -43,9 +43,11 @@ bool AnchorGeneratorKernel::Init( // DLOG << "stride_height: " << stride_height; for (int h_idx = 0; h_idx < feature_height; ++h_idx) { + int offset0 = h_idx * feature_width * num_anchors * 4; for (int w_idx = 0; w_idx < feature_width; ++w_idx) { - int offset = h_idx * w_idx * num_anchors * 4; + int offset1 = w_idx * num_anchors * 4; for (int idx = 0; idx < num_anchors; idx++) { + int offset = offset0 + offset1 + idx * 4; anchor_ptr[offset + 0] = anchors_offset[idx * 4 + 0] + w_idx * stride_width; anchor_ptr[offset + 1] = diff --git a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp old mode 100644 new mode 100755 diff --git a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp index d1adec36adc73665d2e542b14b2e368830a2202d..5f8f85278e81911d67f1e072b390e6cd74149ee4 100644 --- a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp @@ -16,13 +16,10 @@ limitations under the License. 
*/ #include "operators/kernel/conv_bn_relu_kernel.h" #include - namespace paddle_mobile { namespace operators { - template <> bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { - // bool relu_enabled = true; paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; @@ -43,7 +40,6 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = new_bias->mutable_data({channel}); - for (int i = 0; i < channel; i++) { new_scale_ptr[i] = bn_scale_ptr[i] / static_cast(pow((bn_var_ptr[i] + epsilon), 0.5)); @@ -51,24 +47,36 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - - fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); - fpga::SplitConvArgs conv_arg = {0}; - fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, - leaky_relu_negative_slope, param->Groups(), - param->Strides()[0], param->Strides()[1], - param->Paddings()[0], param->Paddings()[1], bs_ptr); - param->SetFpgaArgs(conv_arg); - + const int groups = param->Groups(); + if (groups == channel) { + fpga::format_dwconv_data(filter, out, new_scale_ptr, &new_bias_ptr); + fpga::DWconvArgs dwconv_arg = {0}; + fpga::fill_dwconv_arg(&dwconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Strides()[0], + param->Strides()[1], param->Paddings()[0], + param->Paddings()[1], new_bias_ptr); + param->SetFpgaArgs(dwconv_arg); + } else { + fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); + fpga::SplitConvArgs conv_arg = {0}; + fpga::fill_split_arg(&conv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(conv_arg); + } delete new_scale; 
delete new_bias; return true; } - template <> void ConvBNReluKernel::Compute( const FusionConvBNReluParam ¶m) { - fpga::ComputeFpgaConv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWConv(param.FpgaDwconvArgs()); + } else { + fpga::ComputeFpgaConv(param.FpgaArgs()); + } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..788504df5d2ea1005cfaa76f12b58e61c0218391 --- /dev/null +++ b/src/operators/kernel/fpga/V1/conv_transpose_kernel.cpp @@ -0,0 +1,89 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef CONV_TRANSPOSE_OP + +#include "operators/kernel/conv_transpose_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool ConvTransposeKernel::Init(ConvTransposeParam *param) { + // bool relu_enabled = false; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::NONE; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + // const Tensor *bias = param->Bias(); + // auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + // PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + // "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = 0; // bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, + activation_enable, leaky_relu_negative_slope, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + 
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } + return true; +} + +template <> +void ConvTransposeKernel::Compute( + const ConvTransposeParam ¶m) { + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4239ac1e5da421cb0e2421a8919d8d15e40348af --- /dev/null +++ b/src/operators/kernel/fpga/V1/deconv_add_bn_kernel.cpp @@ -0,0 +1,90 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBN_OP + +#include "operators/kernel/deconv_add_bn_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvAddBNKernel::Init(FusionDeconvAddBNParam *param) { + // bool relu_enabled = true; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::NONE; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + const Tensor *bias = param->InputBias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, + activation_enable, leaky_relu_negative_slope, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + 
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } + return true; +} + +template <> +void DeconvAddBNKernel::Compute( + const FusionDeconvAddBNParam ¶m) { + // fpga::ComputeFpgaDeconv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp new file mode 100755 index 0000000000000000000000000000000000000000..28b8c83198a5517ed0dc9732e0033030a876a7da --- /dev/null +++ b/src/operators/kernel/fpga/V1/deconv_add_bn_relu_kernel.cpp @@ -0,0 +1,91 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DECONVADDBNRELU_OP + +#include "operators/kernel/deconv_add_bn_relu_kernel.h" +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool DeconvAddBNReluKernel::Init( + FusionDeconvAddBNReluParam *param) { + // bool relu_enabled = true; + paddle_mobile::fpga::ActivationType activation_enable = + paddle_mobile::fpga::LEAKYRELU; + int16_t leaky_relu_negative_slope = 0; + auto input = const_cast(param->Input()); + const Tensor *bias = param->InputBias(); + auto bias_ptr = bias->data(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); + + PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], + "Output channel should be equal to bias number"); + int channel = out->dims()[1]; + + int sub_conv_n = param->Strides()[0]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT + sizeof(float)); // NOLINT + + for (int i = 0; i < channel * sub_conv_n; i++) { + bs_ptr[i + sub_conv_n * channel] = 1; + bs_ptr[i] = bias_ptr[i % (channel)]; + } + + PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0], + "stride_width should be equal to stride_height "); + PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3], + "filter width should be equal to filter height "); + PADDLE_MOBILE_ENFORCE(((filter->dims()[2] % param->Strides()[0]) == 0), + "filter axis should be the multiple of stride axis "); + if (param->Groups() == channel) { + fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(), + sub_conv_n); + fpga::DWDeconvArgs DWDeconv_arg = {0}; + fpga::fill_DWDeconv_arg(&DWDeconv_arg, input, out, filter, + activation_enable, leaky_relu_negative_slope, + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(DWDeconv_arg); + } else { + fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n); + fpga::DeconvArgs deconv_arg = {0}; + 
fpga::fill_deconv_arg(&deconv_arg, input, out, filter, activation_enable, + leaky_relu_negative_slope, param->Groups(), + param->Strides()[0], param->Strides()[1], + param->Paddings()[0], param->Paddings()[1], bs_ptr); + param->SetFpgaArgs(deconv_arg); + } + return true; +} + +template <> +void DeconvAddBNReluKernel::Compute( + const FusionDeconvAddBNReluParam ¶m) { + // fpga::ComputeFpgaDeconv(param.FpgaArgs()); + if (param.Groups() == param.Output()->dims()[1]) { + fpga::ComputeDWDeconv(param.FpgaDWDconvArgs()); + } else { + fpga::ComputeFpgaDeconv(param.FpgaArgs()); + } +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp index 89e35f8a42d66aad6734ad6643b1b7204ad207ea..a52521b8470886c3ee2d3c4979d513a6e8b5aa93 100644 --- a/src/operators/kernel/fpga/V1/feed_kernel.cpp +++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp @@ -25,11 +25,6 @@ bool FeedKernel::Init(FeedParam *param) { input->Resize(output->dims()); if (output->dims().size() != 4) { - auto input_ptr = input->mutable_data(); - size_t size = output->numel() * sizeof(float); - auto p = fpga::fpga_malloc(size); - memcpy(p, input_ptr, size); - output->reset_data_ptr(p); return true; } fpga::format_fp16_ofm(output); @@ -41,7 +36,14 @@ void FeedKernel::Compute(const FeedParam ¶m) { auto output = param.Out(); auto input = const_cast(param.InputX()); - if (input->dims().size() != 4) { + if (output->dims().size() != 4) { + size_t size = output->numel() * sizeof(float); + auto output_ptr = output->data(); + auto input_ptr = input->data(); + auto external_ptr = reinterpret_cast(input->external_data); + float *p_data = external_ptr == nullptr ? 
input_ptr : external_ptr; + memcpy(output_ptr, p_data, size); + input->external_data = nullptr; return; } diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp index 883c4e4dcb81e54d0de63ab9d90f2061b3734596..b575d952371c5352d2d23d465b08d7749b82d140 100644 --- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -49,17 +49,20 @@ bool FetchKernel::Init(FetchParam *param) { template <> void FetchKernel::Compute(const FetchParam ¶m) { - auto input = param.InputX(); + auto input = const_cast(param.InputX()); if (input->type() == typeid(float)) { auto output = param.Out(); output->ShareDataWith(*input); return; } - fpga::PerformBypass(param.fpga_bypass_args); + fpga::BypassArgs args = param.fpga_bypass_args; + auto data = (input->mutable_data()); + args.image.address = static_cast(data); + fpga::PerformBypass(args); fpga::fpga_invalidate(param.fpga_bypass_args.output.address, param.fpga_bypass_args.image.channels * sizeof(float)); - // TODO: DEalign: get rid of extra 0 + // TODO(zhangyang): DEalign: get rid of extra 0 } template class FetchKernel; diff --git a/src/operators/kernel/fpga/V1/pad2d_kernel.cpp b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..f47a585ee412316ce65084c5fa10a622ffb93a4f --- /dev/null +++ b/src/operators/kernel/fpga/V1/pad2d_kernel.cpp @@ -0,0 +1,60 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+See the License for the specific language governing permissions and +limitations under the License. */ + +#include "operators/kernel/pad2d_kernel.h" +namespace paddle_mobile { +namespace operators { +template <> +bool Pad2dKernel::Init(Pad2dParam *param) { + Tensor *output = param->Out(); + fpga::format_fp16_ofm(output); + return true; +} +void pad2dFunc(const framework::Tensor *input, framework::Tensor *output) { + auto input_data = (input->data()); + auto output_data = (output->data()); + auto input_c = input->dims()[1]; + auto input_h = input->dims()[2]; + auto input_w = input->dims()[3]; + auto output_c = output->dims()[1]; + auto output_w = output->dims()[3]; + auto copysize = input_c * input_w; + for (int h = 0; h < input_h; ++h) { + auto input_offset = h * input_c * input_w; + auto output_offset = h * paddle_mobile::fpga::align_to_x( + output_c * output_w, IMAGE_ALIGNMENT); + memcpy((output_data + output_offset), (input_data + input_offset), + copysize * sizeof(half)); + } +} +template <> +void Pad2dKernel::Compute(const Pad2dParam ¶m) { + auto in_x = param.InputX(); + auto out = param.Out(); + fpga::fpga_invalidate((void *)in_x->data(), // NOLINT + in_x->numel() * sizeof(half)); + pad2dFunc(in_x, out); + (out->scale)[0] = (in_x->scale)[0]; + (out->scale)[1] = (in_x->scale)[1]; + DLOG << (out->scale)[0]; + DLOG << (out->scale)[1]; + size_t outputSize = + out->dims()[2] * + paddle_mobile::fpga::align_to_x((out->dims()[1]) * (out->dims()[3]), + IMAGE_ALIGNMENT) * + sizeof(half); + fpga::fpga_flush(out->data(), outputSize); +} +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp index 6dd43bf8cb95336d071cee52cfab52838f62ce88..e3bcbd25ea10fe01e085e90af9da422bc340717f 100644 --- a/src/operators/kernel/fpga/V1/pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp @@ -22,15 +22,29 @@ namespace operators { template <> bool 
PoolKernel::Init(PoolParam *param) { auto *input = const_cast(param->Input()); - auto input_ptr = input->data(); - Tensor *output = param->Output(); - fpga::format_fp16_ofm(output); - auto output_ptr = output->mutable_data(); + auto *output = param->Output(); vector ksize = param->Ksize(); vector strides = param->Strides(); vector paddings = param->Paddings(); std::string pooling_type = param->PoolingType(); + if (input->type() == typeid(float)) { + int channels = input->dims()[1]; + int height = input->dims()[2]; + int width = input->dims()[3]; + int num = input->dims()[0]; + int out_width = (width + 2 * paddings[1] - ksize[1]) / strides[1] + 1; + int out_height = (height + 2 * paddings[0] - ksize[0]) / strides[0] + 1; + framework::DDim dim = + framework::make_ddim({num, channels, out_height, out_width}); + output->mutable_data(dim); + return true; + } + + auto input_ptr = input->data(); + fpga::format_fp16_ofm(output); + auto output_ptr = output->mutable_data(); + fpga::PoolingArgs poolArgs = {0}; poolArgs.mode = pooling_type == "max" ? 
0 : 1; // max:0, avg:1 poolArgs.kernel_reciprocal = @@ -54,6 +68,31 @@ bool PoolKernel::Init(PoolParam *param) { template <> void PoolKernel::Compute(const PoolParam ¶m) { + auto *input = const_cast(param.Input()); + + if (input->type() == typeid(float)) { + auto *output = param.Output(); + auto in = input->data(); + auto len = output->numel(); + auto out = output->mutable_data(); + int N = input->dims()[0], C = input->dims()[1], H = input->dims()[2], + W = input->dims()[3]; + int HW = H * W, CHW = C * H * W, WC = W * C; + + for (int n = 0; n < N; n++) { + for (int c = 0; c < C; c++) { + out[n * C + c] = 0; + for (int h = 0; h < H; h++) { + for (int w = 0; w < W; w++) { + out[n * C + c] += in[n * CHW + h * WC + w * C + + c]; // in[n * CHW + c * HW + h * W + w]; // + } + } + out[n * C + c] /= HW; + } + } + return; + } fpga::ComputeFpgaPool(param.FpgaArgs()); } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp index 4f50d6edb10c2f0cd7f75c4f4395a7b90c993e4a..3f0ba42f05f528d6b067a3ef3e460609aaf22a4b 100644 --- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp +++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp @@ -67,6 +67,30 @@ bool ProposalKernel::Init(ProposalParam *param) { return true; } +template +void CPUGather(const Tensor &src, const Tensor &index, Tensor *output) { + PADDLE_MOBILE_ENFORCE(index.dims().size() == 1 || + (index.dims().size() == 2 && index.dims()[1] == 1), + "Dim not correct"); + int64_t index_size = index.dims()[0]; + + auto src_dims = src.dims(); + + const T *p_src = src.data(); + const int *p_index = index.data(); + T *p_output = output->data(); + + // slice size + int slice_size = 1; + for (int i = 1; i < src_dims.size(); ++i) slice_size *= src_dims[i]; + + const size_t slice_bytes = slice_size * sizeof(T); + + for (int64_t i = 0; i < index_size; ++i) { + int index_ = p_index[i]; + memcpy(p_output + i * slice_size, p_src + index_ * slice_size, 
slice_bytes); + } +} void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { auto *out_data = dst->data(); @@ -103,38 +127,49 @@ static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas, T bbox_center_x = 0, bbox_center_y = 0; T bbox_width = 0, bbox_height = 0; - if (variances) { - bbox_center_x = - variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + - anchor_center_x; - bbox_center_y = variances_data[i * len + 1] * - bbox_deltas_data[i * len + 1] * anchor_height + - anchor_center_y; - bbox_width = std::exp(std::min(variances_data[i * len + 2] * - bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(variances_data[i * len + 3] * - bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - } else { - bbox_center_x = - bbox_deltas_data[i * len] * anchor_width + anchor_center_x; - bbox_center_y = - bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; - bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], - kBBoxClipDefault)) * - anchor_width; - bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], - kBBoxClipDefault)) * - anchor_height; - } + /* + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } else { + */ + bbox_center_x = bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + + /* + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + 
anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + */ + bbox_width = std::exp(bbox_deltas_data[i * len + 2]) * anchor_width; + bbox_height = std::exp(bbox_deltas_data[i * len + 3]) * anchor_height; + // } proposals_data[i * len] = bbox_center_x - bbox_width / 2; proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; - proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; - proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + /* + //wong + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + //wong + */ + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2; } // return proposals; } @@ -328,9 +363,12 @@ std::pair ProposalForOneImage( anchor_sel.mutable_data({index_t.numel(), 4}); var_sel.mutable_data({index_t.numel(), 4}); + CPUGather(scores_slice, index_t, &scores_sel); + CPUGather(bbox_deltas_slice, index_t, &bbox_sel); + CPUGather(anchors, index_t, &anchor_sel); Tensor proposals; proposals.mutable_data({index_t.numel(), 4}); - BoxCoder(&anchor_sel, &bbox_sel, &var_sel, &proposals); + BoxCoder(&anchor_sel, &bbox_sel, nullptr, &proposals); ClipTiledBoxes(im_info_slice, &proposals); @@ -341,6 +379,8 @@ std::pair ProposalForOneImage( bbox_sel.mutable_data({keep.numel(), 4}); scores_filter.mutable_data({keep.numel(), 1}); + CPUGather(proposals, keep, &bbox_sel); + CPUGather(scores_sel, keep, &scores_filter); if (nms_thresh <= 0) { return std::make_pair(bbox_sel, scores_filter); } @@ -351,14 +391,86 @@ std::pair ProposalForOneImage( keep_nms.Resize({post_nms_top_n}); } - proposals.mutable_data({keep_nms.numel(), 4}); - scores_sel.mutable_data({keep_nms.numel(), 1}); + // proposals.mutable_data({keep_nms.numel(), 4});//original + // scores_sel.mutable_data({keep_nms.numel(), 1});//original + 
proposals.mutable_data({post_nms_top_n, 4}); // wong + scores_sel.mutable_data({post_nms_top_n, 1}); // wong + CPUGather(bbox_sel, keep_nms, &proposals); + CPUGather(scores_filter, keep_nms, &scores_sel); return std::make_pair(proposals, scores_sel); } template <> void ProposalKernel::Compute(const ProposalParam ¶m) { + auto input_score = param.scores_; + auto input_score_data = input_score->data(); + auto input_score_data_tmp = input_score->data(); + uint32_t score_n, score_height, score_width, score_channels; + + auto input_bbox = param.bbox_deltas_; + auto input_bbox_data = input_bbox->data(); + auto input_bbox_data_tmp = input_bbox->data(); + uint32_t bbox_n, bbox_height, bbox_width, bbox_channels; + + score_n = (uint32_t)(input_score->dims()[0]); + score_channels = (uint32_t)(input_score->dims()[1]); + score_height = (uint32_t)(input_score->dims()[2]); + score_width = (uint32_t)(input_score->dims()[3]); + + bbox_n = (uint32_t)(input_bbox->dims()[0]); + bbox_channels = (uint32_t)(input_bbox->dims()[1]); + bbox_height = (uint32_t)(input_bbox->dims()[2]); + bbox_width = (uint32_t)(input_bbox->dims()[3]); + + // score_tmp->init(typeid(half)); + std::shared_ptr score_tmp = std::make_shared(); + score_tmp->Resize(param.scores_->dims()); + score_tmp->mutable_data(); + + std::shared_ptr bbox_tmp = std::make_shared(); + bbox_tmp->Resize(param.bbox_deltas_->dims()); + bbox_tmp->mutable_data(); + + auto score_tmp_data = score_tmp->data(); + auto bbox_tmp_data = bbox_tmp->data(); + int64_t amount_per_side = score_width * score_height; + int idx = 0; + fpga::fpga_invalidate( + input_score_data_tmp, + score_height * score_width * score_channels * sizeof(half)); + for (int h = 0; h < score_height; h++) { + for (int w = 0; w < score_width; w++) { + for (int c = 0; c < score_channels; c++) { + idx++; + // DLOG << "wong input_score: "<< + // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]); + *(score_tmp_data + c * amount_per_side + score_width * h + w) = + 
(*(input_score_data_tmp++)); + } + } + } + amount_per_side = bbox_width * bbox_height; + fpga::fpga_invalidate(input_bbox_data_tmp, bbox_height * bbox_width * + bbox_channels * sizeof(half)); + for (int h = 0; h < bbox_height; h++) { + for (int w = 0; w < bbox_width; w++) { + for (int c = 0; c < bbox_channels; c++) { + idx++; + // DLOG << "wong input_score: "<< + // paddle_mobile::fpga::fp16_2_fp32(input_score_data[idx]); + *(bbox_tmp_data + c * amount_per_side + bbox_width * h + w) = + (*(input_bbox_data_tmp++)); + } + } + } + struct paddle_mobile::fpga::BypassArgs temp_score_arg; + struct paddle_mobile::fpga::BypassArgs temp_bbox_arg; + temp_score_arg = param.score_arg; + temp_score_arg.image.address = score_tmp->data(); + + temp_bbox_arg = param.bbox_arg; + temp_bbox_arg.image.address = bbox_tmp->data(); auto score_tensor = param.float_score.get(); fpga::PerformBypass(param.score_arg); fpga::fpga_invalidate(score_tensor->data(), @@ -396,23 +508,23 @@ void ProposalKernel::Compute(const ProposalParam ¶m) { int64_t w_bbox = bbox_dim[3]; // - Tensor bbox_deltas_swap, scores_swap; - bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}); - scores_swap.mutable_data({num, h_score, w_score, c_score}); + rpn_rois->mutable_data({bbox_deltas->numel(), 4}); + rpn_roi_probs->mutable_data({scores->numel(), 1}); framework::LoD lod; lod.resize(1); auto &lod0 = lod[0]; lod0.push_back(0); - anchors.Resize({anchors.numel() / 4, 4}); + anchors.Resize({anchors.numel(), 4}); + variances.Resize({variances.numel(), 4}); int64_t num_proposals = 0; for (int64_t i = 0; i < num; ++i) { Tensor im_info_slice = im_info->Slice(i, i + 1); - Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); - Tensor scores_slice = scores_swap.Slice(i, i + 1); + Tensor bbox_deltas_slice = (*bbox_tensor).Slice(i, i + 1); + Tensor scores_slice = (*score_tensor).Slice(i, i + 1); - bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox, 4}); 
scores_slice.Resize({h_score * w_score * c_score, 1}); std::pair tensor_pair = ProposalForOneImage( diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp index 97e820e83c434dc4d552a7b0e83329fc5f6d6888..3309f9f7ee983fb4efde3cecb1cae0fa9732b523 100644 --- a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp @@ -18,6 +18,8 @@ limitations under the License. */ #include #include "operators/kernel/detection_kernel.h" +#include "fpga/V1/api.h" +#include "fpga/V1/image.h" namespace paddle_mobile { namespace operators { @@ -29,8 +31,7 @@ bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { param->float_input = std::make_shared(); param->float_input->mutable_data(param->input_x_->dims()); - param->float_output = std::make_shared(); - param->float_output->mutable_data(param->output_->dims()); + // param->float_output = std::make_shared(); auto input = param->input_x_; fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; @@ -46,22 +47,90 @@ bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { args.output.scale_address = param->float_input->scale; param->input_arg = args; - fpga::format_fp16_ofm(param->output_); - - input = param->float_output.get(); - args.input_data_type = fpga::DATA_TYPE_FP32; - args.output_data_type = fpga::DATA_TYPE_FP16; - args.image.address = input->data(); - args.image.height = (uint32_t)input->dims()[2]; - args.image.width = (uint32_t)input->dims()[3]; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = param->output_->mutable_data(); - args.output.scale_address = param->output_->scale; - param->input_arg = args; + auto* rois = param->input_rois_; + int rois_num = rois->dims()[0]; + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, param->output_->dims()[1], param->output_->dims()[2], + param->output_->dims()[3]}); + param->output_->Resize(dims_out_new); + // fpga::format_fp16_ofm(param->output_); + + 
param->output_->mutable_data(dims_out_new); + // auto output = param->float_output.get(); + // param->output_ = output; + /* args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.image.address = output->data(); + args.image.height = (uint32_t)output->dims()[2]; + args.image.width = (uint32_t)output->dims()[3]; + args.image.channels = (uint32_t)output->dims()[1] ; + args.output.address = param->output_->mutable_data(); + args.output.scale_address = param->output_->scale; + param->output_arg = args;*/ return true; } +template +void PSROIPooling(const Dtype* bottom_data, const Dtype spatial_scale, + const int channels, const int height, const int width, + const int pooled_height, const int pooled_width, + const Dtype* bottom_rois, const int output_dim, + const int group_size, Dtype* top_data, + // int* mapping_channel, + int index, int* rois_batch_id) { + // The output is in order (n, ctop, ph, pw) + // static int cnt = 0; + int pw = index % pooled_width; + int ph = (index / pooled_width) % pooled_height; + int ctop = (index / pooled_width / pooled_height) % output_dim; + int n = index / pooled_width / pooled_height / output_dim; + + // [start, end) interval for spatial sampling + bottom_rois += n * 4; + int roi_batch_ind = rois_batch_id[n]; // bottom_rois[0]; + Dtype roi_start_w = static_cast(round(bottom_rois[0])) * spatial_scale; + Dtype roi_start_h = static_cast(round(bottom_rois[1])) * spatial_scale; + Dtype roi_end_w = + static_cast(round(bottom_rois[2]) + 1.) * spatial_scale; + Dtype roi_end_h = + static_cast(round(bottom_rois[3]) + 1.) 
* spatial_scale; + + // Force too small ROIs to be 1x1 + Dtype roi_width = std::max(roi_end_w - roi_start_w, 0.1f); // avoid 0 + Dtype roi_height = std::max(roi_end_h - roi_start_h, 0.1f); + + // Compute w and h at bottom + Dtype bin_size_h = roi_height / static_cast(pooled_height); + Dtype bin_size_w = roi_width / static_cast(pooled_width); + + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + hend = std::min(std::max(hend, 0), height); + wstart = std::min(std::max(wstart, 0), width); + wend = std::min(std::max(wend, 0), width); + bool is_empty = (hend <= hstart) || (wend <= wstart); + + int gw = pw; + int gh = ph; + int c = (ctop * group_size + gh) * group_size + gw; + + bottom_data += (roi_batch_ind * channels + c) * height * width; + Dtype out_sum = 0; + for (int h = hstart; h < hend; ++h) { + for (int w = wstart; w < wend; ++w) { + int bottom_index = h * width + w; + out_sum += bottom_data[bottom_index]; + } + } + + Dtype bin_area = (hend - hstart) * (wend - wstart); + top_data[index] = is_empty ? 0. 
: out_sum / bin_area; +} template <> void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { auto input_tensor = param.float_input.get(); @@ -71,7 +140,7 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { auto* in = input_tensor; auto* rois = param.input_rois_; - auto* out = param.float_output.get(); + auto* out = param.output_; // param.float_output.get(); auto pooled_height = param.pooled_height_; auto pooled_width = param.pooled_width_; @@ -85,18 +154,17 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { int width = in_dims[3]; int rois_num = rois->dims()[0]; - // TODO auto in_stride = framework::stride(in_dims); - // TODO auto out_stride = framework::stride(out->dims()); - auto in_stride = - framework::stride({batch_size, height, width, input_channels}); - auto out_stride = framework::stride( - {out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]}); + auto data_nhwc = in->mutable_data(); + fpga::image::convert_to_chw(&data_nhwc, input_channels, height, width); + framework::DDim dims_out_new = framework::make_ddim( + {rois_num, (param.output_)->dims()[1], (((param.output_)->dims()[2])), + (param.output_)->dims()[3]}); + (param.output_)->Resize(dims_out_new); - const float* input_data = in->data(); + const float* input_data = data_nhwc; // in->data(); framework::Tensor rois_batch_id_list; rois_batch_id_list.Resize({rois_num}); auto rois_batch_id_data = rois_batch_id_list.mutable_data(); - return; PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty"); @@ -124,78 +192,18 @@ void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { auto input_rois = rois->data(); // calculate psroipooling, parallel processing can be implemented per ROI - for (int n = 0; n < rois_num; ++n) { - // set roi batch id - int roi_batch_id = rois_batch_id_data[n]; - - // [start, end) interval for spatial sampling - auto offset_input_rois = input_rois + n * 4; - auto roi_start_w = - static_cast(round(offset_input_rois[0])) * 
spatial_scale; - auto roi_start_h = - static_cast(round(offset_input_rois[1])) * spatial_scale; - auto roi_end_w = - static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; - auto roi_end_h = - static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; - - // Force too small rois to be 1 x 1 - auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0 - auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f); - - // Compute bin size w and h at input feature map - auto bin_size_h = roi_height / static_cast(pooled_height); - auto bin_size_w = roi_width / static_cast(pooled_width); - DLOG << 3; - - // calculate each pixel of the output feature map. - int out_roi_offset = n * out_stride[0]; - for (int c = 0; c < output_channels; ++c) { - // per category - // int out_plane_offset = out_roi_offset + c * out_stride[1]; - int out_plane_offset = out_roi_offset + c; - for (int ph = 0; ph < pooled_height; ++ph) { - // TODO int out_row_offset = out_plane_offset + ph * - // out_stride[2]; - int out_row_offset = out_plane_offset + ph * out_stride[1]; - for (int pw = 0; pw < pooled_width; ++pw) { - // calculate w and h at input feature map - int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); - int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); - int hend = - ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); - int wend = - ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); - // Add roi offsets and clip to input boundaries - hstart = std::min(std::max(hstart, 0), height); - wstart = std::min(std::max(wstart, 0), width); - hend = std::min(std::max(hend, 0), height); - wend = std::min(std::max(wend, 0), width); - - // TODO int output_index = out_row_offset + pw; - int output_index = out_row_offset + pw * output_channels; - int input_channel = (c * pooled_height + ph) * pooled_width + pw; - // TODO int input_plane_offset = - // TODO roi_batch_id * in_stride[0] + input_channel * - // in_stride[1]; - int input_plane_offset = 
roi_batch_id * in_stride[0] + input_channel; - auto offset_input_data = input_data + input_plane_offset; - float out_sum = 0.; - bool is_empty = (hend <= hstart) || (wend <= wstart); - for (int ih = hstart; ih < hend; ++ih) { - for (int iw = wstart; iw < wend; ++iw) { - int input_index = ih * in_stride[1] + iw * input_channel; - out_sum += offset_input_data[input_index]; - } - } - float bin_area = (hend - hstart) * (wend - wstart); - output_data[output_index] = is_empty ? 0. : out_sum / bin_area; - } - } - } + + int index = pooled_height * pooled_width * output_channels * rois_num; + for (int idx = 0; idx < index; idx++) { + PSROIPooling(input_data, spatial_scale, input_channels, height, + width, pooled_height, pooled_width, input_rois, + output_channels, pooled_height, output_data, idx, + rois_batch_id_data); } - fpga::format_image(out); - fpga::PerformBypass(param.output_arg); + // + fpga::image::convert_to_hwc(&output_data, output_channels, pooled_height, + pooled_width, rois_num); + out->reset_data_ptr(output_data); } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/reshape2_kernel.cpp b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp index 9e5ce02658adb5fe94935b8d7f4d412405a0727e..647ecb5a6501371c74c8762cf81cee206f1dca68 100644 --- a/src/operators/kernel/fpga/V1/reshape2_kernel.cpp +++ b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp @@ -47,21 +47,11 @@ bool Reshape2Kernel::Init(Reshape2Param *param) { void reshape(LoDTensor *input, LoDTensor *output) { // Subscript r means after reshape - // TODO zhangyang verify this function - float *input_ptr_f, *output_ptr_f; - half *input_ptr_h, *output_ptr_h; - bool is_float = false; - - if (input->type() == typeid(float)) { - input_ptr_f = input->data(); - output_ptr_f = output->data(); - is_float = true; - - } else { - input_ptr_h = input->data(); - output_ptr_h = output->data(); - } + auto input_ptr = input->data(); + auto output_ptr = output->data(); + output->scale[0] = input->scale[0]; + 
output->scale[1] = input->scale[1]; auto C = static_cast(input->dims()[1]); auto H = static_cast(input->dims()[2]); @@ -77,6 +67,8 @@ void reshape(LoDTensor *input, LoDTensor *output) { auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT); auto HWr = Hr * Wr; + fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(half)); + int offset_align = 0; int offset_r = 0, offset_align_r = 0; int cr = 0, hr = 0, wr = 0; @@ -87,21 +79,17 @@ void reshape(LoDTensor *input, LoDTensor *output) { int offset1 = w * C + offset0; for (int c = 0; c < C; c++) { offset_align = offset1 + c; - offset_r = c * HW + h * W + c; + offset_r = c * HW + h * W + w; cr = offset_r / HWr; hr = offset_r % HWr / Wr; wr = offset_r % Wr; offset_align_r = hr * WCr_align + wr * Cr + cr; - // DLOG << "hwc"<< h<< " " << w << " " << c; - // DLOG << "hrwrcr" << hr<< " " << wr << " " << cr; - if (is_float) { - output_ptr_f[offset_align_r] = input_ptr_f[offset_align]; - } else { - output_ptr_h[offset_align_r] = input_ptr_h[offset_align]; - } + output_ptr[offset_align_r] = input_ptr[offset_align]; } } } + + fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(half)); } template <> @@ -123,6 +111,9 @@ void Reshape2Kernel::Compute(const Reshape2Param ¶m) { output->Resize(framework::make_ddim(shape)); if (output->dims() == input->dims()) { DLOG << "No need to reshape"; + output->ShareDataWith(*input); + framework::LoD lod = input->lod(); + output->set_lod(lod); return; } diff --git a/src/operators/kernel/fpga/V1/slice_kernel.cpp b/src/operators/kernel/fpga/V1/slice_kernel.cpp index 5d0ac1fe61caa9cce0e1af6f8ac5c53b315573db..39e5c64b34c2a6b0629a7f2ab07a8683e9c45edd 100644 --- a/src/operators/kernel/fpga/V1/slice_kernel.cpp +++ b/src/operators/kernel/fpga/V1/slice_kernel.cpp @@ -33,13 +33,18 @@ bool SliceKernel::Init(SliceParam* param) { template <> void SliceKernel::Compute(const SliceParam& param) { // Only support slicing in channel dimension + // Only support half data + // W must be aligned to 16 auto input = 
param.input_; - DLOG << input; + auto output = param.output_; int HW = input->dims()[2] * input->dims()[3]; int channel = input->dims()[1]; auto input_ptr = input->data(); - auto output_ptr = param.output_->data(); + auto output_ptr = output->data(); + + output->scale[0] = input->scale[0]; + output->scale[1] = input->scale[1]; int start = param.starts_[0], end = param.ends_[0]; start = start < 0 ? start + channel : start; @@ -47,9 +52,10 @@ void SliceKernel::Compute(const SliceParam& param) { start = start > channel ? channel : start; end = end > channel ? channel : end; int len = end - start; + size_t size = len * sizeof(half); for (int i = 0; i < HW; i++) { - memcpy(output_ptr + len * i, input_ptr + i * channel + start, len); + memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp index 69308ea5538b01c627b92ef41cc2b3768f7fdd67..bbe5296582cb29e81bc4ec161a283891ceb3ae3f 100644 --- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -23,14 +23,21 @@ namespace operators { template <> bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); + auto dims = framework::vectorize(input->dims()); + half *input_ptr; auto out = param->Out(); + if (input->type() == typeid(float)) { + out->Resize(framework::make_ddim(dims)); + out->mutable_data(framework::make_ddim(dims)); + } else { + input_ptr = input->data(); + } auto float_input = new Tensor; PADDLE_MOBILE_ENFORCE(input->dims().size() == 4, "Softmax should have 4-order input"); - auto dims = framework::vectorize(input->dims()); + auto channel = dims[3]; if (channel == 1) { // This input is generated by FC op, dims = [N C 1 1] PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op"); @@ -41,9 +48,12 @@ bool 
SoftmaxKernel::Init(SoftmaxParam *param) { float_input->Resize(framework::make_ddim(dims)); if (channel != 2) { // Use CPU + out->Resize(framework::make_ddim(dims)); + out->mutable_data(framework::make_ddim(dims)); float_input->init(typeid(float)); - fpga::format_fp32_ofm(float_input); - fpga::format_fp32_ofm(out); + float_input->mutable_data(framework::make_ddim(dims)); + // fpga::format_fp32_ofm(float_input); + // fpga::format_fp32_ofm(out); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; args.input_layout_type = fpga::LAYOUT_HWC; @@ -51,7 +61,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { args.input_data_type = fpga::DATA_TYPE_FP16; args.output_data_type = fpga::DATA_TYPE_FP32; args.image.address = input_ptr; - args.image.height = (uint32_t)dims[1]; + args.image.height = (uint32_t)dims[1] * dims[0]; args.image.width = (uint32_t)dims[2]; args.image.channels = (uint32_t)dims[3]; args.output.address = float_input->data(); @@ -80,14 +90,23 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { template <> void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { - fpga::PerformBypass(param.FpgaArgs()); - - if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { - Tensor *out = param.Out(); - Tensor *in_x = param.FloatInput(); - fpga::fpga_invalidate(in_x->data(), in_x->numel() * sizeof(float)); - math::SoftmaxFuntor()(in_x, out); - fpga::fpga_flush(out->data(), out->memory_size()); + auto *in_x = (param.InputX()); + if (in_x->type() == typeid(half)) { + fpga::PerformBypass(param.FpgaArgs()); + if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { + Tensor *out = param.Out(); + Tensor *in_x2 = param.FloatInput(); + + fpga::fpga_invalidate(in_x2->data(), + in_x2->numel() * sizeof(float)); + math::SoftmaxFuntor()(in_x2, out); + fpga::fpga_flush(out->data(), out->memory_size()); + } + } else { + if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { + Tensor *out = param.Out(); + math::SoftmaxFuntor()(in_x, out); 
+ } } } diff --git a/src/operators/kernel/pad2d_kernel.h b/src/operators/kernel/pad2d_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..58b8c1a15884b00dc0c309c99da7de0706524cdd --- /dev/null +++ b/src/operators/kernel/pad2d_kernel.h @@ -0,0 +1,32 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#pragma once + +#include "framework/operator.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +template +class Pad2dKernel + : public framework::OpKernelBase> { + public: + void Compute(const Pad2dParam ¶m); + bool Init(Pad2dParam *param); +}; + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 8cd804444a2d8f65d027ecccb240b5ada9aa274f..5683138ef1341a42c69fca33dc892a01e79736e4 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -1221,6 +1221,7 @@ class FetchParam : public OpParam { RType *input_x_; Tensor *out_; #ifdef PADDLE_MOBILE_FPGA + public: fpga::BypassArgs fpga_bypass_args; @@ -2415,6 +2416,120 @@ class FusionDeconvAddParam : public ConvTransposeParam { template using FusionDeconvAddReluParam = FusionDeconvAddParam; #endif +#ifdef FUSION_DECONVADDBN_OP +template +class FusionDeconvAddBNParam : public ConvTransposeParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + 
FusionDeconvAddBNParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvTransposeParam(inputs, outputs, attrs, scope) { + output_ = OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Output() const { return output_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +}; +#endif +#ifdef FUSION_DECONVADDBNRELU_OP +template +class FusionDeconvAddBNReluParam : public ConvTransposeParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionDeconvAddBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : ConvTransposeParam(inputs, outputs, attrs, scope) { + output_ = 
OpParam::OutFrom(outputs, scope); + input_bias_ = OpParam::InputBiasFrom(inputs, scope); + input_mean_ = OpParam::InputMeanFrom(inputs, scope); + input_scale_ = OpParam::InputScaleFrom(inputs, scope); + input_variance_ = OpParam::InputVarianceFrom(inputs, scope); + epsilon_ = OpParam::GetAttr("epsilon", attrs); + momentum_ = OpParam::GetAttr("momentum", attrs); + // is_test_ = OpParam::GetAttr("is_test", attrs); + } + RType *Output() const { return output_; } + + const RType *InputBias() const { return input_bias_; } + + const RType *InputMean() const { return input_mean_; } + + const RType *InputScale() const { return input_scale_; } + + const RType *InputVariance() const { return input_variance_; } + + const float &Epsilon() const { return epsilon_; } + + const float &Momentum() const { return momentum_; } + + const bool &IsTest() const { return is_test_; } + + void SetNewScale(RType *new_scale) { new_scale_ = new_scale; } + + void SetNewBias(RType *new_bias) { new_bias_ = new_bias; } + + const RType *NewScale() const { return new_scale_; } + + const RType *NewBias() const { return new_bias_; } + + protected: + RType *output_; + RType *input_bias_; + RType *input_mean_; + RType *input_scale_; + RType *input_variance_; + float epsilon_; + float momentum_; + bool is_test_; + RType *new_bias_; + RType *new_scale_; +}; +#endif #ifdef FUSION_DECONVRELU_OP template @@ -3114,6 +3229,26 @@ class IncrementParam : public OpParam { int step_; }; #endif // INCREMENT_OP +#ifdef PAD2D_OP +template +class Pad2dParam : public OpParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + Pad2dParam(const VariableNameMap &inputs, const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) { + input_x_ = InputXFrom(inputs, scope); + out_ = OutFrom(outputs, scope); + } + const RType *InputX() const { return input_x_; } + RType *Out() const { return out_; } + + private: + RType *input_x_; + RType 
*out_; +}; +#endif } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/pad2d_op.cpp b/src/operators/pad2d_op.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e7eda00d0830f719f8d7aa76ab77544b585d9b45 --- /dev/null +++ b/src/operators/pad2d_op.cpp @@ -0,0 +1,43 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PAD2D_OP + +#include "operators/pad2d_op.h" +namespace paddle_mobile { +namespace operators { + +template +void Pad2dOp::InferShape() const { + auto input_dims = this->param_.InputX()->dims(); + auto input_n = input_dims[0]; + auto input_c = input_dims[1]; + auto input_h = input_dims[2]; + auto input_w = input_dims[3]; + + this->param_.Out()->Resize({input_n, input_c, input_h + 1, input_w + 1}); +} + +} // namespace operators +} // namespace paddle_mobile + +namespace ops = paddle_mobile::operators; +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(pad2d, ops::Pad2dOp); +#endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(pad2d, ops::Pad2dOp); +#endif + +#endif diff --git a/src/operators/pad2d_op.h b/src/operators/pad2d_op.h new file mode 100644 index 0000000000000000000000000000000000000000..761e2b837d34b8d51629b883a8cd6797037e5d9b --- /dev/null +++ b/src/operators/pad2d_op.h @@ -0,0 +1,49 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PAD2D_OP + +#pragma once + +#include + +#include "framework/operator.h" +#include "operators/kernel/pad2d_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { +using framework::AttributeMap; +using framework::OperatorWithKernel; +using framework::Scope; +using std::string; +template +class Pad2dOp + : public OperatorWithKernel, + operators::Pad2dKernel> { + public: + Pad2dOp(const string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, const AttributeMap &attrs, + std::shared_ptr scope) + : OperatorWithKernel, + operators::Pad2dKernel>( + type, inputs, outputs, attrs, scope) {} + void InferShape() const override; + + private: +}; +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index 723e4ea3e3ff35e0d555703391adcafacccb42f1..e48ad33f36cdee1e57ffba9bf64c6546691f0566 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -1,140 +1,140 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. 
-You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ -#include -#include -#include -#include "../test_include.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -void readStream(std::string filename, float *buf) { - std::ifstream in; - in.open(filename, std::ios::in); - if (!in.is_open()) { - std::cout << "open File Failed." << std::endl; - return; - } - string strOne; - int i = 0; - while (!in.eof()) { - in >> buf[i]; - i++; - } - in.close(); -} - -void convert_to_chw(int16_t **data_in, int channel, int height, int width, - int16_t *data_tmp) { - int64_t amount_per_side = width * height; - for (int h = 0; h < height; h++) { - for (int w = 0; w < width; w++) { - for (int c = 0; c < channel; c++) { - *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); - } - } - } -} - -void dump(std::string filename, Tensor input_tensor) { - auto dataptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - for (int i = 0; i < input_tensor.numel(); ++i) { - result = paddle_mobile::fpga::fp16_2_fp32(dataptr[i]); - out << result << std::endl; - } - out.close(); -} -void dump_stride_half(std::string filename, Tensor input_tensor, - const int dumpnum) { - int c = (input_tensor.dims())[1]; - int h = (input_tensor.dims())[2]; - int w = (input_tensor.dims())[3]; - auto data_ptr = input_tensor.get_data(); - auto *data_tmp = - reinterpret_cast(malloc(c * h * w * sizeof(int16_t))); - auto *data_ptr_16 = reinterpret_cast(data_ptr); - convert_to_chw(&data_ptr_16, c, h, w, data_tmp); - std::ofstream 
out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); - out << result << std::endl; - } - out.close(); - free(data_tmp); -} - -void dump_stride_float(std::string filename, Tensor input_tensor, - const int dumpnum) { - auto data_ptr = reinterpret_cast(input_tensor.get_data()); - std::ofstream out(filename.c_str()); - float result = 0; - int stride = input_tensor.numel() / dumpnum; - stride = stride > 0 ? stride : 1; - for (int i = 0; i < input_tensor.numel(); i += stride) { - result = data_ptr[i]; - out << result << std::endl; - } - out.close(); -} -static const char *g_resnet50 = "../models/resnet50"; -const std::string g_image_src_float = "../images/image_src_float"; -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - if (paddle_mobile.Load(std::string(g_resnet50), true)) { - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(2), - static_cast(2)); - readStream(g_image_src_float, - input_tensor.mutable_data({1, 3, 224, 224})); - paddle_mobile.FeedData(input_tensor); - paddle_mobile.Predict_To(-1); - for (int i = 0; i < 73; i++) { - auto tensor_ptr = paddle_mobile.FetchResult(i); - std::string saveName = "resnet50_result_" + std::to_string(i); - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), - tensor_ptr->numel() * sizeof(half)); - dump_stride_half(saveName, (*tensor_ptr), 20); - // dump(saveName, (*tensor_ptr)); - } - - auto tensor_ptr = paddle_mobile.FetchResult(73); - dump_stride_float("resnet50_result_73", (*tensor_ptr), 20); - tensor_ptr = paddle_mobile.FetchResult(74); - dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999); - - float max = 0; - auto data_ptr = tensor_ptr->data(); - int maximumIdx = 0; - for (int i = 0; i < (*tensor_ptr).numel(); i++) { - if (data_ptr[i] > 
max) { - maximumIdx = i; - max = data_ptr[i]; - } - } - std::cout << "index : " << std::dec << maximumIdx << ", value : " << max - << std::endl; - std::cout << "Computation done" << std::endl; - return 0; - } -} +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ +#include +#include +#include +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +void readStream(std::string filename, float *buf) { + std::ifstream in; + in.open(filename, std::ios::in); + if (!in.is_open()) { + std::cout << "open File Failed." 
<< std::endl; + return; + } + string strOne; + int i = 0; + while (!in.eof()) { + in >> buf[i]; + i++; + } + in.close(); +} + +void convert_to_chw(int16_t **data_in, int channel, int height, int width, + int16_t *data_tmp) { + int64_t amount_per_side = width * height; + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + c * amount_per_side + width * h + w) = *((*data_in)++); + } + } + } +} + +void dump(std::string filename, Tensor input_tensor) { + auto dataptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + for (int i = 0; i < input_tensor.numel(); ++i) { + result = paddle_mobile::fpga::fp16_2_fp32(dataptr[i]); + out << result << std::endl; + } + out.close(); +} +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum) { + int c = (input_tensor.dims())[1]; + int h = (input_tensor.dims())[2]; + int w = (input_tensor.dims())[3]; + auto data_ptr = input_tensor.get_data(); + auto *data_tmp = + reinterpret_cast(malloc(c * h * w * sizeof(int16_t))); + auto *data_ptr_16 = reinterpret_cast(data_ptr); + convert_to_chw(&data_ptr_16, c, h, w, data_tmp); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); + out << result << std::endl; + } + out.close(); + free(data_tmp); +} + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? 
stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} +static const char *g_resnet50 = "../models/resnet50"; +const std::string g_image_src_float = "../images/image_src_float"; // NOLINT +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + if (paddle_mobile.Load(std::string(g_resnet50), true)) { + Tensor input_tensor; + SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(2), + static_cast(2)); + readStream(g_image_src_float, + input_tensor.mutable_data({1, 3, 224, 224})); + paddle_mobile.FeedData(input_tensor); + paddle_mobile.Predict_To(-1); + for (int i = 0; i < 73; i++) { + auto tensor_ptr = paddle_mobile.FetchResult(i); + std::string saveName = "resnet50_result_" + std::to_string(i); + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), + tensor_ptr->numel() * sizeof(half)); + // dump_stride_half(saveName, (*tensor_ptr), 20); + // dump(saveName, (*tensor_ptr)); + } + + auto tensor_ptr = paddle_mobile.FetchResult(73); + // dump_stride_float("resnet50_result_73", (*tensor_ptr), 20); + tensor_ptr = paddle_mobile.FetchResult(74); + // dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999); + + float max = 0; + auto data_ptr = tensor_ptr->data(); + int maximumIdx = 0; + for (int i = 0; i < (*tensor_ptr).numel(); i++) { + if (data_ptr[i] > max) { + maximumIdx = i; + max = data_ptr[i]; + } + } + std::cout << "index : " << std::dec << maximumIdx << ", value : " << max + << std::endl; + std::cout << "Computation done" << std::endl; + return 0; + } +} diff --git a/test/fpga/test_rfcn.cpp b/test/fpga/test_rfcn.cpp index e1d13541ef8000da18ceda4c356d158198d7b9f4..a45666365b876abe18d5e24a79525160f3cd8e93 100644 --- a/test/fpga/test_rfcn.cpp +++ b/test/fpga/test_rfcn.cpp @@ -1,62 +1,175 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
- -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. */ - -#include -#include "../test_helper.h" -#include "../test_include.h" - -#ifdef PADDLE_MOBILE_FPGA_V1 -#include "fpga/V1/api.h" -#endif -#ifdef PADDLE_MOBILE_FPGA_V2 -#include "fpga/V2/api.h" -#endif - -void readStream(std::string filename, uint8_t *buf) { - std::ifstream in; - in.open(filename, std::ios::in); - if (!in.is_open()) { - std::cout << "open File Failed." << std::endl; - return; - } - int i = 0; - while (!in.eof()) { - in >> buf[i]; - i++; - } - in.close(); -} - -static const char *g_rfcn_combine = "../models/rfcn"; -static const char *g_image_src_float = "../models/rfcn/data.bin"; -int main() { - paddle_mobile::fpga::open_device(); - paddle_mobile::PaddleMobile paddle_mobile; - - if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", - std::string(g_rfcn_combine) + "/params", true, false, - 1, true)) { - float img_info[3] = {768, 1536, 768.0f / 960.0f}; - auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)); - readStream(g_image_src_float, reinterpret_cast(img)); - std::vector v(3, nullptr); - paddle_mobile.FeedData({img_info, img}); - paddle_mobile.Predict_To(-1); - paddle_mobile.GetResults(&v); - DLOG << "Computation done"; - fpga::fpga_free(img); - } - - return 0; -} +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. 
+You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +#include + +void readStream(std::string filename, char *buf) { + std::ifstream in; + in.open(filename, std::ios::in | std::ios::binary); + if (!in.is_open()) { + std::cout << "open File Failed." << std::endl; + return; + } + + in.seekg(0, std::ios::end); // go to the end + auto length = in.tellg(); // report location (this is the length) + in.seekg(0, std::ios::beg); // go back to the beginning + in.read(buf, length); + DLOG << length; + in.close(); +} + +void convert_to_chw(int16_t **data_in, int channel, int height, int width, + int num, int16_t *data_tmp) { + int64_t amount_per_side = width * height; + for (int n = 0; n < num; n++) { + for (int h = 0; h < height; h++) { + for (int w = 0; w < width; w++) { + for (int c = 0; c < channel; c++) { + *(data_tmp + n * amount_per_side * channel + c * amount_per_side + + width * h + w) = *((*data_in)++); + } + } + } + } +} + +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum, bool use_chw) { + // bool use_chw = true; + if (input_tensor.dims().size() != 4) return; + int c = (input_tensor.dims())[1]; + int h = (input_tensor.dims())[2]; + int w = (input_tensor.dims())[3]; + int n = (input_tensor.dims())[0]; + auto data_ptr = input_tensor.get_data(); + auto *data_ptr_16 = reinterpret_cast(data_ptr); + auto data_tmp = data_ptr_16; + if (use_chw) { + data_tmp = + reinterpret_cast(malloc(n * c * h 
* w * sizeof(int16_t))); + convert_to_chw(&data_ptr_16, c, h, w, n, data_tmp); + } + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = paddle_mobile::fpga::fp16_2_fp32(data_tmp[i]); + out << result << std::endl; + } + out.close(); + if (data_tmp != data_ptr_16) { + free(data_tmp); + } +} + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} + +void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum, + bool use_chw) { + static int i = 0; + if (input_tensor.numel() == 0) { + return; + } + if (input_tensor.type() == typeid(float)) { + DLOG << "op: " << i++ << ", float data " << input_tensor.numel(); + + dump_stride_float(filename, input_tensor, dumpnum); + } else { + DLOG << "op: " << i++ << ", half data " << input_tensor.numel(); + + dump_stride_half(filename, input_tensor, dumpnum, use_chw); + } + DLOG << "dump input address: " << input_tensor.get_data(); +} + +static const char *g_rfcn_combine = "../models/rfcn"; +static const char *g_image_src_float = "../models/rfcn/data.bin"; +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile paddle_mobile; + + if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", + std::string(g_rfcn_combine) + "/params", true, false, + 1, true)) { + float img_info[3] = {768, 1536, 768.0f / 960.0f}; + auto img = reinterpret_cast( + fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float))); + readStream(g_image_src_float, reinterpret_cast(img)); + + std::vector v(3, 
nullptr); + paddle_mobile.FeedData({img_info, img}); + paddle_mobile.Predict_To(-1); + + for (int i = 55; i < 69; i++) { + auto tensor_ptr = paddle_mobile.FetchResult(i); + std::string saveName = "rfcn_" + std::to_string(i); + // if(i != 58) + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), + tensor_ptr->numel() * sizeof(float)); + // tensor_ptr->numel() * sizeof(float)); + if ((i == 48) || (i == 47)) { + dump_stride(saveName, (*tensor_ptr), 20, + false); // 20);//tensor_ptr->numel()); + } else if (i == 55) { + dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), + true); // 20);//tensor_ptr->numel()); + } else { + dump_stride(saveName, (*tensor_ptr), tensor_ptr->numel(), + true); // 20);//tensor_ptr->numel()); + } + /* float result = 0; + std::string str = "softmax_input_data"; + float* data = + static_cast(fpga::fpga_malloc(tensor_ptr->numel() * + sizeof(float))); str = "softmax_output_data"; auto output_ptr = + static_cast((*tensor_ptr).get_data()); for (int idx = 0; idx < + tensor_ptr->numel(); ++idx) + { + data[idx] = fpga::fp16_2_fp32(output_ptr[idx]); + } + fpga::savefile(str,data, tensor_ptr->numel(), result ); */ + } + + // paddle_mobile.GetResults(&v); + DLOG << "Computation done"; + fpga::fpga_free(img); + } + + return 0; +} diff --git a/tools/op.cmake b/tools/op.cmake old mode 100644 new mode 100755 index 0ceacaa15f6a37f580ea415401d76701908e8455..3b613473df8e7aa99276b864569ef55146bd0ad6 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -131,7 +131,12 @@ if (CON GREATER -1) set(PROPOSAL_OP ON) set(ANCHOR_GENERATOR_OP ON) set(SLICE_OP ON) - + set(SIGMOID_OP ON) + set(CONCAT_OP ON) + set(PAD2D_OP ON) + set(CONV_TRANSPOSE_OP ON) + set(FUSION_DECONVADDBNRELU_OP ON) + set(FUSION_DECONVADDBN_OP ON) set(FOUND_MATCH ON) endif() @@ -573,7 +578,6 @@ endif() if (FUSION_DECONVADDRELU_OP) add_definitions(-DFUSION_DECONVADDRELU_OP) endif() - if (WHILE_OP) add_definitions(-DWHILE_OP) endif() @@ -602,3 +606,12 @@ endif() if (ROI_PERSPECTIVE_OP) 
add_definitions(-DROI_PERSPECTIVE_OP)
 endif()
+if (FUSION_DECONVADDBNRELU_OP)
+    add_definitions(-DFUSION_DECONVADDBNRELU_OP)
+endif()
+if (FUSION_DECONVADDBN_OP)
+    add_definitions(-DFUSION_DECONVADDBN_OP)
+endif()
+if (PAD2D_OP)
+    add_definitions(-DPAD2D_OP)
+endif()