diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 9a408a8f2fbe3c600679ddb2e3eadb493f323165..b462cc52302115df02f1fd5ad71cd7c13904f42e 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -28,11 +28,13 @@ void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); auto channel = dims[1], height = dims[2], width = dims[3]; auto data_ptr = image_tensor->data(); - size_t memory_size = channel * height * width * sizeof(float); - auto new_data = (float *)fpga_malloc(memory_size); // NOLINT - fpga_copy(new_data, data_ptr, memory_size); - image::format_image(&new_data, channel, height, width); - image_tensor->reset_data_ptr(new_data); + auto external_ptr = reinterpret_cast(image_tensor->external_data); + float *p_data = external_ptr == nullptr ? data_ptr : external_ptr; + float *old_p = p_data; + image::format_image(&p_data, channel, height, width); + if (old_p != p_data) { + image_tensor->reset_data_ptr(p_data); + } } void format_fp16_ofm(framework::Tensor *ofm_tensor) { @@ -50,6 +52,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) { auto p = fpga_malloc(memory_size); memset(p, 0, memory_size); ofm_tensor->reset_data_ptr(p); + ofm_tensor->set_type(typeid(half)); } void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) { @@ -67,6 +70,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) { auto p = fpga_malloc(memory_size); memset(p, 0, memory_size); ofm_tensor->reset_data_ptr(p); + ofm_tensor->set_type(typeid(half)); } void format_fp32_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); @@ -83,6 +87,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) { auto p = fpga_malloc(memory_size); memset(p, 0, memory_size); ofm_tensor->reset_data_ptr(p); + ofm_tensor->set_type(typeid(float)); } float filter_find_max(framework::Tensor *filter_tensor) { @@ -139,6 +144,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value, 
filter::format_filter(&new_data, num, channel, height, width, group_num, max_value); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { auto dims = filter_tensor->dims(); @@ -149,6 +155,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) { fpga_copy(new_data, data_ptr, memory_size); filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr, @@ -173,6 +180,7 @@ void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr, // framework::make_ddim({num, 1, height, width}); // filter_tensor->Resize(dims_new); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { @@ -187,6 +195,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { filter::format_fc_filter(&new_data, num, channel, height, width, 1, max_value); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_deconv_filter(framework::Tensor *filter_tensor, float max_value, int group_num, int stride) { @@ -213,6 +222,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value, framework::make_ddim({num, channel, height, width}); filter_tensor->Resize(dims_new); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_bias_scale_array(float **bias_scale_array, @@ -236,6 +246,7 @@ void format_concat_output(framework::Tensor *out, int height, int width, auto ddim = framework::make_ddim({1, sum_channel, height, width}); out->Resize(ddim); out->reset_data_ptr(data_ptr); + out->set_type(typeid(half)); } void format_conv_data(framework::Tensor 
*filter_tensor, framework::Tensor *ofm_tensor, float **bs_ptr, @@ -447,9 +458,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float *bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); - auto out_ptr = out->data(); + auto input_ptr = input->data(); + auto filter_ptr = filter->data(); + auto out_ptr = out->data(); auto deleter = [](void *p) { fpga_free(p); }; arg->group_num = (uint32_t)group_num; @@ -571,8 +582,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float *bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); + auto input_ptr = input->data(); + auto filter_ptr = filter->data(); auto deleter = [](void *p) { fpga_free(p); }; arg->group_num = (uint32_t)group_num; @@ -603,7 +614,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, framework::DDim dims_out_new = framework::make_ddim( {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width}); fpga::format_fp16_ofm(out, dims_out_new); - auto out_ptr = out->data(); + auto out_ptr = out->data(); arg->output.address = (half *)out_ptr + // NOLINT omit_size * sizeof(half) * @@ -793,7 +804,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->split_conv_args[i]->conv_arg[j].output.scale_address), deleter)); } - arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast( + arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast( arg->split_conv_args[i]->conv_arg[j].output.address); arg->split_conv_args[i]->concat_arg.scales_in[j] = arg->split_conv_args[i]->conv_arg[j].output.scale_address; @@ -818,9 +829,9 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int 
stride_h, int stride_w, int padding_h, int padding_w, float *bias_ptr) { - auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); + auto filter_ptr = filter->data(); + auto input_ptr = input->data(); + auto output_ptr = out->mutable_data(); arg->sub_conv_num = 1; // arg->relu_enabled = relu_enabled; arg->output.activation.activation_type = activation_enable; @@ -848,9 +859,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int stride_h, int stride_w, int padding_h, int padding_w, float *bias_ptr) { - auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); + auto filter_ptr = filter->data(); + auto input_ptr = input->data(); auto deleter = [](void *p) { fpga_free(p); }; @@ -885,7 +895,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, framework::DDim dims_out_new = framework::make_ddim( {1, arg->filter_num, real_out_height, real_out_width}); fpga::format_fp16_ofm(out, dims_out_new); - auto out_ptr = out->data(); + auto out_ptr = out->data(); /*====For Addition arg->output.address = diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index c79a5c3a8e7c4f47cd11c2c4af14feb69efed48d..ebba4f3eaf7ff822bae240f8565b4b5f86f1a796 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -22,7 +22,6 @@ namespace fpga { namespace image { void convert_to_hwc(float **data_in, int channel, int height, int width) { - float *tmp = *data_in; float *data_tmp = (float *)fpga_malloc(channel * height * width * sizeof(float)); // NOLINT int64_t amount_per_row = width * channel; @@ -35,33 +34,35 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) { } } *data_in = data_tmp; - fpga_free(tmp); } void align_element_conv(float **data_in, int height, int cw) { int h = 0; int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); - if (align_cw != cw) { - float *tmp = 
*data_in; - float *data_tmp = - (float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT - memset(data_tmp, 0, height * align_cw * sizeof(float)); + float *data_tmp = + (float *)fpga_malloc(height * align_cw * sizeof(float)); // NOLINT - for (h = 0; h < height; h++) { - memcpy((void *)(data_tmp + h * align_cw), // NOLINT - (void *)(*data_in + h * cw), // NOLINT - cw * sizeof(float)); - } + memset(data_tmp, 0, height * align_cw * sizeof(float)); - *data_in = data_tmp; - fpga_free(tmp); + for (h = 0; h < height; h++) { + memcpy((void *)(data_tmp + h * align_cw), // NOLINT + (void *)(*data_in + h * cw), // NOLINT + cw * sizeof(float)); } + + *data_in = data_tmp; } void format_image(float **data_in, int channel, int height, int width) { convert_to_hwc(data_in, channel, height, width); - align_element_conv(data_in, height, channel * width); + int cw = channel * width; + int align_cw = align_to_x(cw, IMAGE_ALIGNMENT); + if (align_cw != cw) { + float *hwc_temp = *data_in; + align_element_conv(data_in, height, channel * width); + fpga_free(hwc_temp); + } fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(float)); } diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp index bf90a3a11926b1f90ed8a659db908a061f79b0e9..ad238c51efb33cc1d3a35bc9d6bc1dc2dcec75dd 100644 --- a/src/fpga/common/fpga_common.cpp +++ b/src/fpga/common/fpga_common.cpp @@ -164,7 +164,7 @@ void fpga_free(void *ptr) { // DLOG << "Address: " << ptr << ", " << size << " bytes freed. 
Total " // << counter << " bytes"; } else { - DLOG << "Invalid pointer"; + DLOG << "Address: " << ptr << " Invalid pointer"; } } void fpga_copy(void *dest, const void *src, size_t num) { diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index 60753e5cde1e39a1dbf4a1016667db748fc6b9f9..826e1c7402585127d6731878737e0edd678e6a76 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -19,17 +19,16 @@ limitations under the License. */ #include #include -namespace paddle_mobile { -namespace fpga { - #ifdef PADDLE_MOBILE_FPGA_V1 -#define IMAGE_ALIGNMENT 16 // Aligned to 16 -#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32 -#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16 -#define BS_NUM_ALIGNMENT 8 -#define BIAS_NUM_ALIGNMENT 16 +#define IMAGE_ALIGNMENT (16) // Aligned to 16 +#define FILTER_NUM_ALIGNMENT (32) // Filter number aligned to 32 +#define FILTER_ELEMENT_ALIGNMENT (16) // Filter element number aligned to 16 +#define BS_NUM_ALIGNMENT (8) +#define BIAS_NUM_ALIGNMENT (16) #endif +namespace paddle_mobile { +namespace fpga { enum DataType { DATA_TYPE_FP32 = 1, DATA_TYPE_FP16 = 0, @@ -49,7 +48,7 @@ enum ActivationType { }; struct ActivationArgs { - enum ActivationType activation_type; + enum ActivationType activation_type = NONE; int16_t leaky_relu_negative_slope; }; diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index 8b54619ae3bcdca081bab02953769406294e5a56..09f02f22fe5165f868aed2ec04366f9f10001d1b 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -84,6 +84,11 @@ Executor::Executor(const Program &program, InitMemory(); } +#ifdef PADDLE_MOBILE_FPGA + program_.scope->EraseVars({"feed", "fetch"}); + program_.scope->print_vars(); +#endif + int count = 0; for (int block_id = 0; block_id < ops_of_block_.size(); ++block_id) { for (auto &op_handler : ops_of_block_[block_id]) { @@ -92,14 +97,6 @@ Executor::Executor(const Program &program, 
ops_list_.push_back(op_handler); } } -#ifdef PADDLE_MOBILE_FPGA - TalorFeedOp(); - DLOG << "TalorFeed finished"; - TalorFetchdOp(); - DLOG << "TalorFetch finished"; - program_.scope->print_vars(); - -#endif } template @@ -451,49 +448,6 @@ std::shared_ptr Executor::GetOutput( } #ifdef PADDLE_MOBILE_FPGA -template -void Executor::TalorFeedOp() { - auto &ops = ops_of_block_[0]; - int num = 0; - program_.scope->EraseVars(std::vector{string("feed")}); - for (auto op : ops) { - if (op->Type() == "feed") { - auto new_name = string("feed") + std::to_string(num++); - auto var = program_.scope->Var(new_name); - auto tensor = var->template GetMutable(); - auto output_map = op->Outputs(); - std::vector out_keys = op->GetOutKeys(); - PADDLE_MOBILE_ENFORCE(!out_keys.empty(), "this op contains no output"); - auto output_tensor = - GetVarValue(out_keys[0], output_map, *(program_.scope)); - tensor->Resize(output_tensor->dims()); - tensor->init(typeid(float)); - op->ChangeNameMap("X", std::vector{new_name}); - } - } -} -template -void Executor::TalorFetchdOp() { - auto &ops = ops_of_block_[0]; - int num = 0; - program_.scope->EraseVars(std::vector{string("fetch")}); - for (auto op : ops) { - if (op->Type() == "fetch") { - auto new_name = string("fetch") + std::to_string(num++); - auto var = program_.scope->Var(new_name); - auto tensor = var->template GetMutable(); - auto input_map = op->Inputs(); - std::vector in_keys = op->GetInputKeys(); - PADDLE_MOBILE_ENFORCE(!in_keys.empty(), "this op contains no input"); - auto input_tensor = - GetVarValue(in_keys[0], input_map, *(program_.scope)); - tensor->Resize(input_tensor->dims()); - tensor->init(typeid(float)); - op->ChangeNameMap("Out", std::vector{new_name}); - } - } -} - template void Executor::InjectVariable(const Tensor &t, std::string var_name) { @@ -509,18 +463,29 @@ void Executor::FeedData(const Tensor &t) { } template -void Executor::FeedData(const std::vector &v) { +void Executor::FeedData(const std::vector &v) { auto 
input_size = v.size(); - PADDLE_MOBILE_ENFORCE(input_size > 0, "Empty input"); - int counter = 0; auto vars = program_.scope->VarContain("feed"); - for (auto var : vars) { - Tensor *feed_tensor = var->template GetMutable(); - feed_tensor->Resize(v[counter].dims()); - feed_tensor->ShareDataWith(v[counter]); - if (++counter > v.size()) { - return; - } + PADDLE_MOBILE_ENFORCE(input_size == vars.size(), + "input data number not correct"); + for (int i = 0; i < input_size; i++) { + auto var = program_.scope->Var("feed", i); + auto feed_tensor = var->template GetMutable(); + feed_tensor->external_data = v[i]; + } +} + +template +void Executor::GetResults(std::vector *v) { + auto output_size = v->size(); + PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output"); + auto vars = program_.scope->VarContain("fetch"); + PADDLE_MOBILE_ENFORCE(output_size == vars.size(), + "output data number not correct"); + for (int i = 0; i < output_size; i++) { + auto var = program_.scope->Var("fetch", i); + auto fetch_tensor = var->template GetMutable(); + (*v)[i] = fetch_tensor->template data(); } } diff --git a/src/framework/executor.h b/src/framework/executor.h index 2bce5c39b5c74e6f0bb8a475a83b4bbc65eaf652..ee285acac3e8bdf500452b6494bb37d79a2089e4 100644 --- a/src/framework/executor.h +++ b/src/framework/executor.h @@ -50,11 +50,10 @@ class Executor { std::shared_ptr GetOutput(const std::string &var_name); #ifdef PADDLE_MOBILE_FPGA - void TalorFeedOp(); - void TalorFetchdOp(); void InjectVariable(const Tensor &t, std::string var_name); void FeedData(const Tensor &t); - void FeedData(const std::vector &v); + void FeedData(const std::vector &v); + void GetResults(std::vector *v); std::shared_ptr FetchResult(int id = -1); void Predict_From_To(int start = 0, int end = -1); void Predict_From(int start); diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index 0d861e542f19e113b9e2f262b3d0d33080c11738..bc01c37751ef0e2acee1cf469c015b321d9c9680 100644 --- 
a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -50,6 +50,9 @@ OperatorBase::OperatorBase(const std::string &type, attrs_(attrs), scope_(scope) { CheckAllInputOutputSet(); +#ifdef PADDLE_MOBILE_FPGA + InsertTensors(); +#endif } template @@ -133,15 +136,19 @@ void OperatorBase::Run() { #ifdef PADDLE_MOBILE_FPGA template -void OperatorBase::ChangeNameMap(string key, std::vector value) { - auto it = inputs_.find(key); - if (it != inputs_.end()) { - inputs_[key] = value; - return; - } - it = outputs_.find(key); - if (it != outputs_.end()) { - inputs_[key] = value; +void OperatorBase::InsertTensors() { + static int feed_num = 0; + static int fetch_num = 0; + if (type_ == "feed") { + auto new_name = string("feed") + std::to_string(feed_num++); + auto var = scope_->Var(new_name); + var->template GetMutable(); + inputs_.at("X") = {string(new_name)}; + } else if (type_ == "fetch") { + auto new_name = string("fetch") + std::to_string(fetch_num++); + auto var = scope_->Var(new_name); + var->template GetMutable(); + outputs_.at("Out") = {string(new_name)}; } } #endif diff --git a/src/framework/operator.h b/src/framework/operator.h index 28bef0eec872a312ef5a4c1c4491bc88cc2ca916..2a68ef802eaabd0c5f2a7f95227ebaa39a9442e7 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -79,6 +79,7 @@ class OperatorBase { } } #ifdef PADDLE_MOBILE_FPGA + void InsertTensors(); void ChangeNameMap(string key, std::vector value); #endif protected: @@ -95,6 +96,7 @@ class OperatorBase { template class OperatorWithKernel : public OperatorBase { public: +#ifndef PADDLE_MOBILE_FPGA1 OperatorWithKernel(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, std::shared_ptr scope) @@ -104,7 +106,25 @@ class OperatorWithKernel : public OperatorBase { kernel_.InitCLHelper(scope->GetCLScpoe()); #endif } - +#else + OperatorWithKernel(const std::string &type, const VariableNameMap inputs, + const VariableNameMap 
&outputs, const AttributeMap &attrs, + std::shared_ptr scope) + : OperatorBase(type, inputs, outputs, attrs, scope) { + static int feed_num = 0; + static int fetch_num = 0; + if (type == "feed") { + auto new_name = string("feed") + std::to_string(feed_num++); + auto var = scope->Var(new_name); + (const_cast(inputs)).at("X") = {string(new_name)}; + } else if (type == "fetch") { + auto new_name = string("fetch") + std::to_string(fetch_num++); + auto var = scope->Var(new_name); + (const_cast(outputs)).at("Out") = {string(new_name)}; + } + param_ = ParamType(inputs, outputs, attrs, *scope); + } +#endif virtual void RunImpl() { this->kernel_.Compute(this->param_); } virtual void InferShape() const = 0; diff --git a/src/framework/tensor.h b/src/framework/tensor.h index afbba4d801e5d5dce2ba2edb1fd78c06ce66029e..c684169ce21474b4c68de9db523035866859818a 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -202,6 +202,10 @@ class Tensor : public TensorBase { inline void reset_data_ptr(void *p) { ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT } + inline void set_type(std::type_index type) { holder_->set_type(type); } + inline void *get_data() { + return (void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get()); + } // NOLINT inline void *init(std::type_index type) { if (holder_ != nullptr) { @@ -217,7 +221,8 @@ class Tensor : public TensorBase { reinterpret_cast(holder_->ptr()) + offset_); } - float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX + float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX + void *external_data = nullptr; // only used for Feed #endif }; diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index edc36514cdb8048c1563119c42c41a556c17ef07..7c391c0bf84c34f0ea884a171e5a014711150d77 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -177,6 +177,23 @@ bool PaddleMobilePredictor::Run( return true; } +template +void PaddleMobilePredictor::FeedData( + const 
std::vector &inputs) { + paddle_mobile_->FeedData(inputs); +} + +template +void PaddleMobilePredictor::GetResults( + std::vector *outputs) { + paddle_mobile_->GetResults(outputs); +} + +template +void PaddleMobilePredictor::Predict_From_To(int start, int end) { + paddle_mobile_->Predict_From_To(start, end); +} + #endif template PaddleMobilePredictor::~PaddleMobilePredictor() { diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h index d1713ea2fe2cb204d2487151c69c050548eb51a3..0cadd71c226b20331c8399d2cfd8873c093a6b84 100644 --- a/src/io/api_paddle_mobile.h +++ b/src/io/api_paddle_mobile.h @@ -35,6 +35,9 @@ class PaddleMobilePredictor : public PaddlePredictor { bool Run(const std::vector& inputs, std::vector* output_data, std::vector* index_data, int batch_size = -1) override; + void FeedData(const std::vector& inputs) override; + void GetResults(std::vector* outputs) override; + void Predict_From_To(int start = 0, int end = -1) override; #endif ~PaddleMobilePredictor() override; diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index 2ba85a58ac37faf9601ed12fb98293a4c46352ea..42509915d13cf7e632ed20c73f1320ec8bac09d1 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -119,6 +119,9 @@ class PaddlePredictor { virtual bool Run(const std::vector& inputs, std::vector* output_data, std::vector* index_data, int batch_size = -1) = 0; + virtual void FeedData(const std::vector& inputs) = 0; + virtual void GetResults(std::vector* outputs) = 0; + virtual void Predict_From_To(int start = 0, int end = -1) = 0; #endif protected: diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index ea76d8a67c7af74a252fecc81ba3f9cf8419c480..0dfa9d0500847c80e78a156b9c82a33d1dfd4a00 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -228,10 +228,14 @@ void PaddleMobile::FeedData(const framework::Tensor &t) { executor_->FeedData(t); } template -void PaddleMobile::FeedData( - const std::vector 
&v) { +void PaddleMobile::FeedData(const std::vector &v) { executor_->FeedData(v); }; +template +void PaddleMobile::GetResults(std::vector *v) { + executor_->GetResults(v); +} + template std::shared_ptr PaddleMobile::FetchResult( int id) { diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index 02a1ed1b50655e243f9be666443ee5179322caf1..d608abcac79d2a5ae79ad375a8cb93d4594d1e8a 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -90,7 +90,8 @@ class PaddleMobile { #ifdef PADDLE_MOBILE_FPGA void InjectVariable(const framework::Tensor &t, std::string var_name); void FeedData(const framework::Tensor &t); - void FeedData(const std::vector &v); + void FeedData(const std::vector &v); + void GetResults(std::vector *v); std::shared_ptr FetchResult(int id = -1); void Predict_From_To(int start = 0, int end = -1); void Predict_From(int start); diff --git a/src/operators/kernel/detection_kernel.h b/src/operators/kernel/detection_kernel.h index de3c5a3a3ddd15f8485c92185c131210ba3899f9..417c68fff7d0e88d2e1fcc1dc8c1f14aa3a4399b 100644 --- a/src/operators/kernel/detection_kernel.h +++ b/src/operators/kernel/detection_kernel.h @@ -103,6 +103,10 @@ class ProposalParam : public OpParam { float nms_thresh_; float min_size_; float eta_; +#ifdef PADDLE_MOBILE_FPGA + std::shared_ptr float_score, float_bbox; + fpga::BypassArgs score_arg, bbox_arg; +#endif }; DECLARE_KERNEL(Proposal, ProposalParam); @@ -133,6 +137,10 @@ class PSRoiPoolParam : public OpParam { int pooled_height_; int pooled_width_; float spatial_scale_; +#ifdef PADDLE_MOBILE_FPGA + std::shared_ptr float_input, float_output; + fpga::BypassArgs input_arg, output_arg; +#endif }; DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam); diff --git a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp index fcea99227155fc8f0f2cac69f53b2b7cb403c751..4e68b5e30ccc53ae84deb0866f982d70e175d8eb 100644 --- 
a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp +++ b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp @@ -23,15 +23,46 @@ namespace operators { template <> bool AnchorGeneratorKernel::Init( AnchorGeneratorParam *param) { - // TODO zhangyang + auto input = param->input_; + auto anchors = param->output_anchors_; + auto anchor_ptr = anchors->mutable_data(); + auto stride = param->stride_; + auto feature_width = input->dims()[3], feature_height = input->dims()[2]; + auto stride_width = stride[0], stride_height = stride[1]; + + int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, + -20, 39, 36, -43, -34, 59, 49, -63, -54, + 79, 69, -96, -77, 112, 93, -137, -118, 153, + 134, -204, -188, 220, 204, -281, -395, 296, 441}; + int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4); + + // DLOG << "feature_height: " << feature_height; + // DLOG << "feature_width: " << feature_width; + // DLOG << "num_anchors: " << num_anchors; + // DLOG << "stride_width: " << stride_width; + // DLOG << "stride_height: " << stride_height; + + for (int h_idx = 0; h_idx < feature_height; ++h_idx) { + for (int w_idx = 0; w_idx < feature_width; ++w_idx) { + // anchors are laid out H x W x num_anchors x 4; index by row-major cell + // (h_idx * feature_width + w_idx), then by anchor idx within the cell + int offset = (h_idx * feature_width + w_idx) * num_anchors * 4; + for (int idx = 0; idx < num_anchors; idx++) { + anchor_ptr[offset + idx * 4 + 0] = + anchors_offset[idx * 4 + 0] + w_idx * stride_width; + anchor_ptr[offset + idx * 4 + 1] = + anchors_offset[idx * 4 + 1] + h_idx * stride_height; + anchor_ptr[offset + idx * 4 + 2] = + anchors_offset[idx * 4 + 2] + w_idx * stride_width; + anchor_ptr[offset + idx * 4 + 3] = + anchors_offset[idx * 4 + 3] + h_idx * stride_height; + } + } + } return true; } template <> void AnchorGeneratorKernel::Compute( - const AnchorGeneratorParam ¶m) { - // TODO(hjchen2) -} + const AnchorGeneratorParam ¶m) {} } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/V1/concat_kernel.cpp b/src/operators/kernel/fpga/V1/concat_kernel.cpp index 
6644bfd83e57a7fd147c0cc6383e64eb2ad79e51..7690f41ad3fbbebf59cd546a24370056eeb123d9 100644 --- a/src/operators/kernel/fpga/V1/concat_kernel.cpp +++ b/src/operators/kernel/fpga/V1/concat_kernel.cpp @@ -38,7 +38,7 @@ bool ConcatKernel::Init(ConcatParam *param) { PADDLE_MOBILE_ENFORCE( input->dims()[2] == height && input->dims()[3] == width, "Image height & width should be unified"); - images_in[i] = (half *)input->data(); // NOLINT + images_in[i] = input->data(); channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT scales_in[i] = input->scale; } @@ -48,7 +48,7 @@ bool ConcatKernel::Init(ConcatParam *param) { concatArgs.image_num = image_num; concatArgs.images_in = images_in; concatArgs.scales_in = scales_in; - concatArgs.image_out = (half *)out->data(); // NOLINT + concatArgs.image_out = out->data(); concatArgs.scale_out = out->scale; concatArgs.channel_num = channel_num; concatArgs.height = height; diff --git a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp index 27eee7e5ba7045473ff035f45236d04e080a692e..a830996524cba9ff05259bf7ccf3a55c99749a87 100644 --- a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp @@ -27,10 +27,10 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); + auto out_ptr = out->mutable_data(); fpga::EWAddArgs ewaddArgs = {0}; // ewaddArgs.relu_enabled = relu_enabled; diff --git a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp index 
fbbe679d4b6a6d4b0ca0a25ebb7aacf93a133943..f36206a8a15451144a00a16aad176ca67c4a4114 100644 --- a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp @@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel::Init( auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); + auto out_ptr = out->mutable_data(); fpga::EWAddArgs ewaddArgs = {0}; // ewaddArgs.relu_enabled = relu_enabled; diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp index 9c6468404e334a5a3002f8702d4f3b9818028f77..89e35f8a42d66aad6734ad6643b1b7204ad207ea 100644 --- a/src/operators/kernel/fpga/V1/feed_kernel.cpp +++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp @@ -19,19 +19,35 @@ namespace operators { template <> bool FeedKernel::Init(FeedParam *param) { - Tensor *output = param->Out(); + auto output = param->Out(); + auto input = const_cast(param->InputX()); + input->init(typeid(float)); + input->Resize(output->dims()); + + if (output->dims().size() != 4) { + auto input_ptr = input->mutable_data(); + size_t size = output->numel() * sizeof(float); + auto p = fpga::fpga_malloc(size); + memcpy(p, input_ptr, size); + output->reset_data_ptr(p); + return true; + } fpga::format_fp16_ofm(output); return true; } template <> void FeedKernel::Compute(const FeedParam ¶m) { - auto input = - reinterpret_cast(const_cast(param.InputX())); + auto output = param.Out(); + auto input = const_cast(param.InputX()); + + if (input->dims().size() != 4) { + return; + } + fpga::format_image(input); auto input_ptr = input->data(); - Tensor *output = param.Out(); - auto output_ptr = output->data(); + auto output_ptr = output->data(); 
fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; @@ -39,7 +55,7 @@ void FeedKernel::Compute(const FeedParam ¶m) { args.output_data_type = fpga::DATA_TYPE_FP16; args.input_layout_type = fpga::LAYOUT_CHW; args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = reinterpret_cast(input_ptr); + args.image.address = input_ptr; args.image.channels = (uint32_t)input->dims()[1]; args.image.height = (uint32_t)input->dims()[2]; args.image.width = (uint32_t)input->dims()[3]; @@ -48,6 +64,8 @@ void FeedKernel::Compute(const FeedParam ¶m) { args.output.address = output_ptr; args.output.scale_address = output->scale; fpga::PerformBypass(args); + + input->external_data = nullptr; } template class FeedKernel; diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp index c00bdf57a259e24669c33f011d7b77eb20d4b308..1cf6f141af88f2b206de6d0f6efe6c7999ac8674 100644 --- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -19,20 +19,15 @@ namespace operators { template <> bool FetchKernel::Init(FetchParam *param) { - Tensor *output = param->Out(); - // fpga::format_fp16_ofm(output); - return true; -} - -template <> -void FetchKernel::Compute(const FetchParam ¶m) { - param.Out()->ShareDataWith(*(param.InputX())); - /*auto input = - reinterpret_cast(const_cast(param.InputX())); - fpga::format_image(input); - auto input_ptr = input->data(); - Tensor *output = param.Out(); - auto output_ptr = output->data(); + auto input = const_cast(param->InputX()); + auto output = param->Out(); + if (input->type() == typeid(float)) { + output->ShareDataWith(*input); + return true; + } + output->init(typeid(float)); + output->Resize(input->dims()); + fpga::format_fp32_ofm(output); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; @@ -40,13 +35,28 @@ void FetchKernel::Compute(const FetchParam ¶m) { args.output_data_type = fpga::DATA_TYPE_FP32; args.input_layout_type = fpga::LAYOUT_CHW; 
args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = reinterpret_cast(input_ptr); - args.image.channels = (uint32_t)input->dims()[1]; - args.image.height = (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : - 1; args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3] - : 1; args.image.pad_height = 0; args.image.pad_width = 0; args.output.address - = output_ptr; args.output.scale_address = output->scale; - fpga::PerformBypass(args);*/ + args.image.address = input->data(); + args.image.channels = (uint32_t)product(input->dims()); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output->data(); + args.output.scale_address = output->scale; + param->fpga_bypass_args = args; + + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + auto input = param.InputX(); + if (input->type() == typeid(float)) { + return; + } + fpga::PerformBypass(param.fpga_bypass_args); + + // TODO: DEalign: get rid of extra 0 } template class FetchKernel; diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp index 8eefc3e9bea0b3662b4c08409f16f86dab60968a..6dd43bf8cb95336d071cee52cfab52838f62ce88 100644 --- a/src/operators/kernel/fpga/V1/pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp @@ -22,10 +22,10 @@ namespace operators { template <> bool PoolKernel::Init(PoolParam *param) { auto *input = const_cast(param->Input()); - auto input_ptr = input->data(); + auto input_ptr = input->data(); Tensor *output = param->Output(); fpga::format_fp16_ofm(output); - auto output_ptr = output->mutable_data(); + auto output_ptr = output->mutable_data(); vector ksize = param->Ksize(); vector strides = param->Strides(); vector paddings = param->Paddings(); diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp index 
d16e057da97eec8d4509eaf8be9f98696a8bed40..4f50d6edb10c2f0cd7f75c4f4395a7b90c993e4a 100644 --- a/src/operators/kernel/fpga/V1/proposal_kernel.cpp +++ b/src/operators/kernel/fpga/V1/proposal_kernel.cpp @@ -14,20 +14,422 @@ limitations under the License. */ #ifdef PROPOSAL_OP +#include #include #include "operators/kernel/detection_kernel.h" namespace paddle_mobile { namespace operators { +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); template <> bool ProposalKernel::Init(ProposalParam *param) { + int post_nms_top_n = param->post_nms_topn_; + int64_t batch = param->scores_->dims()[0]; + auto total = post_nms_top_n * batch; + param->rpn_rois_->mutable_data({total, 4}); + param->rpn_probs_->mutable_data({total, 1}); + + // DLOG << *param->rpn_rois_; + // DLOG << *param->rpn_probs_; + + param->float_bbox = std::make_shared(); + param->float_bbox->Resize(param->bbox_deltas_->dims()); + param->float_bbox->init(typeid(float)); + fpga::format_fp32_ofm(param->float_bbox.get()); + param->float_score = std::make_shared(); + param->float_score->Resize(param->scores_->dims()); + param->float_score->init(typeid(float)); + fpga::format_fp32_ofm(param->float_score.get()); + + auto input = param->bbox_deltas_; + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_HWC; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_bbox->mutable_data(); + args.output.scale_address = param->float_bbox->scale; + param->bbox_arg = args; + + input = param->scores_; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = 
(uint32_t)input->dims()[1]; + args.output.address = param->float_score->mutable_data(); + args.output.scale_address = param->float_score->scale; + param->score_arg = args; + return true; } +void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { + auto *out_data = dst->data(); + auto *to_add_data = src.data(); + size_t size_of_t = framework::SizeOfType(src.type()); + offset *= size_of_t; + std::memcpy( + reinterpret_cast(reinterpret_cast(out_data) + offset), + to_add_data, src.numel() * size_of_t); +} + +template +static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas, + Tensor *variances, Tensor *proposals) { + T *proposals_data = proposals->mutable_data(); + + int64_t row = all_anchors->dims()[0]; + int64_t len = all_anchors->dims()[1]; + + auto *bbox_deltas_data = bbox_deltas->data(); + auto *anchor_data = all_anchors->data(); + const T *variances_data = nullptr; + if (variances) { + variances_data = variances->data(); + } + + for (int64_t i = 0; i < row; ++i) { + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len + 3] - anchor_data[i * len + 1] + 1.0; + + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; + + T bbox_center_x = 0, bbox_center_y = 0; + T bbox_width = 0, bbox_height = 0; + + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } else { + bbox_center_x = + bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + 
bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } + + proposals_data[i * len] = bbox_center_x - bbox_width / 2; + proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + } + // return proposals; +} + +template +static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) { + T *boxes_data = boxes->mutable_data(); + const T *im_info_data = im_info.data(); + T zero(0); + for (int64_t i = 0; i < boxes->numel(); ++i) { + if (i % 4 == 0) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); + } else if (i % 4 == 1) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); + } else if (i % 4 == 2) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); + } else { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); + } + } +} + +template +static inline void FilterBoxes(Tensor *boxes, float min_size, + const Tensor &im_info, Tensor *keep) { + const T *im_info_data = im_info.data(); + T *boxes_data = boxes->mutable_data(); + T im_scale = im_info_data[2]; + keep->Resize({boxes->dims()[0]}); + min_size = std::max(min_size, 1.0f); + int *keep_data = keep->mutable_data(); + + int keep_len = 0; + for (int i = 0; i < boxes->dims()[0]; ++i) { + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T ws_origin_scale = + (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; + T hs_origin_scale = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr 
= boxes_data[4 * i + 1] + hs / 2; + if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && + x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } + keep->Resize({keep_len}); +} + +template +static inline std::vector> GetSortedScoreIndex( + const std::vector &scores) { + std::vector> sorted_indices; + sorted_indices.reserve(scores.size()); + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices.emplace_back(scores[i], i); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices.begin(), sorted_indices.end(), + [](const std::pair &a, const std::pair &b) { + return a.first < b.first; + }); + return sorted_indices; +} + +template +static inline T BBoxArea(const T *box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. 
+ return (w + 1) * (h + 1); + } + } +} + +template +static inline Tensor VectorToTensor(const std::vector &selected_indices, + int selected_num) { + Tensor keep_nms; + keep_nms.Resize({selected_num}); + auto *keep_data = keep_nms.mutable_data(); + for (int i = 0; i < selected_num; ++i) { + keep_data[i] = selected_indices[i]; + } + return keep_nms; +} + +template +static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1); + const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1); + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold, + float eta) { + int64_t num_boxes = bbox->dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox->dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores->data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices = + GetSortedScoreIndex(scores_data); + + std::vector selected_indices; + int selected_num = 0; + T adaptive_threshold = nms_threshold; + const T *bbox_data = bbox->data(); + while (sorted_indices.size() != 0) { + int idx = sorted_indices.back().second; + bool flag = true; + for (int kept_idx : selected_indices) { + if (flag) { + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= adaptive_threshold); + } else { + break; + } + } + if (flag) { + 
selected_indices.push_back(idx); + ++selected_num; + } + sorted_indices.erase(sorted_indices.end() - 1); + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + return VectorToTensor(selected_indices, selected_num); +} + +template +std::pair ProposalForOneImage( + const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas_slice, // [M, 4] + const Tensor &scores_slice, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) { + auto *scores_data = scores_slice.data(); + + // Sort index + Tensor index_t; + index_t.Resize({scores_slice.numel()}); + int *index = index_t.mutable_data(); + for (int i = 0; i < scores_slice.numel(); ++i) { + index[i] = i; + } + auto compare = [scores_data](const int64_t &i, const int64_t &j) { + return scores_data[i] > scores_data[j]; + }; + + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { + std::sort(index, index + scores_slice.numel(), compare); + } else { + std::nth_element(index, index + pre_nms_top_n, index + scores_slice.numel(), + compare); + index_t.Resize({pre_nms_top_n}); + } + + Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.mutable_data({index_t.numel(), 1}); + bbox_sel.mutable_data({index_t.numel(), 4}); + anchor_sel.mutable_data({index_t.numel(), 4}); + var_sel.mutable_data({index_t.numel(), 4}); + + Tensor proposals; + proposals.mutable_data({index_t.numel(), 4}); + BoxCoder(&anchor_sel, &bbox_sel, &var_sel, &proposals); + + ClipTiledBoxes(im_info_slice, &proposals); + + Tensor keep; + FilterBoxes(&proposals, min_size, im_info_slice, &keep); + + Tensor scores_filter; + bbox_sel.mutable_data({keep.numel(), 4}); + scores_filter.mutable_data({keep.numel(), 1}); + + if (nms_thresh <= 0) { + return std::make_pair(bbox_sel, scores_filter); + } + + Tensor keep_nms = NMS(&bbox_sel, &scores_filter, nms_thresh, eta); + + if (post_nms_top_n > 0 && post_nms_top_n < 
keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + proposals.mutable_data({keep_nms.numel(), 4}); + scores_sel.mutable_data({keep_nms.numel(), 1}); + + return std::make_pair(proposals, scores_sel); +} + template <> void ProposalKernel::Compute(const ProposalParam ¶m) { - // TODO(hjchen2) + auto score_tensor = param.float_score.get(); + fpga::PerformBypass(param.score_arg); + fpga::fpga_invalidate(score_tensor->data(), + score_tensor->numel() * sizeof(float)); + + auto bbox_tensor = param.float_bbox.get(); + fpga::PerformBypass(param.bbox_arg); + fpga::fpga_invalidate(bbox_tensor->data(), + bbox_tensor->numel() * sizeof(float)); + + auto *scores = param.float_score.get(); + auto *bbox_deltas = param.float_bbox.get(); + auto *im_info = param.im_info_; + auto anchors = *param.anchors_; + auto variances = *param.variances_; + + auto *rpn_rois = param.rpn_rois_; + auto *rpn_roi_probs = param.rpn_probs_; + + int pre_nms_top_n = param.pre_nms_topn_; + int post_nms_top_n = param.post_nms_topn_; + float nms_thresh = param.nms_thresh_; + float min_size = param.min_size_; + float eta = param.eta_; + + auto &scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto &bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + // + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}); + scores_swap.mutable_data({num, h_score, w_score, c_score}); + + framework::LoD lod; + lod.resize(1); + auto &lod0 = lod[0]; + lod0.push_back(0); + anchors.Resize({anchors.numel() / 4, 4}); + + int64_t num_proposals = 0; + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + 
bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair tensor_pair = ProposalForOneImage( + im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice, + pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); + Tensor &proposals = tensor_pair.first; + Tensor &scores = tensor_pair.second; + + AppendProposals(rpn_rois, 4 * num_proposals, proposals); + AppendProposals(rpn_roi_probs, num_proposals, scores); + num_proposals += proposals.dims()[0]; + lod0.push_back(num_proposals); + } + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp index 8bb1de6b13c90c0b5b3d0bbeaa0106ad2e9be30a..97e820e83c434dc4d552a7b0e83329fc5f6d6888 100644 --- a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp @@ -14,6 +14,7 @@ limitations under the License. 
*/ #ifdef PSROI_POOL_OP +#include #include #include "operators/kernel/detection_kernel.h" @@ -21,13 +22,180 @@ namespace paddle_mobile { namespace operators { template <> -bool PSRoiPoolKernel::Init(PSRoiPoolParam *param) { +bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { + auto dims = param->input_x_->dims(); + PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, + "data not aligned"); + + param->float_input = std::make_shared(); + param->float_input->mutable_data(param->input_x_->dims()); + param->float_output = std::make_shared(); + param->float_output->mutable_data(param->output_->dims()); + + auto input = param->input_x_; + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_HWC; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_input->mutable_data(); + args.output.scale_address = param->float_input->scale; + param->input_arg = args; + + fpga::format_fp16_ofm(param->output_); + + input = param->float_output.get(); + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->output_->mutable_data(); + args.output.scale_address = param->output_->scale; + param->input_arg = args; + return true; } template <> -void PSRoiPoolKernel::Compute(const PSRoiPoolParam ¶m) { - // TODO(hjchen2) +void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { + auto input_tensor = param.float_input.get(); + fpga::PerformBypass(param.input_arg); + 
fpga::fpga_invalidate(input_tensor->data(), + input_tensor->numel() * sizeof(float)); + + auto* in = input_tensor; + auto* rois = param.input_rois_; + auto* out = param.float_output.get(); + + auto pooled_height = param.pooled_height_; + auto pooled_width = param.pooled_width_; + auto spatial_scale = param.spatial_scale_; + auto output_channels = param.output_channels_; + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + // TODO auto in_stride = framework::stride(in_dims); + // TODO auto out_stride = framework::stride(out->dims()); + auto in_stride = + framework::stride({batch_size, height, width, input_channels}); + auto out_stride = framework::stride( + {out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]}); + + const float* input_data = in->data(); + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + auto rois_batch_id_data = rois_batch_id_list.mutable_data(); + return; + + PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty"); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_MOBILE_ENFORCE( + rois_batch_size == batch_size, + "the rois_batch_size and input(X) batch_size should be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num, + "the rois_num from input and lod must be the same"); + + PADDLE_MOBILE_ENFORCE( + input_channels == output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + auto output_data = out->mutable_data(); + auto input_rois = 
rois->data(); + + // calculate psroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num; ++n) { + // set roi batch id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + auto offset_input_rois = input_rois + n * 4; + auto roi_start_w = + static_cast(round(offset_input_rois[0])) * spatial_scale; + auto roi_start_h = + static_cast(round(offset_input_rois[1])) * spatial_scale; + auto roi_end_w = + static_cast(round(offset_input_rois[2]) + 1.) * spatial_scale; + auto roi_end_h = + static_cast(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small rois to be 1 x 1 + auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0 + auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f); + + // Compute bin size w and h at input feature map + auto bin_size_h = roi_height / static_cast(pooled_height); + auto bin_size_w = roi_width / static_cast(pooled_width); + DLOG << 3; + + // calculate each pixel of the output feature map. 
+ int out_roi_offset = n * out_stride[0]; + for (int c = 0; c < output_channels; ++c) { + // per category + // int out_plane_offset = out_roi_offset + c * out_stride[1]; + int out_plane_offset = out_roi_offset + c; + for (int ph = 0; ph < pooled_height; ++ph) { + // TODO int out_row_offset = out_plane_offset + ph * + // out_stride[2]; + int out_row_offset = out_plane_offset + ph * out_stride[1]; + for (int pw = 0; pw < pooled_width; ++pw) { + // calculate w and h at input feature map + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = + ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = + ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + wstart = std::min(std::max(wstart, 0), width); + hend = std::min(std::max(hend, 0), height); + wend = std::min(std::max(wend, 0), width); + + // TODO int output_index = out_row_offset + pw; + int output_index = out_row_offset + pw * output_channels; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + // TODO int input_plane_offset = + // TODO roi_batch_id * in_stride[0] + input_channel * + // in_stride[1]; + int input_plane_offset = roi_batch_id * in_stride[0] + input_channel; + auto offset_input_data = input_data + input_plane_offset; + float out_sum = 0.; + bool is_empty = (hend <= hstart) || (wend <= wstart); + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * in_stride[1] + iw * input_channel; + out_sum += offset_input_data[input_index]; + } + } + float bin_area = (hend - hstart) * (wend - wstart); + output_data[output_index] = is_empty ? 0. 
: out_sum / bin_area; + } + } + } + } + fpga::format_image(out); + fpga::PerformBypass(param.output_arg); } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/reshape2_kernel.cpp b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp index 5370df6899cea632dbc2d8f3434760eb7e0c4c38..e92be9124fd8b46c50a3b6a0e2bbb78cf62e47b8 100644 --- a/src/operators/kernel/fpga/V1/reshape2_kernel.cpp +++ b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp @@ -15,18 +15,61 @@ limitations under the License. */ #ifdef RESHAPE2_OP #include "operators/kernel/reshape2_kernel.h" +#include "framework/ddim.h" namespace paddle_mobile { namespace operators { template <> bool Reshape2Kernel::Init(Reshape2Param *param) { + auto input = const_cast(param->InputX()); + auto output = param->Out(); + auto shape = param->Shape(); + output->ShareDataWith(*input); + + auto num_in = framework::product(input->dims()); + auto num_shape = framework::product(framework::make_ddim(shape)); + PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported"); + + for (int i = 0; i < shape.size(); i++) { + if (shape[i] == -1) { + shape[i] = static_cast(-num_in / num_shape); + break; + } + } + output->Resize(framework::make_ddim(shape)); + DLOG << "input: " << input; + DLOG << "output: " << output; + return true; } template <> void Reshape2Kernel::Compute(const Reshape2Param ¶m) { - return; + auto input = const_cast(param.InputX()); + auto output = param.Out(); + auto shape = param.Shape(); + + if (output->type() != typeid(half)) { + DLOG << "wrong type"; + } + + auto num_in = framework::product(input->dims()); + auto num_shape = framework::product(framework::make_ddim(shape)); + PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported"); + + for (int i = 0; i < shape.size(); i++) { + if (shape[i] == -1) { + shape[i] = static_cast(-num_in / num_shape); + break; + } + } + output->Resize(framework::make_ddim(shape)); + if (output->type() != typeid(half)) { + DLOG << "wrong type"; + DLOG << output; + 
} + // } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp index 6c836e2776891f283677287eae54019f0dbef39b..bf36873a1fb442a4d5ff6f57056515009d275cd6 100644 --- a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp +++ b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp @@ -25,7 +25,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { paddle_mobile::fpga::SIGMOID; int16_t leaky_relu_negative_slope = 0; auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); + auto input_ptr = input->data(); auto out = param->Out(); fpga::format_fp16_ofm(out); @@ -38,7 +38,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1; args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = out->data(); + args.output.address = out->data(); args.output.scale_address = out->scale; args.output.activation.activation_type = activation_enable; args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; diff --git a/src/operators/kernel/fpga/V1/slice_kernel.cpp b/src/operators/kernel/fpga/V1/slice_kernel.cpp index ea5c3ebca4b78e6052284df94fe3292e4002fcc7..5d0ac1fe61caa9cce0e1af6f8ac5c53b315573db 100644 --- a/src/operators/kernel/fpga/V1/slice_kernel.cpp +++ b/src/operators/kernel/fpga/V1/slice_kernel.cpp @@ -21,10 +21,37 @@ namespace operators { template <> bool SliceKernel::Init(SliceParam* param) { + auto output = param->output_; + fpga::format_fp16_ofm(output); + DLOG << "input: " << param->input_; + DLOG << "output: " << param->output_; + if (param->input_->type() != typeid(half)) { + DLOG << "wrong type"; + } return true; } template <> -void SliceKernel::Compute(const SliceParam& param) {} +void SliceKernel::Compute(const SliceParam& param) { + // Only support slicing in channel dimension + + auto input = param.input_; + DLOG << input; + int HW = input->dims()[2] * input->dims()[3]; + 
int channel = input->dims()[1]; + auto input_ptr = input->data(); + auto output_ptr = param.output_->data(); + + int start = param.starts_[0], end = param.ends_[0]; + start = start < 0 ? start + channel : start; + end = end < 0 ? end + channel : end; + start = start > channel ? channel : start; + end = end > channel ? channel : end; + int len = end - start; + + for (int i = 0; i < HW; i++) { + memcpy(output_ptr + len * i, input_ptr + i * channel + start, len); + } +} } // namespace operators } // namespace paddle_mobile #endif diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp index 2698fdece49409aec017112e8613a706c248cf48..69308ea5538b01c627b92ef41cc2b3768f7fdd67 100644 --- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -23,49 +23,72 @@ namespace operators { template <> bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); + auto input_ptr = input->data(); auto out = param->Out(); - fpga::format_fp32_ofm(out); + auto float_input = new Tensor; - if (input->dims().size() == 2) { - float_input->mutable_data({1, input->dims()[1]}); - } else if (input->dims().size() == 4) { - float_input->mutable_data( - {1, input->dims()[2], input->dims()[3], input->dims()[1]}); - } else { - DLOG << "wrong dimension of softmax input"; + + PADDLE_MOBILE_ENFORCE(input->dims().size() == 4, + "Softmax should have 4-order input"); + auto dims = framework::vectorize(input->dims()); + auto channel = dims[3]; + if (channel == 1) { // This input is generated by FC op, dims = [N C 1 1] + PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op"); + dims[3] = dims[1]; + dims[1] = 1; + } + input->Resize(framework::make_ddim(dims)); + float_input->Resize(framework::make_ddim(dims)); + + if (channel != 2) { // Use CPU + float_input->init(typeid(float)); + fpga::format_fp32_ofm(float_input); + 
fpga::format_fp32_ofm(out); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input_ptr; + args.image.height = (uint32_t)dims[1]; + args.image.width = (uint32_t)dims[2]; + args.image.channels = (uint32_t)dims[3]; + args.output.address = float_input->data(); + args.output.scale_address = float_input->scale; + param->SetFloatInput(float_input); + param->SetFpgaArgs(args); + } else { // Use FPGA + fpga::format_fp16_ofm(out); + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.image.address = input_ptr; + args.image.height = (uint32_t)input->dims()[1]; + args.image.width = (uint32_t)input->dims()[2]; + args.image.channels = (uint32_t)input->dims()[3]; + args.output.address = out->data(); + args.output.scale_address = out->scale; + args.output.activation.activation_type = fpga::SOFTMAX; + param->SetFpgaArgs(args); } - fpga::format_fp32_ofm(float_input); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_CHW; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input_ptr; - args.image.height = - (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1; - args.image.width = - (input->dims().size() == 4) ? 
(uint32_t)input->dims()[3] : 1; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = float_input->data(); - args.output.scale_address = float_input->scale; - param->SetFloatInput(float_input); - param->SetFpgaArgs(args); return true; } template <> void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { - Tensor *in_x = param.FloatInput(); - Tensor *out = param.Out(); - fpga::PerformBypass(param.FpgaArgs()); - fpga::fpga_invalidate((void *)in_x->data(), // NOLINT - in_x->numel() * sizeof(float)); - // TODO: In general case, 0 should be squeezed before softmax input // NOLINT - math::SoftmaxFuntor()(in_x, out); - fpga::fpga_flush(out->data(), out->memory_size()); + + if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { + Tensor *out = param.Out(); + Tensor *in_x = param.FloatInput(); + fpga::fpga_invalidate(in_x->data(), in_x->numel() * sizeof(float)); + math::SoftmaxFuntor()(in_x, out); + fpga::fpga_flush(out->data(), out->memory_size()); + } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/split_kernel.cpp b/src/operators/kernel/fpga/V1/split_kernel.cpp index b8c0bb3be64d2393b61b0f82375c695000f52b65..35a559ae6475533978188a7f0e018b614db1415c 100644 --- a/src/operators/kernel/fpga/V1/split_kernel.cpp +++ b/src/operators/kernel/fpga/V1/split_kernel.cpp @@ -34,16 +34,18 @@ bool SplitKernel::Init(SplitParam *param) { fpga::fpga_malloc(image_num * sizeof(float *))); auto out_channels = reinterpret_cast( fpga::fpga_malloc(image_num * sizeof(uint32_t))); + DLOG << "input: " << in; for (int i = 0; i < image_num; i++) { fpga::format_fp16_ofm(outs[i]); - images_out[i] = outs[i]->mutable_data(); + DLOG << "output: " << outs[i]; + images_out[i] = outs[i]->mutable_data(); scales_out[i] = outs[i]->scale; out_channels[i] = (uint32_t)sections[i]; } fpga::SplitArgs arg = {0}; arg.image_num = image_num; - arg.image_in = (half *)in->data(); + arg.image_in = in->data(); arg.scale_in = in->scale; arg.images_out = images_out; 
arg.scales_out = scales_out; diff --git a/src/operators/kernel/fpga/V1/tanh_kernel.cpp b/src/operators/kernel/fpga/V1/tanh_kernel.cpp index 216cb726e3fe93e9ebfaf328a9ab4ca0725b6bb1..7b5e2153ceb777325bd80c445b96bcfc2d631303 100644 --- a/src/operators/kernel/fpga/V1/tanh_kernel.cpp +++ b/src/operators/kernel/fpga/V1/tanh_kernel.cpp @@ -22,8 +22,10 @@ namespace operators { template <> bool TanhKernel::Init(TanhParam *param) { auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); + DLOG << "input: " << input; + auto input_ptr = input->data(); auto float_input = new Tensor; + float_input->mutable_data( {1, input->dims()[1], input->dims()[2], input->dims()[3]}); fpga::format_fp32_ofm(float_input); diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp index 48e84707fabb4ccd0618da672b82c5380d9533ba..f74839f1fc06e0b5bf391187f5ecab461f7c00f5 100644 --- a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp +++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp @@ -20,7 +20,21 @@ namespace operators { template <> bool Transpose2Kernel::Init(Transpose2Param *param) { - param->Out()->ShareDataWith(*param->InputX()); + auto input = param->InputX(); + auto output = param->Out(); + auto axis = param->Axis(); + auto dim = input->dims(); + output->ShareDataWith(*input); + + auto dim_v = vectorize(dim); + + for (int i = 0; i < axis.size(); i++) { + dim_v[i] = dim[axis[i]]; + } + output->Resize(framework::make_ddim(dim_v)); + + DLOG << "input: " << input; + DLOG << "output: " << output; return true; } diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 00fbfbc771cfe9329b8ba76f120a5bc304dc80fc..8cd804444a2d8f65d027ecccb240b5ada9aa274f 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -1172,6 +1172,12 @@ class FeedParam : public OpParam { public: FeedParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope 
&scope) { +#ifdef PADDLE_MOBILE_FPGA + static int feed_num = 0; + auto new_name = std::string("feed") + std::to_string(feed_num++); + const_cast(inputs).at("X") = {string(new_name)}; +#endif + input_x_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); auto var = scope.FindVar("batch_size"); @@ -1195,6 +1201,11 @@ class FetchParam : public OpParam { public: FetchParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { +#ifdef PADDLE_MOBILE_FPGA + static int fetch_num = 0; + auto new_name = std::string("fetch") + std::to_string(fetch_num++); + const_cast(outputs).at("Out") = {string(new_name)}; +#endif input_x_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); } @@ -1210,18 +1221,9 @@ class FetchParam : public OpParam { RType *input_x_; Tensor *out_; #ifdef PADDLE_MOBILE_FPGA - - private: - std::shared_ptr float_input_x_; + public: fpga::BypassArgs fpga_bypass_args; - public: - RType *FloatInput() const { - return float_input_x_ == nullptr ? 
input_x_ : float_input_x_.get(); - } - void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } - const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } - void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } #endif }; diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index 1a5daafe2b784b98b102fa2eab04f71c67260d9c..218550ca6b0478bbee985c16c6d3b111171f1745 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width, } } -void dump(std::string filename, const Tensor input_tensor) { - auto dataptr = input_tensor.data(); +void dump(std::string filename, Tensor input_tensor) { + auto dataptr = reinterpret_cast(input_tensor.get_data()); std::ofstream out(filename.c_str()); float result = 0; for (int i = 0; i < input_tensor.numel(); ++i) { @@ -61,12 +61,11 @@ void dump(std::string filename, const Tensor input_tensor) { } out.close(); } -void dump_stride(std::string filename, const Tensor input_tensor, - const int dumpnum) { +void dump_stride(std::string filename, Tensor input_tensor, const int dumpnum) { int c = (input_tensor.dims())[1]; int h = (input_tensor.dims())[2]; int w = (input_tensor.dims())[3]; - auto data_ptr = input_tensor.data(); + auto data_ptr = input_tensor.get_data(); int16_t *data_tmp = (int16_t *)malloc(c * h * w * sizeof(int16_t)); int16_t *data_ptr_16 = (int16_t *)data_ptr; convert_to_chw(&data_ptr_16, c, h, w, data_tmp); @@ -98,9 +97,9 @@ int main() { for (int i = 0; i < 73; i++) { auto tensor_ptr = paddle_mobile.FetchResult(i); std::string saveName = "resnet50_result_" + std::to_string(i); - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data(), + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), tensor_ptr->numel() * sizeof(half)); - dump_stride(saveName, (*tensor_ptr), 20); + // dump_stride(saveName, (*tensor_ptr), 20); // dump(saveName, (*tensor_ptr)); } 
diff --git a/test/fpga/test_rfcn.cpp b/test/fpga/test_rfcn.cpp index f4d7657bf0b06a2954914da56a863e1c417ba176..2af521aa7f50c83aa77bb18921eb45bd06eb7490 100644 --- a/test/fpga/test_rfcn.cpp +++ b/test/fpga/test_rfcn.cpp @@ -23,29 +23,38 @@ limitations under the License. */ #include "fpga/V2/api.h" #endif -// static const char *g_densebox_combine = "../models/densebox"; -static const char *g_densebox_combine = "../models/rfcn"; +void readStream(std::string filename, uint8_t *buf) { + std::ifstream in; + in.open(filename, std::ios::in); + if (!in.is_open()) { + std::cout << "open File Failed." << std::endl; + return; + } + int i = 0; + while (!in.eof()) { + in >> buf[i]; + i++; + } + in.close(); +} + +static const char *g_rfcn_combine = "../models/rfcn"; +const std::string g_image_src_float = "../models/rfcn/data.bin"; int main() { paddle_mobile::fpga::open_device(); paddle_mobile::PaddleMobile paddle_mobile; - // paddle_mobile.SetThreadNum(4); - if (paddle_mobile.Load(std::string(g_densebox_combine) + "/model", - std::string(g_densebox_combine) + "/params", true, - false, 1, true)) { - // std::vector input; - // std::vector dims{1, 3, 512, 1024}; - // GetInput(g_test_image_1x3x224x224_banana, &input, dims); - - // auto vec_result = paddle_mobile.Predict(input, dims); - return 0; - - Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 512, 1024}, static_cast(0), - static_cast(1)); - // readStream(g_image_src_float, - // input_tensor.mutable_data({1, 3, 224, 224})); - paddle_mobile.FeedData(input_tensor); + + if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", + std::string(g_rfcn_combine) + "/params", true, false, + 1, true)) { + float img_info[3] = {768, 1536, 768.0f / 960.0f}; + auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)); + readStream(g_image_src_float, reinterpret_cast(img)); + std::vector v(3, nullptr); + paddle_mobile.FeedData({img_info, img}); paddle_mobile.Predict_To(-1); + paddle_mobile.GetResults(&v); + DLOG << "Computation 
done"; } return 0;