diff --git a/CMakeLists.txt b/CMakeLists.txt
index cd34dd62a408beec5e399ee5f63ed34165352656..bf3809b5810a34b0a7c70a64d9d70359c46ebc98 100644
--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -4,7 +4,7 @@ option(USE_OPENMP "build with openmp support" ON)
 option(USE_EXCEPTION "build with exception" ON)
 option(WITH_LOGGING "print logging for debug" ON)
 option(WITH_SYMBOL "build with all symbols" ON)  # turn off if use jni or ios io
-option(WITH_PROFILE "print op profile for debug" ON)
+option(WITH_PROFILE "print op profile for debug" OFF)
 option(WITH_TEST "build with unit tests" ON)
 
 # select the platform to build
diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp
index 9a408a8f2fbe3c600679ddb2e3eadb493f323165..5c960bbea7f8e65053998a29cd72d7b78f2fb97a 100644
--- a/src/fpga/V1/api.cpp
+++ b/src/fpga/V1/api.cpp
@@ -28,13 +28,22 @@ void format_image(framework::Tensor *image_tensor) {
   auto dims = image_tensor->dims();
   auto channel = dims[1], height = dims[2], width = dims[3];
   auto data_ptr = image_tensor->data();
-  size_t memory_size = channel * height * width * sizeof(float);
-  auto new_data = (float *)fpga_malloc(memory_size);  // NOLINT
-  fpga_copy(new_data, data_ptr, memory_size);
-  image::format_image(&new_data, channel, height, width);
-  image_tensor->reset_data_ptr(new_data);
+  auto external_ptr = reinterpret_cast(image_tensor->external_data);
+  float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
+  float *old_p = p_data;
+  image::format_image(&p_data, channel, height, width);
+  if (old_p != p_data) {
+    image_tensor->reset_data_ptr(p_data);
+  }
 }
 
+void format_ofm(framework::Tensor *ofm_tensor) {
+  if (ofm_tensor->type() == typeid(float)) {
+    format_fp32_ofm(ofm_tensor);
+  } else {
+    format_fp16_ofm(ofm_tensor);
+  }
+}
 void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
   size_t memory_size = 0;
@@ -50,6 +59,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(half));
 }
 
 void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
@@ -67,6 +77,7 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(half));
 }
 void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
@@ -83,6 +94,7 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
   auto p = fpga_malloc(memory_size);
   memset(p, 0, memory_size);
   ofm_tensor->reset_data_ptr(p);
+  ofm_tensor->set_type(typeid(float));
 }
 
 float filter_find_max(framework::Tensor *filter_tensor) {
@@ -139,6 +151,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
   filter::format_filter(&new_data, num, channel, height, width, group_num,
                         max_value);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }
 void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   auto dims = filter_tensor->dims();
@@ -149,6 +162,7 @@ void format_dwconv_filter(framework::Tensor *filter_tensor, float *scale_ptr) {
   fpga_copy(new_data, data_ptr, memory_size);
   filter::format_dwconv_filter(&new_data, num, height, width, scale_ptr);
   filter_tensor->reset_data_ptr(new_data);
+  filter_tensor->set_type(typeid(int8_t));
 }
 
 void format_DWDconv_filter(framework::Tensor *filter_tensor, float *scale_ptr,
@@ -173,6 +187,7 @@ void format_DWDconv_filter(framework::Tensor
*filter_tensor, float *scale_ptr, // framework::make_ddim({num, 1, height, width}); // filter_tensor->Resize(dims_new); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { @@ -187,6 +202,7 @@ void format_fc_filter(framework::Tensor *filter_tensor, float max_value) { filter::format_fc_filter(&new_data, num, channel, height, width, 1, max_value); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_deconv_filter(framework::Tensor *filter_tensor, float max_value, int group_num, int stride) { @@ -213,6 +229,7 @@ void format_deconv_filter(framework::Tensor *filter_tensor, float max_value, framework::make_ddim({num, channel, height, width}); filter_tensor->Resize(dims_new); filter_tensor->reset_data_ptr(new_data); + filter_tensor->set_type(typeid(int8_t)); } void format_bias_scale_array(float **bias_scale_array, @@ -236,6 +253,7 @@ void format_concat_output(framework::Tensor *out, int height, int width, auto ddim = framework::make_ddim({1, sum_channel, height, width}); out->Resize(ddim); out->reset_data_ptr(data_ptr); + out->set_type(typeid(half)); } void format_conv_data(framework::Tensor *filter_tensor, framework::Tensor *ofm_tensor, float **bs_ptr, @@ -447,9 +465,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float *bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); - auto out_ptr = out->data(); + auto input_ptr = input->data(); + auto filter_ptr = filter->data(); + auto out_ptr = out->data(); auto deleter = [](void *p) { fpga_free(p); }; arg->group_num = (uint32_t)group_num; @@ -571,8 +589,8 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int group_num, int stride_h, int stride_w, int padding_h, int padding_w, float *bs_ptr) { - auto input_ptr = input->data(); - auto filter_ptr = filter->data(); + auto input_ptr = input->data(); + auto filter_ptr = filter->data(); auto deleter = [](void *p) { fpga_free(p); }; arg->group_num = (uint32_t)group_num; @@ -603,9 +621,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, framework::DDim dims_out_new = framework::make_ddim( {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width}); fpga::format_fp16_ofm(out, dims_out_new); - auto out_ptr = out->data(); + auto out_ptr = out->data(); arg->output.address = - (half *)out_ptr + // NOLINT + out_ptr + omit_size * sizeof(half) * (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); arg->output.scale_address = out->scale; @@ -695,7 +713,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, } for (int j = 0; j < split_num; ++j) { - // arg->split_conv_args[i]->conv_arg[j].relu_enabled = relu_enabled; arg->split_conv_args[i]->conv_arg[j].output.activation.activation_type = activation_enable; arg->split_conv_args[i] @@ -741,9 +758,9 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, align_to_x(arg->split_conv_args[i]->conv_arg[j].filter_num, FILTER_NUM_ALIGNMENT) * sizeof(int8_t); - auto filter_head = &(( - int8_t *)filter_ptr)[j * element_num * filter_num_per_div + // NOLINT - i * filter_sub_conv_offset]; + auto filter_head = + &filter_ptr[j * element_num * filter_num_per_div + // NOLINT + i * filter_sub_conv_offset]; 
arg->split_conv_args[i]->conv_arg[j].filter_address = fpga_malloc(filter_size); arg->split_conv_args[i]->vector_conv_space.push_back( @@ -793,7 +810,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->split_conv_args[i]->conv_arg[j].output.scale_address), deleter)); } - arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast( + arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast( arg->split_conv_args[i]->conv_arg[j].output.address); arg->split_conv_args[i]->concat_arg.scales_in[j] = arg->split_conv_args[i]->conv_arg[j].output.scale_address; @@ -818,9 +835,13 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int stride_h, int stride_w, int padding_h, int padding_w, float *bias_ptr) { - auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); + auto deleter = [](void *p) { fpga_free(p); }; + arg->vector_dwconv_space.push_back( + std::shared_ptr(reinterpret_cast(bias_ptr), deleter)); + + auto filter_ptr = filter->data(); + auto input_ptr = input->data(); + auto output_ptr = out->mutable_data(); arg->sub_conv_num = 1; // arg->relu_enabled = relu_enabled; arg->output.activation.activation_type = activation_enable; @@ -848,9 +869,8 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, int16_t leaky_relu_negative_slope, int stride_h, int stride_w, int padding_h, int padding_w, float *bias_ptr) { - auto filter_ptr = filter->data(); - auto input_ptr = input->data(); - auto output_ptr = out->mutable_data(); + auto filter_ptr = filter->data(); + auto input_ptr = input->data(); auto deleter = [](void *p) { fpga_free(p); }; @@ -885,7 +905,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input, framework::DDim dims_out_new = framework::make_ddim( {1, arg->filter_num, real_out_height, real_out_width}); fpga::format_fp16_ofm(out, dims_out_new); - auto out_ptr = out->data(); + auto out_ptr = out->data(); /*====For Addition arg->output.address = diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index 05a30ddce4828bf8ac0f049ea0db4f18dc1dba79..33a5d3d33fe610f872f2e0846cd99f2b42d589f3 100644 --- a/src/fpga/V1/api.h +++ b/src/fpga/V1/api.h @@ -23,6 +23,7 @@ namespace paddle_mobile { namespace fpga { void format_image(framework::Tensor* image_tensor); +void format_ofm(framework::Tensor* ofm_tensor); void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims); void format_fp32_ofm(framework::Tensor* ofm_tensor); diff --git a/src/fpga/V1/deconv_filter.cpp b/src/fpga/V1/deconv_filter.cpp index 7c87452f5a7264ad069d8508cb1e9dc24f5cdc3d..36a02578bca6698b510c18947d1e8463108cad8b 100644 --- a/src/fpga/V1/deconv_filter.cpp +++ b/src/fpga/V1/deconv_filter.cpp @@ -247,6 +247,7 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, fpga_copy(ptr_space + i * align_offset, ptr_tmp, align_offset); fpga_free(ptr_tmp); } + fpga_free(ptr_ptr_data); *data_in = reinterpret_cast(ptr_space); /* { diff --git a/src/fpga/V1/image.cpp b/src/fpga/V1/image.cpp index c79a5c3a8e7c4f47cd11c2c4af14feb69efed48d..ebba4f3eaf7ff822bae240f8565b4b5f86f1a796 100644 --- a/src/fpga/V1/image.cpp +++ b/src/fpga/V1/image.cpp @@ -22,7 +22,6 @@ namespace fpga { namespace image { void convert_to_hwc(float **data_in, int channel, int height, int width) { - float *tmp = *data_in; float *data_tmp = (float *)fpga_malloc(channel * height * width 
                                        * sizeof(float));  // NOLINT
   int64_t amount_per_row = width * channel;
@@ -35,33 +34,35 @@ void convert_to_hwc(float **data_in, int channel, int height, int width) {
     }
   }
   *data_in = data_tmp;
-  fpga_free(tmp);
 }
 
 void align_element_conv(float **data_in, int height, int cw) {
   int h = 0;
   int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
-  if (align_cw != cw) {
-    float *tmp = *data_in;
-    float *data_tmp =
-        (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT
-    memset(data_tmp, 0, height * align_cw * sizeof(float));
+  float *data_tmp =
+      (float *)fpga_malloc(height * align_cw * sizeof(float));  // NOLINT
 
-    for (h = 0; h < height; h++) {
-      memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
-             (void *)(*data_in + h * cw),        // NOLINT
-             cw * sizeof(float));
-    }
+  memset(data_tmp, 0, height * align_cw * sizeof(float));
 
-    *data_in = data_tmp;
-    fpga_free(tmp);
+  for (h = 0; h < height; h++) {
+    memcpy((void *)(data_tmp + h * align_cw),  // NOLINT
+           (void *)(*data_in + h * cw),        // NOLINT
+           cw * sizeof(float));
   }
+
+  *data_in = data_tmp;
 }
 
 void format_image(float **data_in, int channel, int height, int width) {
   convert_to_hwc(data_in, channel, height, width);
-  align_element_conv(data_in, height, channel * width);
+  int cw = channel * width;
+  int align_cw = align_to_x(cw, IMAGE_ALIGNMENT);
+  if (align_cw != cw) {
+    float *hwc_temp = *data_in;
+    align_element_conv(data_in, height, channel * width);
+    fpga_free(hwc_temp);
+  }
   fpga_flush(*data_in, align_to_x(channel * width, IMAGE_ALIGNMENT) * height *
                            sizeof(float));
 }
diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp
index 5a81e2422979f08b2113bd9b46022fe4d77154cb..37feeb9dfa1a0e9a8c4dc9f789c0ab673e0f4d65 100644
--- a/src/fpga/V1/pe.cpp
+++ b/src/fpga/V1/pe.cpp
@@ -290,14 +290,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
   reg_writeq(args.driver.deconv_param, 0xd18);
   reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
   reg_writeq(args.driver.cmd, REG_CONV_CMD);
-  DLOG << "before reg poll";
   if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
     g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
     ret = -EIO;
     DLOG << "Conv Wait Irq Timeout!";
   }
-  DLOG << "after reg poll";
-
   output_scale = reg_readq(REG_SCALE_PARAMETER);
   output_scale = (output_scale << 32) | (output_scale >> 32);
   fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
diff --git a/src/fpga/common/fpga_common.cpp b/src/fpga/common/fpga_common.cpp
index bf90a3a11926b1f90ed8a659db908a061f79b0e9..ad238c51efb33cc1d3a35bc9d6bc1dc2dcec75dd 100644
--- a/src/fpga/common/fpga_common.cpp
+++ b/src/fpga/common/fpga_common.cpp
@@ -164,7 +164,7 @@ void fpga_free(void *ptr) {
     //  DLOG << "Address: " << ptr << ", " << size << " bytes freed. Total "
     //          << counter << " bytes";
   } else {
-    DLOG << "Invalid pointer";
+    DLOG << "Address: " << ptr << " Invalid pointer";
   }
 }
 void fpga_copy(void *dest, const void *src, size_t num) {
diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h
index 60753e5cde1e39a1dbf4a1016667db748fc6b9f9..898e76a65425c357a00e76eaedf39c003c9603f3 100644
--- a/src/fpga/common/fpga_common.h
+++ b/src/fpga/common/fpga_common.h
@@ -19,17 +19,16 @@ limitations under the License. */
 #include 
 #include 
 
-namespace paddle_mobile {
-namespace fpga {
-
 #ifdef PADDLE_MOBILE_FPGA_V1
-#define IMAGE_ALIGNMENT 16           // Aligned to 16
-#define FILTER_NUM_ALIGNMENT 32      // Filter number aligned to 32
-#define FILTER_ELEMENT_ALIGNMENT 16  // Filter element number aligned to 16
-#define BS_NUM_ALIGNMENT 8
-#define BIAS_NUM_ALIGNMENT 16
+#define IMAGE_ALIGNMENT (16)           // Aligned to 16
+#define FILTER_NUM_ALIGNMENT (32)      // Filter number aligned to 32
+#define FILTER_ELEMENT_ALIGNMENT (16)  // Filter element number aligned to 16
+#define BS_NUM_ALIGNMENT (8)
+#define BIAS_NUM_ALIGNMENT (16)
 #endif
 
+namespace paddle_mobile {
+namespace fpga {
 enum DataType {
   DATA_TYPE_FP32 = 1,
   DATA_TYPE_FP16 = 0,
@@ -49,7 +48,7 @@ enum ActivationType {
 };
 
 struct ActivationArgs {
-  enum ActivationType activation_type;
+  enum ActivationType activation_type = NONE;
   int16_t leaky_relu_negative_slope;
 };
 
@@ -188,6 +187,7 @@ struct SplitArgs {
   uint32_t* out_channel_nums;
   uint32_t height;
   uint32_t width;
+  std::vector> vector_split_space;
 };
 
 struct PoolingArgs {
@@ -237,6 +237,7 @@ struct DWconvArgs {
   struct KernelArgs kernel;
   struct ImageInputArgs image;
   struct ImageOutputArgs output;
+  std::vector> vector_dwconv_space;
 };
 
 struct DWDeconvArgs {
diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp
index 44351b12561adc27b4b01fbafd2559f4f5fe9d54..203effd03d7c63f065df9ae06c337446e17ba73a 100644
--- a/src/framework/executor.cpp
+++ b/src/framework/executor.cpp
@@ -83,6 +83,11 @@ Executor::Executor(const Program &program,
   // resize feed and fetch list
   InitFeedFetchList();
 
+#ifdef PADDLE_MOBILE_FPGA
+  program_.scope->EraseVars({"feed", "fetch"});
+  program_.scope->print_vars();
+#endif
+
   int count = 0;
   for (auto &op_handler : ops_of_block0_) {
     DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
@@ -291,6 +296,7 @@ template 
 bool Executor::varInputMemory(
     const std::shared_ptr &var_desc, Variable *var) const {
 #ifdef PADDLE_MOBILE_FPGA
+  framework::LoDTensor *tensor = var->template GetMutable();
   tensor->init(typeid(float));
   return true;
 #endif
@@ -506,14 +512,41 @@ template 
 void Executor::InjectVariable(const Tensor &t,
                               std::string var_name) {
   Variable *g_feed_value = program_.scope->Var(var_name);
-  Tensor *feed_tensor = g_feed_value->GetMutable();
+  Tensor *feed_tensor = g_feed_value->template GetMutable();
   feed_tensor->Resize(t.dims());
   feed_tensor->ShareDataWith(t);
 }
 
 template 
 void Executor::FeedData(const Tensor &t) {
-  InjectVariable(t, "feed");
+  InjectVariable(t, "feed0");
+}
+
+template 
+void Executor::FeedData(const std::vector &v) {
+  auto input_size = v.size();
+  auto vars = program_.scope->VarContain("feed");
+  PADDLE_MOBILE_ENFORCE(input_size == vars.size(),
+                        "input data number not correct");
+  for (int i = 0; i < input_size; i++) {
+    auto var = program_.scope->Var("feed", i);
+    auto feed_tensor = var->template GetMutable();
+    feed_tensor->external_data = v[i];
+  }
+}
+
+template 
+void Executor::GetResults(std::vector *v) {
+  auto output_size = v->size();
+  PADDLE_MOBILE_ENFORCE(output_size > 0, "Empty output");
+  auto vars = program_.scope->VarContain("fetch");
+  PADDLE_MOBILE_ENFORCE(output_size == vars.size(),
+                        "output data number not correct");
+  for (int i = 0; i < output_size; i++) {
+    auto var = program_.scope->Var("fetch", i);
+    auto fetch_tensor = var->template GetMutable();
+    (*v)[i] = fetch_tensor->template data();
+  }
 }
 
 template 
diff --git a/src/framework/executor.h b/src/framework/executor.h
index
045e6a83e89ffc83905f0cc1925484f715796261..a706af54f9ab3c7b165993d4ffe9e627ed68a6a3 100644 --- a/src/framework/executor.h +++ b/src/framework/executor.h @@ -52,6 +52,8 @@ class Executor { #ifdef PADDLE_MOBILE_FPGA void InjectVariable(const Tensor &t, std::string var_name); void FeedData(const Tensor &t); + void FeedData(const std::vector &v); + void GetResults(std::vector *v); std::shared_ptr FetchResult(int id = -1); void Predict_From_To(int start = 0, int end = -1); void Predict_From(int start); diff --git a/src/framework/operator.cpp b/src/framework/operator.cpp index b4eea6cb8e9583f7d2bb21c634837bdfbe33ab75..12fc3d7f1439d160e19db5773cead7bff5b4f155 100644 --- a/src/framework/operator.cpp +++ b/src/framework/operator.cpp @@ -50,6 +50,9 @@ OperatorBase::OperatorBase(const std::string &type, attrs_(attrs), scope_(scope) { CheckAllInputOutputSet(); +#ifdef PADDLE_MOBILE_FPGA + InsertTensors(); +#endif } template @@ -133,6 +136,25 @@ void OperatorBase::Run() { } #endif +#ifdef PADDLE_MOBILE_FPGA +template +void OperatorBase::InsertTensors() { + static int feed_num = 0; + static int fetch_num = 0; + if (type_ == "feed") { + auto new_name = string("feed") + std::to_string(feed_num++); + auto var = scope_->Var(new_name); + var->template GetMutable(); + inputs_.at("X") = {string(new_name)}; + } else if (type_ == "fetch") { + auto new_name = string("fetch") + std::to_string(fetch_num++); + auto var = scope_->Var(new_name); + var->template GetMutable(); + outputs_.at("Out") = {string(new_name)}; + } +} +#endif + template class OperatorBase; template class OperatorBase; template class OperatorBase; diff --git a/src/framework/operator.h b/src/framework/operator.h index 6d5c9c404f494ec5527eff32efb35ab671dcf5f6..9b8226c5efb27553d56960762c8400a2d10e6b71 100644 --- a/src/framework/operator.h +++ b/src/framework/operator.h @@ -78,6 +78,9 @@ class OperatorBase { this->scope_->EraseVars(var_names); } } +#ifdef PADDLE_MOBILE_FPGA + void InsertTensors(); +#endif protected: framework::Scope *scope_; @@ -102,7 +105,6 @@ class OperatorWithKernel : public OperatorBase { kernel_.InitCLHelper(scope->GetCLScpoe()); #endif } - virtual void RunImpl() { this->kernel_.Compute(this->param_); } virtual void InferShape() const = 0; diff --git a/src/framework/program/program_desc.cpp b/src/framework/program/program_desc.cpp index 6866ab9c75cb06ad1af86ab99a32d59dfa7b45f5..b66c7a0dcf97ef8517e1122d2834aa992736c6e7 100644 --- a/src/framework/program/program_desc.cpp +++ b/src/framework/program/program_desc.cpp @@ -72,7 +72,8 @@ void ProgramDesc::Description(std::string header) { } } for (auto &attr : op->GetAttrMap()) { - LOG(kLOG_DEBUG2) << "attr name:: " << attr.first; + if (attr.first == "op_callstack") continue; + LOG(kLOG_DEBUG2) << "attr name: " << attr.first; LOG(kLOG_DEBUG3) << "argument - " << attr.second; } } diff --git a/src/framework/scope.cpp b/src/framework/scope.cpp index a1f5789aa52d2a70f54cef5c622c3a15907a4683..5ddb71aaf700b96b0630c1d0a4a8779f3ac1ddcb 100644 --- a/src/framework/scope.cpp +++ b/src/framework/scope.cpp @@ -111,5 +111,29 @@ Variable *Scope::FindVarLocally(const std::string &name) const { return nullptr; } +#ifdef PADDLE_MOBILE_FPGA +Variable *Scope::Var(const std::string &name, const int id) { + return Var(name + std::to_string(id)); +} + +std::vector Scope::VarContain(const std::string substring) { + std::vector v; + for (auto pair : vars_) { + if (pair.first.find(substring) == 0) { + v.push_back(pair.second); + } + } + return v; +} + +void Scope::print_vars() { + DLOG << 
"====================start to print variables================="; + for (auto pair : vars_) { + DLOG << pair.first; + } + DLOG << "==================complete printing variables================"; +} +#endif + } // namespace framework } // namespace paddle_mobile diff --git a/src/framework/scope.h b/src/framework/scope.h index 6b6e638bc4d19610c23f2d6b7f5a5c01890e3dac..08eebf8935abb52d01179837a0c76f24fae3f36d 100644 --- a/src/framework/scope.h +++ b/src/framework/scope.h @@ -75,6 +75,12 @@ class Scope { Variable *FindVarLocally(const std::string &name) const; +#ifdef PADDLE_MOBILE_FPGA + Variable *Var(const std::string &name, const int id); + std::vector VarContain(const std::string substring); + void print_vars(); +#endif + #ifdef PADDLE_MOBILE_CL CLScope *GetCLScpoe() { return cl_scope_; } #endif diff --git a/src/framework/tensor.h b/src/framework/tensor.h index afbba4d801e5d5dce2ba2edb1fd78c06ce66029e..16656c08b866aa4db08481bc4ac91f6b5e86a728 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -202,6 +202,11 @@ class Tensor : public TensorBase { inline void reset_data_ptr(void *p) { ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT } + inline void set_type(std::type_index type) { holder_->set_type(type); } + inline void *get_data() { + return ( + void *)(((PlaceholderImpl *)(holder_.get()))->ptr_.get()); // NOLINT + } inline void *init(std::type_index type) { if (holder_ != nullptr) { @@ -217,7 +222,8 @@ class Tensor : public TensorBase { reinterpret_cast(holder_->ptr()) + offset_); } - float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX + float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX + void *external_data = nullptr; // only used for Feed #endif }; diff --git a/src/io/api_paddle_mobile.cc b/src/io/api_paddle_mobile.cc index dd3b1b7317ecbebc1f6c65da66db65b7368f23f1..7c391c0bf84c34f0ea884a171e5a014711150d77 100644 --- a/src/io/api_paddle_mobile.cc +++ b/src/io/api_paddle_mobile.cc @@ -110,6 +110,91 @@ bool PaddleMobilePredictor::Run( return true; } +#ifdef PADDLE_MOBILE_FPGA +template +bool PaddleMobilePredictor::Run( + const std::vector &inputs, + std::vector *output_data, std::vector *index_data, + int batch_size) { + if (inputs.empty()) { + LOG(kLOG_ERROR) << "At least one output should be set with tensors' names."; + return false; + } + auto input = inputs[0]; + + if (input.shape.size() != 4) { + LOG(kLOG_ERROR) << "input shape not equal to 4!"; + return false; + } + std::vector dims; + for (auto d : input.shape) { + dims.push_back(static_cast(d)); + } + + // use tensor + framework::DDim ddim = + framework::make_ddim({dims[0], dims[1], dims[2], dims[3]}); + + framework::Tensor input_tensor; + input_tensor.Resize(ddim); + int input_length = framework::product(ddim); + auto input_ptr = input_tensor.mutable_data(); + + memcpy(input_ptr, static_cast(input.data.data()), + input_length * sizeof(T)); + paddle_mobile_->Predict(input_tensor); + auto num_result = index_data->size(); + if (output_data->size() != num_result) { + LOG(kLOG_ERROR) << "index and output number don't match"; + return false; + } + + for (int i = 0; i < num_result; i++) { + auto output_tensor = paddle_mobile_->FetchResult((*index_data)[i]); + + if (output_data->empty()) { + LOG(kLOG_ERROR) + << "At least one output should be set with tensors' names."; + return false; + } + + auto &output = (*output_data)[i]; + int output_length = output_tensor->numel(); + std::vector tensor_shape = + framework::vectorize(output_tensor->dims()); + + for (auto d : tensor_shape) { + 
output.shape.push_back(static_cast(d)); + } + + if (output.data.length() < output_length * sizeof(T)) { + output.data.Resize(output_length * sizeof(T)); + } + + memcpy(output.data.data(), output_tensor->template data(), + output_length * sizeof(T)); + } + + return true; +} +template +void PaddleMobilePredictor::FeedData( + const std::vector &inputs) { + paddle_mobile_->FeedData(inputs); +} + +template +void PaddleMobilePredictor::GetResults( + std::vector *outputs) { + paddle_mobile_->GetResults(outputs); +} + +template +void PaddleMobilePredictor::Predict_From_To(int start, int end) { + paddle_mobile_->Predict_From_To(start, end); +} + +#endif template PaddleMobilePredictor::~PaddleMobilePredictor() { paddle_mobile_->Clear(); diff --git a/src/io/api_paddle_mobile.h b/src/io/api_paddle_mobile.h index bca169a2ed7786ce5dbd58ddecf6d637e4c4854c..0cadd71c226b20331c8399d2cfd8873c093a6b84 100644 --- a/src/io/api_paddle_mobile.h +++ b/src/io/api_paddle_mobile.h @@ -31,7 +31,14 @@ class PaddleMobilePredictor : public PaddlePredictor { bool Run(const std::vector& inputs, std::vector* output_data, int batch_size = -1) override; - +#ifdef PADDLE_MOBILE_FPGA + bool Run(const std::vector& inputs, + std::vector* output_data, std::vector* index_data, + int batch_size = -1) override; + void FeedData(const std::vector& inputs) override; + void GetResults(std::vector* outputs) override; + void Predict_From_To(int start = 0, int end = -1) override; +#endif ~PaddleMobilePredictor() override; private: diff --git a/src/io/paddle_inference_api.h b/src/io/paddle_inference_api.h index afbd93dede6b5406f572c3b20b48a5904660e5e3..42509915d13cf7e632ed20c73f1320ec8bac09d1 100644 --- a/src/io/paddle_inference_api.h +++ b/src/io/paddle_inference_api.h @@ -26,8 +26,16 @@ limitations under the License. 
*/ #include #include +// #define PADDLE_MOBILE_FPGA + namespace paddle_mobile { +#ifdef PADDLE_MOBILE_FPGA +namespace fpga { +int open_device(); +} +#endif + enum PaddleDType { FLOAT32, INT64, @@ -107,6 +115,14 @@ class PaddlePredictor { std::string prog_file; std::string param_file; }; +#ifdef PADDLE_MOBILE_FPGA + virtual bool Run(const std::vector& inputs, + std::vector* output_data, + std::vector* index_data, int batch_size = -1) = 0; + virtual void FeedData(const std::vector& inputs) = 0; + virtual void GetResults(std::vector* outputs) = 0; + virtual void Predict_From_To(int start = 0, int end = -1) = 0; +#endif protected: PaddlePredictor() = default; diff --git a/src/io/paddle_mobile.cpp b/src/io/paddle_mobile.cpp index 7ea501fc7582e28180aa464edb950d56e250a741..0b47d595c4a5a02d13524c78866c126d827a5805 100644 --- a/src/io/paddle_mobile.cpp +++ b/src/io/paddle_mobile.cpp @@ -228,6 +228,16 @@ void PaddleMobile::FeedData(const framework::Tensor &t) { executor_->FeedData(t); } +template +void PaddleMobile::FeedData(const std::vector &v) { + executor_->FeedData(v); +} + +template +void PaddleMobile::GetResults(std::vector *v) { + executor_->GetResults(v); +} + template std::shared_ptr PaddleMobile::FetchResult( int id) { diff --git a/src/io/paddle_mobile.h b/src/io/paddle_mobile.h index b651028f29fa10111ccef334ddf41b9fbec46c1e..c0ef24f7f2d4d70c1c6043cc0227dc33a072f2a0 100644 --- a/src/io/paddle_mobile.h +++ b/src/io/paddle_mobile.h @@ -90,6 +90,8 @@ class PaddleMobile { #ifdef PADDLE_MOBILE_FPGA void InjectVariable(const framework::Tensor &t, std::string var_name); void FeedData(const framework::Tensor &t); + void FeedData(const std::vector &v); + void GetResults(std::vector *v); std::shared_ptr FetchResult(int id = -1); void Predict_From_To(int start = 0, int end = -1); void Predict_From(int start); diff --git a/src/operators/detection_ops.cpp b/src/operators/detection_ops.cpp index 38a149a355f089b9c270b00e783ca0a28ae51062..630b672225f139891d136844558f9e418ac54508 100644 --- a/src/operators/detection_ops.cpp +++ b/src/operators/detection_ops.cpp @@ -22,6 +22,7 @@ namespace operators { template void AnchorGeneratorOp::InferShape() const { const auto &input_dims = this->param_.input_->dims(); + // DLOG << "AnchorGenerator input dim =" << input_dims.size(); PADDLE_MOBILE_ENFORCE(input_dims.size() == 4, "The layout of input is NCHW."); const auto &anchor_sizes = this->param_.anchor_sizes_; const auto &aspect_ratios = this->param_.aspect_ratios_; @@ -98,3 +99,15 @@ REGISTER_OPERATOR_CPU(psroi_pool, ops::PSRoiPoolOp); REGISTER_OPERATOR_CPU(roi_perspective_transform, ops::RoiPerspectiveOp); #endif #endif + +#ifdef PADDLE_MOBILE_FPGA +#ifdef ANCHOR_GENERATOR_OP +REGISTER_OPERATOR_FPGA(anchor_generator, ops::AnchorGeneratorOp); +#endif +#ifdef PROPOSAL_OP +REGISTER_OPERATOR_FPGA(generate_proposals, ops::ProposalOp); +#endif +#ifdef PSROI_POOL_OP +REGISTER_OPERATOR_FPGA(psroi_pool, ops::PSRoiPoolOp); +#endif +#endif diff --git a/src/operators/kernel/detection_kernel.h b/src/operators/kernel/detection_kernel.h index de3c5a3a3ddd15f8485c92185c131210ba3899f9..417c68fff7d0e88d2e1fcc1dc8c1f14aa3a4399b 100644 --- a/src/operators/kernel/detection_kernel.h +++ b/src/operators/kernel/detection_kernel.h @@ -103,6 +103,10 @@ class ProposalParam : public OpParam { float nms_thresh_; float min_size_; float eta_; +#ifdef PADDLE_MOBILE_FPGA + std::shared_ptr float_score, float_bbox; + fpga::BypassArgs score_arg, bbox_arg; +#endif }; DECLARE_KERNEL(Proposal, ProposalParam); @@ -133,6 +137,10 @@ class PSRoiPoolParam 
: public OpParam { int pooled_height_; int pooled_width_; float spatial_scale_; +#ifdef PADDLE_MOBILE_FPGA + std::shared_ptr float_input, float_output; + fpga::BypassArgs input_arg, output_arg; +#endif }; DECLARE_KERNEL(PSRoiPool, PSRoiPoolParam); diff --git a/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4e68b5e30ccc53ae84deb0866f982d70e175d8eb --- /dev/null +++ b/src/operators/kernel/fpga/V1/anchor_generator_kernel.cpp @@ -0,0 +1,70 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef ANCHOR_GENERATOR_OP + +#include +#include "operators/kernel/detection_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool AnchorGeneratorKernel::Init( + AnchorGeneratorParam *param) { + auto input = param->input_; + auto anchors = param->output_anchors_; + auto anchor_ptr = anchors->mutable_data(); + auto stride = param->stride_; + auto feature_width = input->dims()[3], feature_height = input->dims()[2]; + auto stride_width = stride[0], stride_height = stride[1]; + + int anchors_offset[] = {-2, -2, 18, 18, -10, -9, 26, 25, -23, + -20, 39, 36, -43, -34, 59, 49, -63, -54, + 79, 69, -96, -77, 112, 93, -137, -118, 153, + 134, -204, -188, 220, 204, -281, -395, 296, 441}; + int num_anchors = sizeof(anchors_offset) / (sizeof(int) * 4); + + // DLOG << "feature_height: " << feature_height; + // DLOG << "feature_width: " << feature_width; + // DLOG << "num_anchors: " << num_anchors; + // DLOG << "stride_width: " << stride_width; + // DLOG << "stride_height: " << stride_height; + + for (int h_idx = 0; h_idx < feature_height; ++h_idx) { + for (int w_idx = 0; w_idx < feature_width; ++w_idx) { + int offset = h_idx * w_idx * num_anchors * 4; + for (int idx = 0; idx < num_anchors; idx++) { + anchor_ptr[offset + 0] = + anchors_offset[idx * 4 + 0] + w_idx * stride_width; + anchor_ptr[offset + 1] = + anchors_offset[idx * 4 + 1] + h_idx * stride_height; + anchor_ptr[offset + 2] = + anchors_offset[idx * 4 + 2] + w_idx * stride_width; + anchor_ptr[offset + 3] = + anchors_offset[idx * 4 + 3] + h_idx * stride_height; + } + } + } + return true; +} + +template <> +void AnchorGeneratorKernel::Compute( + const AnchorGeneratorParam ¶m) {} + +} // namespace operators +} // namespace paddle_mobile + +#endif // ANCHOR_GENERATOR_OP diff --git a/src/operators/kernel/fpga/V1/concat_kernel.cpp b/src/operators/kernel/fpga/V1/concat_kernel.cpp index 6644bfd83e57a7fd147c0cc6383e64eb2ad79e51..7690f41ad3fbbebf59cd546a24370056eeb123d9 100644 --- a/src/operators/kernel/fpga/V1/concat_kernel.cpp +++ b/src/operators/kernel/fpga/V1/concat_kernel.cpp @@ -38,7 +38,7 @@ bool ConcatKernel::Init(ConcatParam *param) { PADDLE_MOBILE_ENFORCE( input->dims()[2] == height && input->dims()[3] == width, "Image height & width should be unified"); - images_in[i] = (half *)input->data(); // NOLINT + images_in[i] = input->data(); channel_num[i] = 
(uint32_t)inputs[i]->dims()[1]; // NOLINT scales_in[i] = input->scale; } @@ -48,7 +48,7 @@ bool ConcatKernel::Init(ConcatParam *param) { concatArgs.image_num = image_num; concatArgs.images_in = images_in; concatArgs.scales_in = scales_in; - concatArgs.image_out = (half *)out->data(); // NOLINT + concatArgs.image_out = out->data(); concatArgs.scale_out = out->scale; concatArgs.channel_num = channel_num; concatArgs.height = height; diff --git a/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp index 3e41efdf76ed5b14d408a1278c7dba0bd1f30a1f..c052805dfdc361965c4fc5068ab386367f087797 100644 --- a/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_bn_kernel.cpp @@ -26,11 +26,11 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); @@ -59,8 +59,6 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; @@ -70,6 +68,9 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); + delete new_scale; + delete new_bias; + return true; } diff --git a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp index b7b99be78acae80c46b9d1bd1f3cb72d5f4a7cfb..a7a93de9baed8711a66665ac9510094811ca44d9 100644 --- a/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_bn_relu_kernel.cpp @@ -27,10 +27,10 @@ bool ConvAddBNReluKernel::Init( paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); vector paddings = param->Paddings(); @@ -60,8 +60,6 @@ bool ConvAddBNReluKernel::Init( bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); const int groups = param->Groups(); if (groups == channel) { @@ -71,6 +69,8 @@ bool ConvAddBNReluKernel::Init( leaky_relu_negative_slope, strides[0], strides[1], paddings[0], paddings[1], new_bias_ptr); param->SetFpgaArgs(dwconv_arg); + fpga::fpga_free(new_scale_ptr); + fpga::fpga_free(bs_ptr); } else { fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; @@ -78,6 +78,8 @@ bool ConvAddBNReluKernel::Init( leaky_relu_negative_slope, param->Groups(), strides[0], strides[1], paddings[0], paddings[1], bs_ptr); param->SetFpgaArgs(conv_arg); + delete new_scale; + delete new_bias; } return true; } diff --git a/src/operators/kernel/fpga/V1/conv_add_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp index 
153be5a4f888c2a39a7b05b9a7fbb72e305acb8d..da16af58f117b2fbb0e4b6442f9496ea9b824317 100644 --- a/src/operators/kernel/fpga/V1/conv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_kernel.cpp @@ -25,10 +25,10 @@ bool ConvAddKernel::Init(FusionConvAddParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], diff --git a/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp index eef35bf74b6b28e3ec0c49d6b7ace0a350f3f194..f1f61da4217d4ecf3ce12e75b9fba3d3447cb4f6 100644 --- a/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_add_relu_kernel.cpp @@ -25,10 +25,10 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], diff --git a/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp index c4c2bf184d536ace31e52defb59e97c154386464..54d99f22d185b0252ad4b5b5b48ceaa1e424b1c6 100644 --- a/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_kernel.cpp @@ -26,8 +26,8 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto filter = const_cast(param->Filter()); + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); @@ -51,8 +51,6 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; @@ -61,6 +59,8 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); + delete new_scale; + delete new_bias; return true; } diff --git a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp index 463c90d1bb0dcd48a7b41aff73b830d14f989c73..eb5b913b730183be88d2470b1f57783aba15eb92 100644 --- a/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/conv_bn_relu_kernel.cpp @@ -26,8 +26,8 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); - auto 
filter = const_cast(param->Filter()); + auto input = const_cast(param->Input()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); @@ -51,8 +51,6 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { bs_ptr[i + channel] = new_scale_ptr[i]; bs_ptr[i] = new_bias_ptr[i]; } - param->SetNewScale(new_scale); - param->SetNewBias(new_bias); fpga::format_conv_data(filter, out, &bs_ptr, param->Groups()); fpga::SplitConvArgs conv_arg = {0}; @@ -61,6 +59,9 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { param->Strides()[0], param->Strides()[1], param->Paddings()[0], param->Paddings()[1], bs_ptr); param->SetFpgaArgs(conv_arg); + + delete new_scale; + delete new_bias; return true; } diff --git a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp index 97a4d5516b52939a3a1d90a22c8050679810d405..41844d008b2c8313fc8f1ac75a00d9864b5a20a5 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp @@ -27,10 +27,10 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], diff --git a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp index f0b29943d7731d716a19cff1e3cfc904d7610c0b..c6fc9d195511ae3218450fa58393ba420444eb92 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp @@ -28,10 +28,10 @@ bool DeconvAddReluKernel::Init( paddle_mobile::fpga::ActivationType activation_enable = paddle_mobile::fpga::LEAKYRELU; int16_t leaky_relu_negative_slope = 0; - auto input = const_cast(param->Input()); + auto input = const_cast(param->Input()); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - auto filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); auto out = param->Output(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], diff --git a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp index 27eee7e5ba7045473ff035f45236d04e080a692e..a830996524cba9ff05259bf7ccf3a55c99749a87 100644 --- a/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_kernel.cpp @@ -27,10 +27,10 @@ bool ElementwiseAddKernel::Init(ElementwiseAddParam *param) { auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); + auto out_ptr = out->mutable_data(); fpga::EWAddArgs ewaddArgs = {0}; // ewaddArgs.relu_enabled = relu_enabled; diff --git a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp index 
fbbe679d4b6a6d4b0ca0a25ebb7aacf93a133943..f36206a8a15451144a00a16aad176ca67c4a4114 100644 --- a/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/elementwise_add_relu_kernel.cpp @@ -28,10 +28,10 @@ bool ElementwiseAddReluKernel::Init( auto *input_x = const_cast(param->InputX()); auto *input_y = const_cast(param->InputY()); auto *out = param->Out(); - auto input_x_ptr = input_x->data(); - auto input_y_ptr = input_y->data(); + auto input_x_ptr = input_x->data(); + auto input_y_ptr = input_y->data(); fpga::format_fp16_ofm(out); - auto out_ptr = out->mutable_data(); + auto out_ptr = out->mutable_data(); fpga::EWAddArgs ewaddArgs = {0}; // ewaddArgs.relu_enabled = relu_enabled; diff --git a/src/operators/kernel/fpga/V1/feed_kernel.cpp b/src/operators/kernel/fpga/V1/feed_kernel.cpp index 9c6468404e334a5a3002f8702d4f3b9818028f77..a4b3ec85f3688066d00b37753a6533a7ef72a552 100644 --- a/src/operators/kernel/fpga/V1/feed_kernel.cpp +++ b/src/operators/kernel/fpga/V1/feed_kernel.cpp @@ -19,19 +19,37 @@ namespace operators { template <> bool FeedKernel::Init(FeedParam *param) { - Tensor *output = param->Out(); + auto output = param->Out(); + int col = param->Col(); + auto input = const_cast(¶m->InputX()->at(col)); + input->init(typeid(float)); + input->Resize(output->dims()); + + if (output->dims().size() != 4) { + auto input_ptr = input->mutable_data(); + size_t size = output->numel() * sizeof(float); + auto p = fpga::fpga_malloc(size); + memcpy(p, input_ptr, size); + output->reset_data_ptr(p); + return true; + } fpga::format_fp16_ofm(output); return true; } template <> void FeedKernel::Compute(const FeedParam ¶m) { - auto input = - reinterpret_cast(const_cast(param.InputX())); + auto output = param.Out(); + int col = param.Col(); + auto input = const_cast(¶m.InputX()->at(col)); + + if (input->dims().size() != 4) { + return; + } + fpga::format_image(input); auto input_ptr = input->data(); - Tensor *output = param.Out(); - auto output_ptr = output->data(); + auto output_ptr = output->data(); fpga::BypassArgs args = {fpga::DATA_TYPE_FP32}; @@ -39,7 +57,7 @@ void FeedKernel::Compute(const FeedParam ¶m) { args.output_data_type = fpga::DATA_TYPE_FP16; args.input_layout_type = fpga::LAYOUT_CHW; args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = reinterpret_cast(input_ptr); + args.image.address = input_ptr; args.image.channels = (uint32_t)input->dims()[1]; args.image.height = (uint32_t)input->dims()[2]; args.image.width = (uint32_t)input->dims()[3]; @@ -48,6 +66,8 @@ void FeedKernel::Compute(const FeedParam ¶m) { args.output.address = output_ptr; args.output.scale_address = output->scale; fpga::PerformBypass(args); + + input->external_data = nullptr; } template class FeedKernel; diff --git a/src/operators/kernel/fpga/V1/fetch_kernel.cpp b/src/operators/kernel/fpga/V1/fetch_kernel.cpp index c00bdf57a259e24669c33f011d7b77eb20d4b308..545fff88168a6cb245cfe4cdfd26d8e3de64a825 100644 --- a/src/operators/kernel/fpga/V1/fetch_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fetch_kernel.cpp @@ -19,20 +19,15 @@ namespace operators { template <> bool FetchKernel::Init(FetchParam *param) { - Tensor *output = param->Out(); - // fpga::format_fp16_ofm(output); - return true; -} - -template <> -void FetchKernel::Compute(const FetchParam ¶m) { - param.Out()->ShareDataWith(*(param.InputX())); - /*auto input = - reinterpret_cast(const_cast(param.InputX())); - fpga::format_image(input); - auto input_ptr = input->data(); - Tensor *output = param.Out(); - auto output_ptr 
= output->data(); + auto input = const_cast(param->InputX()); + int col = param->Col(); + auto output = &(param->Out()->at(col)); + if (input->type() == typeid(float)) { + return true; + } + output->init(typeid(float)); + output->Resize(input->dims()); + fpga::format_fp32_ofm(output); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; @@ -40,13 +35,33 @@ void FetchKernel::Compute(const FetchParam ¶m) { args.output_data_type = fpga::DATA_TYPE_FP32; args.input_layout_type = fpga::LAYOUT_CHW; args.output_layout_type = fpga::LAYOUT_HWC; - args.image.address = reinterpret_cast(input_ptr); - args.image.channels = (uint32_t)input->dims()[1]; - args.image.height = (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : - 1; args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3] - : 1; args.image.pad_height = 0; args.image.pad_width = 0; args.output.address - = output_ptr; args.output.scale_address = output->scale; - fpga::PerformBypass(args);*/ + args.image.address = input->data(); + args.image.channels = (uint32_t)product(input->dims()); + args.image.height = 1; + args.image.width = 1; + args.image.pad_height = 0; + args.image.pad_width = 0; + args.output.address = output->data(); + args.output.scale_address = output->scale; + param->fpga_bypass_args = args; + + return true; +} + +template <> +void FetchKernel::Compute(const FetchParam ¶m) { + auto input = param.InputX(); + if (input->type() == typeid(float)) { + int col = param.Col(); + auto output = &(param.Out()->at(col)); + output->ShareDataWith(*input); + return; + } + fpga::PerformBypass(param.fpga_bypass_args); + fpga::fpga_invalidate(param.fpga_bypass_args.output.address, + param.fpga_bypass_args.image.channels * sizeof(float)); + + // TODO: DEalign: get rid of extra 0 } template class FetchKernel; diff --git a/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp index fadeae324ff8f5160bc5ff410c2e02b09539a01e..944dd20a55cbbec0abda2543c1ea6ea09f17bce8 100644 --- a/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/V1/fusion_fc_kernel.cpp @@ -25,7 +25,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { paddle_mobile::fpga::NONE; int16_t leaky_relu_negative_slope = 0; auto input_x = const_cast(param->InputX()); - auto filter = const_cast(param->InputY()); + auto filter = const_cast(param->InputY()); const Tensor *input_z = param->InputZ(); auto input_z_ptr = input_z->data(); auto out = param->Out(); diff --git a/src/operators/kernel/fpga/V1/pool_kernel.cpp b/src/operators/kernel/fpga/V1/pool_kernel.cpp index 8eefc3e9bea0b3662b4c08409f16f86dab60968a..c249c1a18db7eca9dfe27bbbe8c25ec6acffd7f8 100644 --- a/src/operators/kernel/fpga/V1/pool_kernel.cpp +++ b/src/operators/kernel/fpga/V1/pool_kernel.cpp @@ -21,11 +21,11 @@ namespace operators { template <> bool PoolKernel::Init(PoolParam *param) { - auto *input = const_cast(param->Input()); - auto input_ptr = input->data(); + auto *input = const_cast(param->Input()); + auto input_ptr = input->data(); Tensor *output = param->Output(); fpga::format_fp16_ofm(output); - auto output_ptr = output->mutable_data(); + auto output_ptr = output->mutable_data(); vector ksize = param->Ksize(); vector strides = param->Strides(); vector paddings = param->Paddings(); diff --git a/src/operators/kernel/fpga/V1/proposal_kernel.cpp b/src/operators/kernel/fpga/V1/proposal_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9f5f1134a74ef51dce2c28c73b503328f234a370 --- /dev/null +++ 
b/src/operators/kernel/fpga/V1/proposal_kernel.cpp @@ -0,0 +1,440 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PROPOSAL_OP + +#include +#include +#include +#include "operators/kernel/detection_kernel.h" + +namespace paddle_mobile { +namespace operators { + +static const double kBBoxClipDefault = std::log(1000.0 / 16.0); + +template <> +bool ProposalKernel::Init(ProposalParam *param) { + int post_nms_top_n = param->post_nms_topn_; + int64_t batch = param->scores_->dims()[0]; + auto total = post_nms_top_n * batch; + param->rpn_rois_->mutable_data({total, 4}); + param->rpn_probs_->mutable_data({total, 1}); + + // DLOG << *param->rpn_rois_; + // DLOG << *param->rpn_probs_; + + param->float_bbox = std::make_shared(); + param->float_bbox->Resize(param->bbox_deltas_->dims()); + param->float_bbox->init(typeid(float)); + fpga::format_fp32_ofm(param->float_bbox.get()); + param->float_score = std::make_shared(); + param->float_score->Resize(param->scores_->dims()); + param->float_score->init(typeid(float)); + fpga::format_fp32_ofm(param->float_score.get()); + + auto input = param->bbox_deltas_; + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_HWC; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_bbox->mutable_data(); + args.output.scale_address = param->float_bbox->scale; + param->bbox_arg = args; + + input = param->scores_; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_score->mutable_data(); + args.output.scale_address = param->float_score->scale; + param->score_arg = args; + + return true; +} + +void AppendProposals(Tensor *dst, int64_t offset, const Tensor &src) { + auto *out_data = dst->data(); + auto *to_add_data = src.data(); + size_t size_of_t = framework::SizeOfType(src.type()); + offset *= size_of_t; + std::memcpy( + reinterpret_cast(reinterpret_cast(out_data) + offset), + to_add_data, src.numel() * size_of_t); +} + +template +static inline void BoxCoder(Tensor *all_anchors, Tensor *bbox_deltas, + Tensor *variances, Tensor *proposals) { + T *proposals_data = proposals->mutable_data(); + + int64_t row = all_anchors->dims()[0]; + int64_t len = all_anchors->dims()[1]; + + auto *bbox_deltas_data = bbox_deltas->data(); + auto *anchor_data = all_anchors->data(); + const T *variances_data = nullptr; + if (variances) { + variances_data = variances->data(); + } + + for (int64_t i = 0; i < row; ++i) { + T anchor_width = anchor_data[i * len + 2] - anchor_data[i * len] + 1.0; + T anchor_height = anchor_data[i * len 
+ 3] - anchor_data[i * len + 1] + 1.0; + + T anchor_center_x = anchor_data[i * len] + 0.5 * anchor_width; + T anchor_center_y = anchor_data[i * len + 1] + 0.5 * anchor_height; + + T bbox_center_x = 0, bbox_center_y = 0; + T bbox_width = 0, bbox_height = 0; + + if (variances) { + bbox_center_x = + variances_data[i * len] * bbox_deltas_data[i * len] * anchor_width + + anchor_center_x; + bbox_center_y = variances_data[i * len + 1] * + bbox_deltas_data[i * len + 1] * anchor_height + + anchor_center_y; + bbox_width = std::exp(std::min(variances_data[i * len + 2] * + bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(variances_data[i * len + 3] * + bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } else { + bbox_center_x = + bbox_deltas_data[i * len] * anchor_width + anchor_center_x; + bbox_center_y = + bbox_deltas_data[i * len + 1] * anchor_height + anchor_center_y; + bbox_width = std::exp(std::min(bbox_deltas_data[i * len + 2], + kBBoxClipDefault)) * + anchor_width; + bbox_height = std::exp(std::min(bbox_deltas_data[i * len + 3], + kBBoxClipDefault)) * + anchor_height; + } + + proposals_data[i * len] = bbox_center_x - bbox_width / 2; + proposals_data[i * len + 1] = bbox_center_y - bbox_height / 2; + proposals_data[i * len + 2] = bbox_center_x + bbox_width / 2 - 1; + proposals_data[i * len + 3] = bbox_center_y + bbox_height / 2 - 1; + } + // return proposals; +} + +template +static inline void ClipTiledBoxes(const Tensor &im_info, Tensor *boxes) { + T *boxes_data = boxes->mutable_data(); + const T *im_info_data = im_info.data(); + T zero(0); + for (int64_t i = 0; i < boxes->numel(); ++i) { + if (i % 4 == 0) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); + } else if (i % 4 == 1) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); + } else if (i % 4 == 2) { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[1] - 1), zero); + } else { + boxes_data[i] = + std::max(std::min(boxes_data[i], im_info_data[0] - 1), zero); + } + } +} + +template +static inline void FilterBoxes(Tensor *boxes, float min_size, + const Tensor &im_info, Tensor *keep) { + const T *im_info_data = im_info.data(); + T *boxes_data = boxes->mutable_data(); + T im_scale = im_info_data[2]; + keep->Resize({boxes->dims()[0]}); + min_size = std::max(min_size, 1.0f); + int *keep_data = keep->mutable_data(); + + int keep_len = 0; + for (int i = 0; i < boxes->dims()[0]; ++i) { + T ws = boxes_data[4 * i + 2] - boxes_data[4 * i] + 1; + T hs = boxes_data[4 * i + 3] - boxes_data[4 * i + 1] + 1; + T ws_origin_scale = + (boxes_data[4 * i + 2] - boxes_data[4 * i]) / im_scale + 1; + T hs_origin_scale = + (boxes_data[4 * i + 3] - boxes_data[4 * i + 1]) / im_scale + 1; + T x_ctr = boxes_data[4 * i] + ws / 2; + T y_ctr = boxes_data[4 * i + 1] + hs / 2; + if (ws_origin_scale >= min_size && hs_origin_scale >= min_size && + x_ctr <= im_info_data[1] && y_ctr <= im_info_data[0]) { + keep_data[keep_len++] = i; + } + } + keep->Resize({keep_len}); +} + +template +static inline std::vector> GetSortedScoreIndex( + const std::vector &scores) { + std::vector> sorted_indices; + sorted_indices.reserve(scores.size()); + for (size_t i = 0; i < scores.size(); ++i) { + sorted_indices.emplace_back(scores[i], i); + } + // Sort the score pair according to the scores in descending order + std::stable_sort(sorted_indices.begin(), sorted_indices.end(), + [](const std::pair &a, const std::pair &b) { + return a.first < 
b.first; + }); + return sorted_indices; +} + +template +static inline T BBoxArea(const T *box, bool normalized) { + if (box[2] < box[0] || box[3] < box[1]) { + // If coordinate values are is invalid + // (e.g. xmax < xmin or ymax < ymin), return 0. + return static_cast(0.); + } else { + const T w = box[2] - box[0]; + const T h = box[3] - box[1]; + if (normalized) { + return w * h; + } else { + // If coordinate values are not within range [0, 1]. + return (w + 1) * (h + 1); + } + } +} + +template +static inline Tensor VectorToTensor(const std::vector &selected_indices, + int selected_num) { + Tensor keep_nms; + keep_nms.Resize({selected_num}); + auto *keep_data = keep_nms.mutable_data(); + for (int i = 0; i < selected_num; ++i) { + keep_data[i] = selected_indices[i]; + } + return keep_nms; +} + +template +static inline T JaccardOverlap(const T *box1, const T *box2, bool normalized) { + if (box2[0] > box1[2] || box2[2] < box1[0] || box2[1] > box1[3] || + box2[3] < box1[1]) { + return static_cast(0.); + } else { + const T inter_xmin = std::max(box1[0], box2[0]); + const T inter_ymin = std::max(box1[1], box2[1]); + const T inter_xmax = std::min(box1[2], box2[2]); + const T inter_ymax = std::min(box1[3], box2[3]); + const T inter_w = std::max(T(0), inter_xmax - inter_xmin + 1); + const T inter_h = std::max(T(0), inter_ymax - inter_ymin + 1); + const T inter_area = inter_w * inter_h; + const T bbox1_area = BBoxArea(box1, normalized); + const T bbox2_area = BBoxArea(box2, normalized); + return inter_area / (bbox1_area + bbox2_area - inter_area); + } +} + +template +static inline Tensor NMS(Tensor *bbox, Tensor *scores, T nms_threshold, + float eta) { + int64_t num_boxes = bbox->dims()[0]; + // 4: [xmin ymin xmax ymax] + int64_t box_size = bbox->dims()[1]; + + std::vector scores_data(num_boxes); + std::copy_n(scores->data(), num_boxes, scores_data.begin()); + std::vector> sorted_indices = + GetSortedScoreIndex(scores_data); + + std::vector selected_indices; + int selected_num = 0; + T adaptive_threshold = nms_threshold; + const T *bbox_data = bbox->data(); + while (sorted_indices.size() != 0) { + int idx = sorted_indices.back().second; + bool flag = true; + for (int kept_idx : selected_indices) { + if (flag) { + T overlap = JaccardOverlap(bbox_data + idx * box_size, + bbox_data + kept_idx * box_size, false); + flag = (overlap <= adaptive_threshold); + } else { + break; + } + } + if (flag) { + selected_indices.push_back(idx); + ++selected_num; + } + sorted_indices.erase(sorted_indices.end() - 1); + if (flag && eta < 1 && adaptive_threshold > 0.5) { + adaptive_threshold *= eta; + } + } + return VectorToTensor(selected_indices, selected_num); +} + +template +std::pair ProposalForOneImage( + const Tensor &im_info_slice, const Tensor &anchors, const Tensor &variances, + const Tensor &bbox_deltas_slice, // [M, 4] + const Tensor &scores_slice, // [N, 1] + int pre_nms_top_n, int post_nms_top_n, float nms_thresh, float min_size, + float eta) { + auto *scores_data = scores_slice.data(); + + // Sort index + Tensor index_t; + index_t.Resize({scores_slice.numel()}); + int *index = index_t.mutable_data(); + for (int i = 0; i < scores_slice.numel(); ++i) { + index[i] = i; + } + auto compare = [scores_data](const int64_t &i, const int64_t &j) { + return scores_data[i] > scores_data[j]; + }; + + if (pre_nms_top_n <= 0 || pre_nms_top_n >= scores_slice.numel()) { + std::sort(index, index + scores_slice.numel(), compare); + } else { + std::nth_element(index, index + pre_nms_top_n, index + scores_slice.numel(), + 
compare); + index_t.Resize({pre_nms_top_n}); + } + + Tensor scores_sel, bbox_sel, anchor_sel, var_sel; + scores_sel.mutable_data({index_t.numel(), 1}); + bbox_sel.mutable_data({index_t.numel(), 4}); + anchor_sel.mutable_data({index_t.numel(), 4}); + var_sel.mutable_data({index_t.numel(), 4}); + + Tensor proposals; + proposals.mutable_data({index_t.numel(), 4}); + BoxCoder(&anchor_sel, &bbox_sel, &var_sel, &proposals); + + ClipTiledBoxes(im_info_slice, &proposals); + + Tensor keep; + FilterBoxes(&proposals, min_size, im_info_slice, &keep); + + Tensor scores_filter; + bbox_sel.mutable_data({keep.numel(), 4}); + scores_filter.mutable_data({keep.numel(), 1}); + + if (nms_thresh <= 0) { + return std::make_pair(bbox_sel, scores_filter); + } + + Tensor keep_nms = NMS(&bbox_sel, &scores_filter, nms_thresh, eta); + + if (post_nms_top_n > 0 && post_nms_top_n < keep_nms.numel()) { + keep_nms.Resize({post_nms_top_n}); + } + + proposals.mutable_data({keep_nms.numel(), 4}); + scores_sel.mutable_data({keep_nms.numel(), 1}); + + return std::make_pair(proposals, scores_sel); +} + +template <> +void ProposalKernel::Compute(const ProposalParam ¶m) { + auto score_tensor = param.float_score.get(); + fpga::PerformBypass(param.score_arg); + fpga::fpga_invalidate(score_tensor->data(), + score_tensor->numel() * sizeof(float)); + + auto bbox_tensor = param.float_bbox.get(); + fpga::PerformBypass(param.bbox_arg); + fpga::fpga_invalidate(bbox_tensor->data(), + bbox_tensor->numel() * sizeof(float)); + + auto *scores = param.float_score.get(); + auto *bbox_deltas = param.float_bbox.get(); + auto *im_info = param.im_info_; + auto anchors = *param.anchors_; + auto variances = *param.variances_; + + auto *rpn_rois = param.rpn_rois_; + auto *rpn_roi_probs = param.rpn_probs_; + + int pre_nms_top_n = param.pre_nms_topn_; + int post_nms_top_n = param.post_nms_topn_; + float nms_thresh = param.nms_thresh_; + float min_size = param.min_size_; + float eta = param.eta_; + + auto &scores_dim = scores->dims(); + int64_t num = scores_dim[0]; + int64_t c_score = scores_dim[1]; + int64_t h_score = scores_dim[2]; + int64_t w_score = scores_dim[3]; + + auto &bbox_dim = bbox_deltas->dims(); + int64_t c_bbox = bbox_dim[1]; + int64_t h_bbox = bbox_dim[2]; + int64_t w_bbox = bbox_dim[3]; + + // + Tensor bbox_deltas_swap, scores_swap; + bbox_deltas_swap.mutable_data({num, h_bbox, w_bbox, c_bbox}); + scores_swap.mutable_data({num, h_score, w_score, c_score}); + + framework::LoD lod; + lod.resize(1); + auto &lod0 = lod[0]; + lod0.push_back(0); + anchors.Resize({anchors.numel() / 4, 4}); + + int64_t num_proposals = 0; + for (int64_t i = 0; i < num; ++i) { + Tensor im_info_slice = im_info->Slice(i, i + 1); + Tensor bbox_deltas_slice = bbox_deltas_swap.Slice(i, i + 1); + Tensor scores_slice = scores_swap.Slice(i, i + 1); + + bbox_deltas_slice.Resize({h_bbox * w_bbox * c_bbox / 4, 4}); + scores_slice.Resize({h_score * w_score * c_score, 1}); + + std::pair tensor_pair = ProposalForOneImage( + im_info_slice, anchors, variances, bbox_deltas_slice, scores_slice, + pre_nms_top_n, post_nms_top_n, nms_thresh, min_size, eta); + Tensor &proposals = tensor_pair.first; + Tensor &scores = tensor_pair.second; + + AppendProposals(rpn_rois, 4 * num_proposals, proposals); + AppendProposals(rpn_roi_probs, num_proposals, scores); + num_proposals += proposals.dims()[0]; + lod0.push_back(num_proposals); + } + rpn_rois->set_lod(lod); + rpn_roi_probs->set_lod(lod); + rpn_rois->Resize({num_proposals, 4}); + rpn_roi_probs->Resize({num_proposals, 1}); +} + +} // namespace 
operators +} // namespace paddle_mobile + +#endif // PROPOSAL_OP diff --git a/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..97e820e83c434dc4d552a7b0e83329fc5f6d6888 --- /dev/null +++ b/src/operators/kernel/fpga/V1/psroi_pool_kernel.cpp @@ -0,0 +1,204 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef PSROI_POOL_OP + +#include +#include +#include "operators/kernel/detection_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool PSRoiPoolKernel::Init(PSRoiPoolParam* param) { + auto dims = param->input_x_->dims(); + PADDLE_MOBILE_ENFORCE(dims[1] * dims[3] % IMAGE_ALIGNMENT == 0, + "data not aligned"); + + param->float_input = std::make_shared(); + param->float_input->mutable_data(param->input_x_->dims()); + param->float_output = std::make_shared(); + param->float_output->mutable_data(param->output_->dims()); + + auto input = param->input_x_; + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_HWC; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->float_input->mutable_data(); + args.output.scale_address = param->float_input->scale; + param->input_arg = args; + + fpga::format_fp16_ofm(param->output_); + + input = param->float_output.get(); + args.input_data_type = fpga::DATA_TYPE_FP32; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.image.address = input->data(); + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = param->output_->mutable_data(); + args.output.scale_address = param->output_->scale; + param->input_arg = args; + + return true; +} + +template <> +void PSRoiPoolKernel::Compute(const PSRoiPoolParam& param) { + auto input_tensor = param.float_input.get(); + fpga::PerformBypass(param.input_arg); + fpga::fpga_invalidate(input_tensor->data(), + input_tensor->numel() * sizeof(float)); + + auto* in = input_tensor; + auto* rois = param.input_rois_; + auto* out = param.float_output.get(); + + auto pooled_height = param.pooled_height_; + auto pooled_width = param.pooled_width_; + auto spatial_scale = param.spatial_scale_; + auto output_channels = param.output_channels_; + + auto in_dims = in->dims(); + int batch_size = in_dims[0]; + int input_channels = in_dims[1]; + int height = in_dims[2]; + int width = in_dims[3]; + int rois_num = rois->dims()[0]; + + // TODO auto in_stride = framework::stride(in_dims); + // TODO auto out_stride = framework::stride(out->dims()); + auto in_stride = + framework::stride({batch_size, 
height, width, input_channels}); + auto out_stride = framework::stride( + {out->dims()[0], out->dims()[2], out->dims()[3], out->dims()[1]}); + + const float* input_data = in->data<float>(); + framework::Tensor rois_batch_id_list; + rois_batch_id_list.Resize({rois_num}); + auto rois_batch_id_data = rois_batch_id_list.mutable_data<int>(); + + PADDLE_MOBILE_ENFORCE(rois->NumLevels() > 0, "ROIS should not be empty"); + + auto rois_lod = rois->lod().back(); + int rois_batch_size = rois_lod.size() - 1; + PADDLE_MOBILE_ENFORCE( + rois_batch_size == batch_size, + "the rois_batch_size and input(X) batch_size should be the same."); + int rois_num_with_lod = rois_lod[rois_batch_size]; + PADDLE_MOBILE_ENFORCE(rois_num_with_lod == rois_num, + "the rois_num from input and lod must be the same"); + + PADDLE_MOBILE_ENFORCE( + input_channels == output_channels * pooled_height * pooled_width, + "the channels of input X should equal the product of " + "output_channels x pooled_height x pooled_width"); + + // calculate batch id index for each roi according to LoD + for (int n = 0; n < rois_batch_size; ++n) { + for (size_t i = rois_lod[n]; i < rois_lod[n + 1]; ++i) { + rois_batch_id_data[i] = n; + } + } + auto output_data = out->mutable_data<float>(); + auto input_rois = rois->data<float>(); + + // calculate psroipooling, parallel processing can be implemented per ROI + for (int n = 0; n < rois_num; ++n) { + // set roi batch id + int roi_batch_id = rois_batch_id_data[n]; + + // [start, end) interval for spatial sampling + auto offset_input_rois = input_rois + n * 4; + auto roi_start_w = + static_cast<float>(round(offset_input_rois[0])) * spatial_scale; + auto roi_start_h = + static_cast<float>(round(offset_input_rois[1])) * spatial_scale; + auto roi_end_w = + static_cast<float>(round(offset_input_rois[2]) + 1.) * spatial_scale; + auto roi_end_h = + static_cast<float>(round(offset_input_rois[3]) + 1.) * spatial_scale; + + // Force too small rois to be 1 x 1 + auto roi_height = std::max(roi_end_h - roi_start_h, 0.1f); // avoid 0 + auto roi_width = std::max(roi_end_w - roi_start_w, 0.1f); + + // Compute bin size w and h at input feature map + auto bin_size_h = roi_height / static_cast<float>(pooled_height); + auto bin_size_w = roi_width / static_cast<float>(pooled_width); + + // calculate each pixel of the output feature map.
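+ // Note: the bypassed input and the output buffer are both laid out as NHWC + // (channel varies fastest), so the indexing below uses the {N, H, W, C} + // strides computed above rather than the original NCHW strides kept in the + // TODO comments.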
+ int out_roi_offset = n * out_stride[0]; + for (int c = 0; c < output_channels; ++c) { + // per category + // int out_plane_offset = out_roi_offset + c * out_stride[1]; + int out_plane_offset = out_roi_offset + c; + for (int ph = 0; ph < pooled_height; ++ph) { + // TODO int out_row_offset = out_plane_offset + ph * + // out_stride[2]; + int out_row_offset = out_plane_offset + ph * out_stride[1]; + for (int pw = 0; pw < pooled_width; ++pw) { + // calculate w and h at input feature map + int hstart = floor(static_cast(ph) * bin_size_h + roi_start_h); + int wstart = floor(static_cast(pw) * bin_size_w + roi_start_w); + int hend = + ceil(static_cast(ph + 1) * bin_size_h + roi_start_h); + int wend = + ceil(static_cast(pw + 1) * bin_size_w + roi_start_w); + // Add roi offsets and clip to input boundaries + hstart = std::min(std::max(hstart, 0), height); + wstart = std::min(std::max(wstart, 0), width); + hend = std::min(std::max(hend, 0), height); + wend = std::min(std::max(wend, 0), width); + + // TODO int output_index = out_row_offset + pw; + int output_index = out_row_offset + pw * output_channels; + int input_channel = (c * pooled_height + ph) * pooled_width + pw; + // TODO int input_plane_offset = + // TODO roi_batch_id * in_stride[0] + input_channel * + // in_stride[1]; + int input_plane_offset = roi_batch_id * in_stride[0] + input_channel; + auto offset_input_data = input_data + input_plane_offset; + float out_sum = 0.; + bool is_empty = (hend <= hstart) || (wend <= wstart); + for (int ih = hstart; ih < hend; ++ih) { + for (int iw = wstart; iw < wend; ++iw) { + int input_index = ih * in_stride[1] + iw * input_channel; + out_sum += offset_input_data[input_index]; + } + } + float bin_area = (hend - hstart) * (wend - wstart); + output_data[output_index] = is_empty ? 0. : out_sum / bin_area; + } + } + } + } + fpga::format_image(out); + fpga::PerformBypass(param.output_arg); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif // PSROI_POOL_OP diff --git a/src/operators/kernel/fpga/V1/reshape2_kernel.cpp b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..9e5ce02658adb5fe94935b8d7f4d412405a0727e --- /dev/null +++ b/src/operators/kernel/fpga/V1/reshape2_kernel.cpp @@ -0,0 +1,136 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef RESHAPE2_OP + +#include "operators/kernel/reshape2_kernel.h" +#include "framework/ddim.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) { + auto input = const_cast<LoDTensor *>(param->InputX()); + auto output = param->Out(); + auto shape = param->Shape(); + + auto num_in = framework::product(input->dims()); + auto num_shape = framework::product(framework::make_ddim(shape)); + PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported"); + + for (int i = 0; i < shape.size(); i++) { + if (shape[i] == -1) { + shape[i] = static_cast<int>(-num_in / num_shape); + break; + } + } + output->Resize(framework::make_ddim(shape)); + output->set_type(input->type()); + fpga::format_ofm(output); + DLOG << "input: " << input; + DLOG << "output: " << output; + + return true; +} + +void reshape(LoDTensor *input, LoDTensor *output) { + // Subscript r means after reshape + // Copies between two WC-aligned HWC buffers whose logical CHW shapes differ + // TODO zhangyang verify this function + + float *input_ptr_f, *output_ptr_f; + half *input_ptr_h, *output_ptr_h; + bool is_float = false; + + if (input->type() == typeid(float)) { + input_ptr_f = input->data<float>(); + output_ptr_f = output->data<float>(); + is_float = true; + + } else { + input_ptr_h = input->data<half>(); + output_ptr_h = output->data<half>(); + } + + auto C = static_cast<int>(input->dims()[1]); + auto H = static_cast<int>(input->dims()[2]); + auto W = static_cast<int>(input->dims()[3]); + auto Cr = static_cast<int>(output->dims()[1]); + auto Hr = static_cast<int>(output->dims()[2]); + auto Wr = static_cast<int>(output->dims()[3]); + PADDLE_MOBILE_ENFORCE(C * H * W == Cr * Hr * Wr, "Dims don't match"); + auto WC = W * C; + auto WC_align = fpga::align_to_x(WC, IMAGE_ALIGNMENT); + auto HW = H * W; + auto WCr = Wr * Cr; + auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT); + auto HWr = Hr * Wr; + + int offset_align = 0; + int offset_r = 0, offset_align_r = 0; + int cr = 0, hr = 0, wr = 0; + + for (int h = 0; h < H; h++) { + int offset0 = h * WC_align; + for (int w = 0; w < W; w++) { + int offset1 = w * C + offset0; + for (int c = 0; c < C; c++) { + offset_align = offset1 + c; + offset_r = c * HW + h * W + w;  // linear CHW index of element (c, h, w) + cr = offset_r / HWr; + hr = offset_r % HWr / Wr; + wr = offset_r % Wr; + offset_align_r = hr * WCr_align + wr * Cr + cr; + // DLOG << "hwc"<< h<< " " << w << " " << c; + // DLOG << "hrwrcr" << hr<< " " << wr << " " << cr; + if (is_float) { + output_ptr_f[offset_align_r] = input_ptr_f[offset_align]; + } else { + output_ptr_h[offset_align_r] = input_ptr_h[offset_align]; + } + } + } + } +} + +template <> +void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) { + auto input = const_cast<LoDTensor *>(param.InputX()); + auto output = param.Out(); + auto shape = param.Shape(); + + auto num_in = framework::product(input->dims()); + auto num_shape = framework::product(framework::make_ddim(shape)); + PADDLE_MOBILE_ENFORCE(num_shape != 0, "0 index is not supported"); + + for (int i = 0; i < shape.size(); i++) { + if (shape[i] == -1) { + shape[i] = static_cast<int>(-num_in / num_shape); + break; + } + } + output->Resize(framework::make_ddim(shape)); + if (output->dims() == input->dims()) { + DLOG << "No need to reshape"; + return; + } + + reshape(input, output); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp index 6c836e2776891f283677287eae54019f0dbef39b..bf36873a1fb442a4d5ff6f57056515009d275cd6 100644 --- a/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp +++ 
b/src/operators/kernel/fpga/V1/sigmoid_kernel.cpp @@ -25,7 +25,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { paddle_mobile::fpga::SIGMOID; int16_t leaky_relu_negative_slope = 0; auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); + auto input_ptr = input->data(); auto out = param->Out(); fpga::format_fp16_ofm(out); @@ -38,7 +38,7 @@ bool SigmoidKernel::Init(SigmoidParam *param) { args.image.width = (input->dims().size() == 4) ? (uint32_t)input->dims()[3] : 1; args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = out->data(); + args.output.address = out->data(); args.output.scale_address = out->scale; args.output.activation.activation_type = activation_enable; args.output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope; diff --git a/src/operators/kernel/fpga/V1/slice_kernel.cpp b/src/operators/kernel/fpga/V1/slice_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..5d0ac1fe61caa9cce0e1af6f8ac5c53b315573db --- /dev/null +++ b/src/operators/kernel/fpga/V1/slice_kernel.cpp @@ -0,0 +1,57 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef SLICE_OP + +#include "operators/kernel/slice_kernel.h" + +namespace paddle_mobile { +namespace operators { + +template <> +bool SliceKernel::Init(SliceParam* param) { + auto output = param->output_; + fpga::format_fp16_ofm(output); + DLOG << "input: " << param->input_; + DLOG << "output: " << param->output_; + if (param->input_->type() != typeid(half)) { + DLOG << "wrong type"; + } + return true; +} +template <> +void SliceKernel::Compute(const SliceParam& param) { + // Only support slicing in channel dimension + + auto input = param.input_; + DLOG << input; + int HW = input->dims()[2] * input->dims()[3]; + int channel = input->dims()[1]; + auto input_ptr = input->data(); + auto output_ptr = param.output_->data(); + + int start = param.starts_[0], end = param.ends_[0]; + start = start < 0 ? start + channel : start; + end = end < 0 ? end + channel : end; + start = start > channel ? channel : start; + end = end > channel ? 
channel : end; + int len = end - start; + + for (int i = 0; i < HW; i++) { + memcpy(output_ptr + len * i, input_ptr + i * channel + start, len); + } +} +} // namespace operators +} // namespace paddle_mobile +#endif diff --git a/src/operators/kernel/fpga/V1/softmax_kernel.cpp b/src/operators/kernel/fpga/V1/softmax_kernel.cpp index 2698fdece49409aec017112e8613a706c248cf48..683c5953b3c90bb387dce14b7941764272906ceb 100644 --- a/src/operators/kernel/fpga/V1/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/V1/softmax_kernel.cpp @@ -23,49 +23,72 @@ namespace operators { template <> bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); + auto input_ptr = input->data(); auto out = param->Out(); - fpga::format_fp32_ofm(out); - auto float_input = new Tensor; - if (input->dims().size() == 2) { - float_input->mutable_data({1, input->dims()[1]}); - } else if (input->dims().size() == 4) { - float_input->mutable_data( - {1, input->dims()[2], input->dims()[3], input->dims()[1]}); - } else { - DLOG << "wrong dimension of softmax input"; + + auto float_input = new LoDTensor; + + PADDLE_MOBILE_ENFORCE(input->dims().size() == 4, + "Softmax should have 4-order input"); + auto dims = framework::vectorize(input->dims()); + auto channel = dims[3]; + if (channel == 1) { // This input is generated by FC op, dims = [N C 1 1] + PADDLE_MOBILE_ENFORCE(dims[2] == 1, "Softmax input must come from FC op"); + dims[3] = dims[1]; + dims[1] = 1; + } + input->Resize(framework::make_ddim(dims)); + float_input->Resize(framework::make_ddim(dims)); + + if (channel != 2) { // Use CPU + float_input->init(typeid(float)); + fpga::format_fp32_ofm(float_input); + fpga::format_fp32_ofm(out); + + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP32; + args.image.address = input_ptr; + args.image.height = (uint32_t)dims[1]; + args.image.width = (uint32_t)dims[2]; + args.image.channels = (uint32_t)dims[3]; + args.output.address = float_input->data(); + args.output.scale_address = float_input->scale; + param->SetFloatInput(float_input); + param->SetFpgaArgs(args); + } else { // Use FPGA + fpga::format_fp16_ofm(out); + fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; + args.input_layout_type = fpga::LAYOUT_HWC; + args.output_layout_type = fpga::LAYOUT_CHW; + args.input_data_type = fpga::DATA_TYPE_FP16; + args.output_data_type = fpga::DATA_TYPE_FP16; + args.image.address = input_ptr; + args.image.height = (uint32_t)input->dims()[1]; + args.image.width = (uint32_t)input->dims()[2]; + args.image.channels = (uint32_t)input->dims()[3]; + args.output.address = out->data(); + args.output.scale_address = out->scale; + args.output.activation.activation_type = fpga::SOFTMAX; + param->SetFpgaArgs(args); } - fpga::format_fp32_ofm(float_input); - fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; - args.input_layout_type = fpga::LAYOUT_HWC; - args.output_layout_type = fpga::LAYOUT_CHW; - args.input_data_type = fpga::DATA_TYPE_FP16; - args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = input_ptr; - args.image.height = - (input->dims().size() == 4) ? (uint32_t)input->dims()[2] : 1; - args.image.width = - (input->dims().size() == 4) ? 
(uint32_t)input->dims()[3] : 1; - args.image.channels = (uint32_t)input->dims()[1]; - args.output.address = float_input->data(); - args.output.scale_address = float_input->scale; - param->SetFloatInput(float_input); - param->SetFpgaArgs(args); return true; } template <> void SoftmaxKernel::Compute(const SoftmaxParam ¶m) { - Tensor *in_x = param.FloatInput(); - Tensor *out = param.Out(); - fpga::PerformBypass(param.FpgaArgs()); - fpga::fpga_invalidate((void *)in_x->data(), // NOLINT - in_x->numel() * sizeof(float)); - // TODO: In general case, 0 should be squeezed before softmax input // NOLINT - math::SoftmaxFuntor()(in_x, out); - fpga::fpga_flush(out->data(), out->memory_size()); + + if (param.FpgaArgs().output.activation.activation_type != fpga::SOFTMAX) { + Tensor *out = param.Out(); + Tensor *in_x = param.FloatInput(); + fpga::fpga_invalidate(in_x->data(), in_x->numel() * sizeof(float)); + math::SoftmaxFuntor()(in_x, out); + fpga::fpga_flush(out->data(), out->memory_size()); + } } } // namespace operators diff --git a/src/operators/kernel/fpga/V1/split_kernel.cpp b/src/operators/kernel/fpga/V1/split_kernel.cpp index b8c0bb3be64d2393b61b0f82375c695000f52b65..584cb41fb30b02c757430bd748d4672cc870b591 100644 --- a/src/operators/kernel/fpga/V1/split_kernel.cpp +++ b/src/operators/kernel/fpga/V1/split_kernel.cpp @@ -20,7 +20,7 @@ namespace paddle_mobile { namespace operators { template <> bool SplitKernel::Init(SplitParam *param) { - auto *in = const_cast(param->InputX()); + auto *in = const_cast(param->InputX()); auto outs = param->Outs(); auto sections = param->Sections(); int axis = param->Axis(); @@ -34,22 +34,32 @@ bool SplitKernel::Init(SplitParam *param) { fpga::fpga_malloc(image_num * sizeof(float *))); auto out_channels = reinterpret_cast( fpga::fpga_malloc(image_num * sizeof(uint32_t))); + DLOG << "input: " << in; for (int i = 0; i < image_num; i++) { fpga::format_fp16_ofm(outs[i]); - images_out[i] = outs[i]->mutable_data(); + DLOG << "output: " << outs[i]; + images_out[i] = outs[i]->mutable_data(); scales_out[i] = outs[i]->scale; out_channels[i] = (uint32_t)sections[i]; } + auto deleter = [](void *p) { fpga::fpga_free(p); }; + fpga::SplitArgs arg = {0}; arg.image_num = image_num; - arg.image_in = (half *)in->data(); + arg.image_in = in->data(); arg.scale_in = in->scale; arg.images_out = images_out; arg.scales_out = scales_out; arg.out_channel_nums = out_channels; arg.height = (uint32_t)in->dims()[2]; arg.width = (uint32_t)in->dims()[3]; + arg.vector_split_space.push_back( + std::shared_ptr(reinterpret_cast(images_out), deleter)); + arg.vector_split_space.push_back( + std::shared_ptr(reinterpret_cast(scales_out), deleter)); + arg.vector_split_space.push_back( + std::shared_ptr(reinterpret_cast(out_channels), deleter)); param->SetFpgaArgs(arg); return true; diff --git a/src/operators/kernel/fpga/V1/tanh_kernel.cpp b/src/operators/kernel/fpga/V1/tanh_kernel.cpp index 216cb726e3fe93e9ebfaf328a9ab4ca0725b6bb1..d7bbc5f0435aaca53be01d6c82d919a2df072ce2 100644 --- a/src/operators/kernel/fpga/V1/tanh_kernel.cpp +++ b/src/operators/kernel/fpga/V1/tanh_kernel.cpp @@ -21,9 +21,11 @@ namespace operators { template <> bool TanhKernel::Init(TanhParam *param) { - auto input = const_cast(param->InputX()); - auto input_ptr = input->data(); - auto float_input = new Tensor; + auto input = const_cast(param->InputX()); + DLOG << "input: " << input; + auto input_ptr = input->data(); + auto float_input = new LoDTensor; + float_input->mutable_data( {1, input->dims()[1], input->dims()[2], 
input->dims()[3]}); fpga::format_fp32_ofm(float_input); diff --git a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp index 48e84707fabb4ccd0618da672b82c5380d9533ba..f74839f1fc06e0b5bf391187f5ecab461f7c00f5 100644 --- a/src/operators/kernel/fpga/V1/transpose2_kernel.cpp +++ b/src/operators/kernel/fpga/V1/transpose2_kernel.cpp @@ -20,7 +20,21 @@ namespace operators { template <> bool Transpose2Kernel::Init(Transpose2Param *param) { - param->Out()->ShareDataWith(*param->InputX()); + auto input = param->InputX(); + auto output = param->Out(); + auto axis = param->Axis(); + auto dim = input->dims(); + output->ShareDataWith(*input); + + auto dim_v = vectorize(dim); + + for (int i = 0; i < axis.size(); i++) { + dim_v[i] = dim[axis[i]]; + } + output->Resize(framework::make_ddim(dim_v)); + + DLOG << "input: " << input; + DLOG << "output: " << output; return true; } diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 29abcf4b61712897e91f245342bbae15b9a27fc6..5eaeb784bd81b21d92a57fde282e7d80bb3f553e 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -1053,7 +1053,7 @@ class SoftmaxParam : public OpParam { GType *FloatInput() const { return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); } - void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } + void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); } const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } #endif @@ -1212,18 +1212,8 @@ class FetchParam : public OpParam { framework::LoDTensorArray *out_; int col_; #ifdef PADDLE_MOBILE_FPGA - - private: - std::shared_ptr float_input_x_; - fpga::BypassArgs fpga_bypass_args; - public: - GType *FloatInput() const { - return float_input_x_ == nullptr ? input_x_ : float_input_x_.get(); - } - void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } - const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } - void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } + fpga::BypassArgs fpga_bypass_args; #endif }; @@ -1660,7 +1650,7 @@ class TanhParam : public OpParam { GType *FloatInput() const { return float_input_x_ == nullptr ? 
input_x_ : float_input_x_.get(); } - void SetFloatInput(Tensor *input) { float_input_x_.reset(input); } + void SetFloatInput(LoDTensor *input) { float_input_x_.reset(input); } const fpga::BypassArgs &FpgaArgs() const { return fpga_bypass_args; } void SetFpgaArgs(const fpga::BypassArgs &args) { fpga_bypass_args = args; } #endif diff --git a/src/operators/reshape2_op.cpp b/src/operators/reshape2_op.cpp index d1623076570d466fc53f885374060c5e744365ed..c0f2a2450d29b2f95edb2ff049cea8280913afc8 100644 --- a/src/operators/reshape2_op.cpp +++ b/src/operators/reshape2_op.cpp @@ -43,5 +43,8 @@ REGISTER_OPERATOR_CPU(reshape2, ops::Reshape2Op); #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(reshape2, ops::Reshape2Op); #endif +#ifdef PADDLE_MOBILE_FPGA +REGISTER_OPERATOR_FPGA(reshape2, ops::Reshape2Op); +#endif #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index f3dffbad1c065561d86da0e976792d206198c61e..fdd7c46fedc98b3f1811cd10ffe6bcec7d0e3a46 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -74,6 +74,9 @@ if (CON GREATER -1) ADD_EXECUTABLE(test-densebox fpga/test_densebox_combine.cpp test_helper.h test_include.h executor_for_test.h) target_link_libraries(test-densebox paddle-mobile) + ADD_EXECUTABLE(test-rfcn fpga/test_rfcn.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-rfcn paddle-mobile) + set(FOUND_MATCH ON) endif () diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index 1a5daafe2b784b98b102fa2eab04f71c67260d9c..723e4ea3e3ff35e0d555703391adcafacccb42f1 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -51,8 +51,8 @@ void convert_to_chw(int16_t **data_in, int channel, int height, int width, } } -void dump(std::string filename, const Tensor input_tensor) { - auto dataptr = input_tensor.data(); +void dump(std::string filename, Tensor input_tensor) { + auto dataptr = reinterpret_cast(input_tensor.get_data()); std::ofstream out(filename.c_str()); float result = 0; for (int i = 0; i < input_tensor.numel(); ++i) { @@ -61,16 +61,16 @@ void dump(std::string filename, const Tensor input_tensor) { } out.close(); } -void dump_stride(std::string filename, const Tensor input_tensor, - const int dumpnum) { +void dump_stride_half(std::string filename, Tensor input_tensor, + const int dumpnum) { int c = (input_tensor.dims())[1]; int h = (input_tensor.dims())[2]; int w = (input_tensor.dims())[3]; - auto data_ptr = input_tensor.data(); - int16_t *data_tmp = (int16_t *)malloc(c * h * w * sizeof(int16_t)); - int16_t *data_ptr_16 = (int16_t *)data_ptr; + auto data_ptr = input_tensor.get_data(); + auto *data_tmp = + reinterpret_cast(malloc(c * h * w * sizeof(int16_t))); + auto *data_ptr_16 = reinterpret_cast(data_ptr); convert_to_chw(&data_ptr_16, c, h, w, data_tmp); - // const int16_t *dataptr = input_tensor.data(); std::ofstream out(filename.c_str()); float result = 0; int stride = input_tensor.numel() / dumpnum; @@ -82,6 +82,20 @@ void dump_stride(std::string filename, const Tensor input_tensor, out.close(); free(data_tmp); } + +void dump_stride_float(std::string filename, Tensor input_tensor, + const int dumpnum) { + auto data_ptr = reinterpret_cast(input_tensor.get_data()); + std::ofstream out(filename.c_str()); + float result = 0; + int stride = input_tensor.numel() / dumpnum; + stride = stride > 0 ? 
stride : 1; + for (int i = 0; i < input_tensor.numel(); i += stride) { + result = data_ptr[i]; + out << result << std::endl; + } + out.close(); +} static const char *g_resnet50 = "../models/resnet50"; const std::string g_image_src_float = "../images/image_src_float"; int main() { @@ -98,24 +112,21 @@ int main() { for (int i = 0; i < 73; i++) { auto tensor_ptr = paddle_mobile.FetchResult(i); std::string saveName = "resnet50_result_" + std::to_string(i); - paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data(), + paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).get_data(), tensor_ptr->numel() * sizeof(half)); - dump_stride(saveName, (*tensor_ptr), 20); + dump_stride_half(saveName, (*tensor_ptr), 20); // dump(saveName, (*tensor_ptr)); } - std::shared_ptr output_tensor = paddle_mobile.FetchResult(73); - //(*output_tensor).dump("resnet50_result_73"); - output_tensor = paddle_mobile.FetchResult(74); - //(*output_tensor).dump("resnet50_result_74"); - // std::shared_ptr output_tensor = paddle_mobile.FetchResult(74); - - // output_tensor = paddle_mobile.FetchResult(74); + auto tensor_ptr = paddle_mobile.FetchResult(73); + dump_stride_float("resnet50_result_73", (*tensor_ptr), 20); + tensor_ptr = paddle_mobile.FetchResult(74); + dump_stride_float("resnet50_result_74", (*tensor_ptr), 9999); float max = 0; - auto data_ptr = output_tensor->data(); + auto data_ptr = tensor_ptr->data(); int maximumIdx = 0; - for (int i = 0; i < (*output_tensor).numel(); i++) { + for (int i = 0; i < (*tensor_ptr).numel(); i++) { if (data_ptr[i] > max) { maximumIdx = i; max = data_ptr[i]; diff --git a/test/fpga/test_rfcn.cpp b/test/fpga/test_rfcn.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e1d13541ef8000da18ceda4c356d158198d7b9f4 --- /dev/null +++ b/test/fpga/test_rfcn.cpp @@ -0,0 +1,62 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +#ifdef PADDLE_MOBILE_FPGA_V1 +#include "fpga/V1/api.h" +#endif +#ifdef PADDLE_MOBILE_FPGA_V2 +#include "fpga/V2/api.h" +#endif + +void readStream(std::string filename, uint8_t *buf) { + std::ifstream in; + in.open(filename, std::ios::in); + if (!in.is_open()) { + std::cout << "open File Failed." 
<< std::endl; + return; + } + int i = 0; + while (!in.eof()) { + in >> buf[i]; + i++; + } + in.close(); +} + +static const char *g_rfcn_combine = "../models/rfcn"; +static const char *g_image_src_float = "../models/rfcn/data.bin"; +int main() { + paddle_mobile::fpga::open_device(); + paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile; + + if (paddle_mobile.Load(std::string(g_rfcn_combine) + "/model", + std::string(g_rfcn_combine) + "/params", true, false, + 1, true)) { + float img_info[3] = {768, 1536, 768.0f / 960.0f}; + auto img = fpga::fpga_malloc(768 * 1536 * 3 * sizeof(float)); + readStream(g_image_src_float, reinterpret_cast<uint8_t *>(img)); + std::vector<void *> v(3, nullptr); + paddle_mobile.FeedData({img_info, img}); + paddle_mobile.Predict_To(-1); + paddle_mobile.GetResults(&v); + DLOG << "Computation done"; + fpga::fpga_free(img); + } + + return 0; +} diff --git a/tools/op.cmake b/tools/op.cmake index d25fce7cff14effbc1264dc46cba6364cee486bf..a7d79a71d1e67ac4cb2735c6463c538bfc58202a 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -126,6 +126,11 @@ if (CON GREATER -1) set(RESHAPE_OP ON) set(FUSION_CONVADDBNRELU_OP ON) set(FUSION_CONVADDBN_OP ON) + set(RESHAPE2_OP ON) + set(PSROI_POOL_OP ON) + set(PROPOSAL_OP ON) + set(ANCHOR_GENERATOR_OP ON) + set(SLICE_OP ON) set(FOUND_MATCH ON) endif()