diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 878544d07191cd7fdb24590b4c14335656b89813..4caf588a7fede0fb1f2798cce3e10af34449f6c5 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -52,6 +52,22 @@ void format_fp16_ofm(framework::Tensor *ofm_tensor) { ofm_tensor->reset_data_ptr(p); } +void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) { + // auto dims = ofm_tensor->dims(); + size_t memory_size = 0; + if (dims.size() == 4) { + auto channel = dims[1], height = dims[2], width = dims[3]; + memory_size = + height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); + } else if (dims.size() == 2) { + memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); +} void format_fp32_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); size_t memory_size = 0; @@ -211,8 +227,9 @@ void expand_conv_arg(ConvArgs *arg) { align_to_x(args.kernel.height * args.kernel.width * channel_per_group, FILTER_ELEMENT_ALIGNMENT); - auto output_amount_per_row = - align_to_x(output_width * args.filter_num, IMAGE_ALIGNMENT); + auto output_amount_per_row = align_to_x( + (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num, + IMAGE_ALIGNMENT); // find the opt partition strategy uint64_t res_win; @@ -243,7 +260,8 @@ void expand_conv_arg(ConvArgs *arg) { auto block_len = res_fit; auto block_last = output_width - res_fit * (block_num - 1); - auto res_amount_per_row = output_width * args.filter_num; + auto res_amount_per_row = + (output_width - (args.deconv_tx_param.omit_size) * 2) * args.filter_num; auto res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; auto image_block_amount_per_row = @@ -282,10 +300,14 @@ void expand_conv_arg(ConvArgs *arg) { : 0; auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; + auto deconv_param = ((args.deconv_tx_param.deconv_en) << 24) | + ((args.deconv_tx_param.sub_conv_num) << 16) | + ((args.deconv_tx_param.omit_size) << 0); (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); (*arg).driver.filter_address_phy = vaddr_to_paddr(args.filter_address); - (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address); + (*arg).driver.output_address_phy = vaddr_to_paddr(args.output.address) + + args.deconv_tx_param.out_addr_offset; (*arg).driver.output_height = output_height; (*arg).driver.output_width = output_width; (*arg).driver.filter_per_group = filter_per_group; @@ -309,6 +331,7 @@ void expand_conv_arg(ConvArgs *arg) { (*arg).driver.post_prog_full_cnt = post_prog_full_cnt; (*arg).driver.fpga_bias_scale_len = fpga_bias_scale_len; (*arg).driver.cmd = cmd; + (*arg).driver.deconv_param = deconv_param; } // expand_conv_arg() void expand_EW_arg(EWAddArgs *arg) { @@ -357,6 +380,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->conv_arg = (ConvArgs *)fpga_malloc(arg->split_num * sizeof(ConvArgs)); // NOLINT + memset(arg->conv_arg, 0, arg->split_num * sizeof(struct ConvArgs)); + arg->concat_arg.image_num = arg->split_num; arg->concat_arg.image_out = out_ptr; arg->concat_arg.scale_out = out->scale; @@ -444,20 +469,19 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, float *bs_ptr) { auto input_ptr = input->data(); auto filter_ptr = filter->data(); - auto out_ptr = out->data(); arg->group_num = (uint32_t)group_num; arg->sub_conv_num = (uint32_t)stride_h; arg->filter_num = (uint32_t)filter->dims()[0]; - int sub_conv_num = arg->sub_conv_num; + uint32_t sub_conv_num = arg->sub_conv_num; int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], padding_w, stride_w); - int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis( + auto sub_filter_width = (uint32_t)deconv_filter::deconv_get_sub_filter_axis( (int)filter->dims()[3], stride_w); - int sub_output_width = deconv_filter::deconv_get_sub_out_axis( + auto sub_output_width = (uint32_t)deconv_filter::deconv_get_sub_out_axis( (int)input->dims()[3], sub_pad, sub_filter_width); - int sub_output_height = deconv_filter::deconv_get_sub_out_axis( + auto sub_output_height = (uint32_t)deconv_filter::deconv_get_sub_out_axis( (int)input->dims()[2], sub_pad, sub_filter_width); arg->sub_output_width = (uint32_t)sub_output_width; @@ -465,28 +489,25 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit( stride_w, (int)filter->dims()[3], padding_w); - arg->output.address = out_ptr; - arg->output.scale_address = out->scale; - - int sub_channels = (int)input->dims()[1]; - int omit_size = arg->omit_size; + auto sub_channels = (int)input->dims()[1]; + uint32_t omit_size = arg->omit_size; int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; - int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; int sub_filter_num = sub_conv_num * (arg->filter_num); - int conv_output_size = + framework::DDim dims_out_new = framework::make_ddim( + {1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width}); + fpga::format_fp16_ofm(out, dims_out_new); + auto out_ptr = out->data(); + arg->output.address = + (half *)out_ptr + + omit_size * sizeof(half) * + (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); + arg->output.scale_address = out->scale; + + uint32_t conv_output_size = (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) * sub_output_height; - int ouput_size = conv_output_size * sub_conv_num; - - int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT); - int align_sub_filter_count = - align_to_x(sub_filter_width * sub_filter_width * sub_channels, - FILTER_ELEMENT_ALIGNMENT); - int align_conv_sub_filter_count = - align_sub_filter_count * align_sub_filter_num; - - int split_num = + uint32_t split_num = group_num == 1 ? (uint32_t)get_deconv_plit_num(filter, sub_conv_num) : 1; arg->split_conv_args = @@ -508,14 +529,10 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, (float **)fpga_malloc(split_num * sizeof(float *)); arg->split_conv_args[i].concat_arg.channel_num = (uint32_t *)fpga_malloc(split_num * sizeof(uint32_t)); - // arg->split_conv_args[i].concat_arg.image_out = - // fpga_malloc(conv_output_size * sizeof(half)); - // arg->split_conv_args[i].concat_arg.scale_out = fpga_malloc(2 * - // sizeof(float)); } - int filter_num_per_div = - get_deconv_filter_num_per_div(filter, group_num, stride_w); + auto filter_num_per_div = + (uint32_t)get_deconv_filter_num_per_div(filter, group_num, stride_w); int element_num = get_aligned_filter_element_num( (int)(sub_channels * sub_filter_width * sub_filter_width)); @@ -533,14 +550,21 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, align_to_x(residual, FILTER_NUM_ALIGNMENT); int filter_sub_conv_offset = element_num * num_after_alignment; + uint32_t out_addr_offset = 0; for (int i = 0; i < sub_conv_num; ++i) { if (sub_conv_num == 1) { arg->split_conv_args[i].output.address = arg->output.address; arg->split_conv_args[i].output.scale_address = arg->output.scale_address; + out_addr_offset = 0; } else { - auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); - arg->split_conv_args[i].output.address = (void *)((half *)ptr_output); + auto ptr_output = (half *)out_ptr; + out_addr_offset = + sizeof(half) * (sub_conv_num - 1 - i) * + (align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT)); + + arg->split_conv_args[i].output.address = (void *)(ptr_output); + auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); arg->split_conv_args[i].output.scale_address = ptr_output_scale; } @@ -556,6 +580,13 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->split_conv_args[i].conv_arg[j].kernel.stride_w = 1; arg->split_conv_args[i].conv_arg[j].kernel.stride_h = 1; + arg->split_conv_args[i].conv_arg[j].deconv_tx_param.deconv_en = 1; + arg->split_conv_args[i].conv_arg[j].deconv_tx_param.sub_conv_num = + sub_conv_num; + arg->split_conv_args[i].conv_arg[j].deconv_tx_param.omit_size = omit_size; + arg->split_conv_args[i].conv_arg[j].deconv_tx_param.out_addr_offset = + out_addr_offset; + arg->split_conv_args[i].conv_arg[j].image.scale_address = input->scale; arg->split_conv_args[i].conv_arg[j].image.channels = (uint32_t)sub_channels; @@ -568,10 +599,10 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, arg->split_conv_args[i].conv_arg[j].image.address = input_ptr; arg->split_conv_args[i].conv_arg[j].filter_scale_address = filter->scale; - arg->split_conv_args[i].conv_arg[j].filter_num = (uint32_t)( - j == split_num - 1 - ? sub_filter_num - (split_num - 1) * filter_num_per_div // NOLINT - : filter_num_per_div); + arg->split_conv_args[i].conv_arg[j].filter_num = + (uint32_t)(j == split_num - 1 + ? sub_filter_num - (split_num - 1) * filter_num_per_div + : filter_num_per_div); size_t filter_size = element_num * @@ -588,19 +619,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, fpga_flush(arg->split_conv_args[i].conv_arg[j].filter_address, filter_size); - { - static int test_cnt = 0; - signed char result = 0; - if (test_cnt <= 1) { - std::string filename = "deconv_split_flt" + std::to_string(test_cnt); - - fpga::savefile( - filename, arg->split_conv_args[i].conv_arg[j].filter_address, - filter_size, result); - test_cnt++; - } - } - size_t bs_align_num = align_to_x( arg->split_conv_args[i].conv_arg[j].filter_num, BS_NUM_ALIGNMENT); size_t bs_size = 2 * bs_align_num * sizeof(float); diff --git a/src/fpga/V1/api.h b/src/fpga/V1/api.h index f6ce24c4edf0d81c0cb6a9bfa65b02ea6eeb813a..4cbba4c59a3bb8e4b02ad69c6331b30ad2f22a50 100644 --- a/src/fpga/V1/api.h +++ b/src/fpga/V1/api.h @@ -23,6 +23,7 @@ namespace fpga { void format_image(framework::Tensor* image_tensor); void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory +void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims); void format_fp32_ofm(framework::Tensor* ofm_tensor); float filter_find_max(framework::Tensor* filter_tensor); diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index f58e3a5bfde2f66b26d26c529ade229f79d5d05a..d70d52d053aea05649e340a768736077311f0f5c 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -260,6 +260,7 @@ int ComputeBasicConv(const struct ConvArgs &args) { reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8); reg_writeq(args.driver.prog_full_cnt, 0xd08); reg_writeq(args.driver.post_prog_full_cnt, 0xd10); + reg_writeq(args.driver.deconv_param, 0xd18); reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20); reg_writeq(args.driver.cmd, REG_CONV_CMD); DLOG << "before reg poll"; diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index eabe03c29f2f8a6805b63131e767de9e86d7c055..bb446dda1a99cee44f48d45f02e25b1ef8b6d6a3 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -105,6 +105,8 @@ struct ConvDriverParam { uint64_t post_prog_full_cnt; uint64_t fpga_bias_scale_len; uint64_t cmd; + + uint64_t deconv_param; }; struct EWAddDriverParam { @@ -117,6 +119,13 @@ struct EWAddDriverParam { uint64_t coefficient; uint64_t cmd; }; + +struct DeconvTxParm { + uint32_t omit_size; + uint32_t sub_conv_num; + uint32_t deconv_en; + uint32_t out_addr_offset; +}; #endif struct ConvArgs { @@ -136,6 +145,7 @@ struct ConvArgs { #endif #ifdef PADDLE_MOBILE_FPGA_V1 + struct DeconvTxParm deconv_tx_param; struct ConvDriverParam driver; #endif }; diff --git a/src/framework/executor.cpp b/src/framework/executor.cpp index 884f5200f2bd9ec1b86429b4d37c3e58ea16724e..e82006be05e430fa46bd2ea8c372237ab9630f38 100644 --- a/src/framework/executor.cpp +++ b/src/framework/executor.cpp @@ -230,6 +230,10 @@ template bool Executor::varInputMemory( const std::shared_ptr &var_desc, Variable *var, LoDTensor *tensor) const { +#ifdef PADDLE_MOBILE_FPGA + tensor->init(typeid(float)); + return true; +#endif auto type = var_desc->Tensor_desc().DataType(); switch (type) { case VARTYPE_TYPE_FP32: diff --git a/src/framework/tensor.h b/src/framework/tensor.h index cbdfe6425e9e535ca83403c26fe5e142d347d0c7..afbba4d801e5d5dce2ba2edb1fd78c06ce66029e 100644 --- a/src/framework/tensor.h +++ b/src/framework/tensor.h @@ -202,6 +202,21 @@ class Tensor : public TensorBase { inline void reset_data_ptr(void *p) { ((PlaceholderImpl *)(holder_.get()))->ptr_.reset((uint8_t *)p); // NOLINT } + + inline void *init(std::type_index type) { + if (holder_ != nullptr) { + holder_->set_type(type); + } + PADDLE_MOBILE_ENFORCE(numel() >= 0, "the Tensor's numel must >=0.") + int64_t size = 1 * SizeOfType(type); + if (holder_ == nullptr || holder_->size() < size + offset_) { + holder_.reset(new PlaceholderImpl(size, type)); + offset_ = 0; + } + return reinterpret_cast( + reinterpret_cast(holder_->ptr()) + offset_); + } + float scale[2]; // scale[0]= MAX/127.0, scale[1]= 127.0/MAX #endif }; diff --git a/src/framework/tensor_base.h b/src/framework/tensor_base.h index e1539d2e681973b39eeca5b30e2ed35b535be8cb..b41d7786c15222b3133d02b820cf4e089b19c1d3 100644 --- a/src/framework/tensor_base.h +++ b/src/framework/tensor_base.h @@ -91,6 +91,9 @@ class TensorBase { } inline void check_memory_size() const { +#ifdef PADDLE_MOBILE_FPGA + return; +#endif PADDLE_MOBILE_ENFORCE( holder_ != nullptr, "Tensor holds no memory. Call Tensor::mutable_data first."); diff --git a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp index 18435d29ab64182ae26721374967fb7c0774437d..7523ef59d14d7096360d06411685c54564faf29a 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_kernel.cpp @@ -57,12 +57,9 @@ bool DeconvAddKernel::Init(FusionDeconvAddParam *param) { int element_num_per_div = fpga::get_deconv_filter_num_per_div(filter, param->Groups(), sub_conv_n); - // fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel * sub_conv_n); - fpga::format_fp16_ofm(out); - fpga::DeconvArgs deconv_arg = {0}; fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0], diff --git a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp index 92f3addd821a36974de1bab9226d152329259ccd..4cc0bed75dc8ab0e968bd97f214a035ac22aac7e 100644 --- a/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/V1/deconv_add_relu_kernel.cpp @@ -61,8 +61,6 @@ bool DeconvAddReluKernel::Init( fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel * sub_conv_n); - fpga::format_fp16_ofm(out); - fpga::DeconvArgs deconv_arg = {0}; fpga::fill_deconv_arg(&deconv_arg, input, out, filter, relu_enabled, param->Groups(), param->Strides()[0],