diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index 47acd275fa644f7c6d51c34a547c814531fd88c5..0b3c1f8132aaee1d91a107ff954540e668927e9b 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -181,10 +181,12 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) { return 0; } +int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } + void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); auto channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = image_tensor->mutable_data(); + auto data_ptr = image_tensor->data(); size_t memory_size = channel * height * width * sizeof(float); float *new_data = (float *)fpga_malloc(memory_size); fpga_copy(new_data, data_ptr, memory_size); @@ -192,7 +194,7 @@ void format_image(framework::Tensor *image_tensor) { image_tensor->reset_data_ptr(new_data); } -void format_ofm(framework::Tensor *ofm_tensor) { +void format_fp16_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); size_t memory_size = 0; if (dims.size() == 4) { @@ -209,6 +211,23 @@ void format_ofm(framework::Tensor *ofm_tensor) { ofm_tensor->reset_data_ptr(p); } +void format_fp32_ofm(framework::Tensor *ofm_tensor) { + auto dims = ofm_tensor->dims(); + size_t memory_size = 0; + if (dims.size() == 4) { + auto channel = dims[1], height = dims[2], width = dims[3]; + memory_size = + height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float); + } else if (dims.size() == 2) { + memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float); + } else { + DLOG << "Wrong ofm dimension"; + } + auto p = fpga_malloc(memory_size); + memset(p, 0, memory_size); + ofm_tensor->reset_data_ptr(p); +} + float filter_find_max(framework::Tensor *filter_tensor) { auto filter_ptr = filter_tensor->data(); return filter::find_max(filter_ptr, filter_tensor->numel()); @@ -242,7 +261,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value, int group_num) { auto dims = 
filter_tensor->dims(); auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->mutable_data(); + auto data_ptr = filter_tensor->data(); size_t memory_size = num * channel * height * width * sizeof(float); auto new_data = (float *)fpga_malloc(memory_size); fpga_copy(new_data, data_ptr, memory_size); @@ -277,7 +296,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, int padding_h, int padding_w, float *bs_ptr) { auto input_ptr = input->data(); auto filter_ptr = filter->data(); - auto out_ptr = out->mutable_data(); + auto out_ptr = out->data(); arg->group_num = (uint32_t)group_num; arg->split_num = (uint32_t)fpga::get_plit_num(filter); @@ -300,7 +319,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t)); arg->concat_arg.image_out = out_ptr; - const int channel = (int)out->dims()[1]; + auto channel = (int)out->dims()[1]; int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num); int element_num = fpga::get_aligned_filter_element_num( filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); diff --git a/src/fpga/api.h b/src/fpga/api.h index 9d17e05d6cbfeeb8abac1e06c731510fed2ee65d..83eca9e2688d7076d33818d992cafb5a4c05df01 100644 --- a/src/fpga/api.h +++ b/src/fpga/api.h @@ -206,8 +206,10 @@ int ComputeFPGAConcat(const struct ConcatArgs& args); static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } +int get_align_image_cw(int cw); void format_image(framework::Tensor* image_tensor); -void format_ofm(framework::Tensor* ofm_tensor); // only allocate memory +void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory +void format_fp32_ofm(framework::Tensor* ofm_tensor); float filter_find_max(framework::Tensor* filter_tensor); int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h 
index 7cfdaa56aedcfdafb0e0de5e7fe9d1897a5794d4..b0585d5e8377fbd4a9bef46a9637c608b3ca4e37 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -45,7 +45,7 @@ class FeedOp : public framework::OperatorBase { void Init() { Tensor *output = param_.Out(); - fpga::format_ofm(output); + fpga::format_fp16_ofm(output); } void RunImpl() const { @@ -53,7 +53,7 @@ class FeedOp : public framework::OperatorBase { auto input_ptr = input->data(); fpga::format_image(input); Tensor *output = param_.Out(); - auto output_ptr = output->mutable_data(); + auto output_ptr = output->data(); fpga::BypassArgs args; @@ -62,9 +62,9 @@ class FeedOp : public framework::OperatorBase { args.input_layout_type = fpga::LAYOUT_CHW; args.output_layout_type = fpga::LAYOUT_HWC; args.image.address = (void *)input_ptr; - args.image.channels = input->dims()[1]; - args.image.height = input->dims()[2]; - args.image.width = input->dims()[3]; + args.image.channels = (uint32_t)input->dims()[1]; + args.image.height = (uint32_t)input->dims()[2]; + args.image.width = (uint32_t)input->dims()[3]; args.image.pad_height = 0; args.image.pad_width = 0; args.output.address = output_ptr; diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp index 58d1717dce08fc9065449a657d68c4c3756c300f..21a03bcc3aca243ce3f66bcda6119b63a742560a 100644 --- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp @@ -64,7 +64,7 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { int element_num_per_div = fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); + fpga::format_fp16_ofm(out); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp index 
00bfa9101b6d5c464fb6603d8fde13ce2885a630..749e61f45d2865b7dd87be44339a4336a987f636 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -62,7 +62,7 @@ bool ConvAddBNReluKernel::Init( fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); + fpga::format_fp16_ofm(out); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp index 71f0420b6ab264fd893c7e818e3cf9ac0f9341e5..2570b80857d8b1d0c98828e0197ffe37afcf749f 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -44,7 +44,7 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); + fpga::format_fp16_ofm(out); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp index 007561911231cfe25c199c0a9bd7238c58dc85e8..34954fd6d4a573321ef34b5c09567d90b4fc9022 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -15,7 +15,6 @@ limitations under the License. 
*/ #ifdef FUSION_CONVBN_OP #include "operators/kernel/conv_bn_kernel.h" -#include "fpga/api.h" namespace paddle_mobile { namespace operators { @@ -33,10 +32,8 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { const float epsilon = param->Epsilon(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], "Output channel should be equal to bias number"); - const int channel = out->dims()[1]; - auto bs_ptr = - reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); auto new_scale = new Tensor(); auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); @@ -59,7 +56,7 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); + fpga::format_fp16_ofm(out); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp index 4c62888b95c08b0198255124ebeff5a265274871..04d6892e3f4e526a5baa13cd86f8b2a4fe1de176 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -56,7 +56,7 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { fpga::get_filter_num_per_div(filter, param->Groups()); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); + fpga::format_fp16_ofm(out); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled, diff --git a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp index 9840f495e89a3e63990bf5f10c65cf4afe8d0854..a3314c1b2c2c3a3a79e582fe4c79d34f6eb5b47c 100644 --- 
a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp @@ -27,7 +27,7 @@ bool ElementwiseAddReluKernel::Init( auto *out = param->Out(); auto input_x_ptr = input_x->data(); auto input_y_ptr = input_y->data(); - fpga::format_ofm(out); + fpga::format_fp16_ofm(out); auto out_ptr = out->mutable_data(); fpga::EWAddArgs ewaddArgs; diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 5b4c95af9f8844ac2242887d9d0233ab1b83460d..64aa255ff2308c91a71c6e6f018d2ba435f243df 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -49,7 +49,7 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); + fpga::format_fp16_ofm(out); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index 5681fcc7a7108bce971d0aa82733dbc7595e29cc..5930f3d4115a469a9e5515b007be090de7d0219c 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -50,7 +50,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { int element_num_per_div = fpga::get_filter_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); + fpga::format_fp16_ofm(out); fpga::WrapperConvArgs conv_arg; fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0, diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/pool_kernel.cpp index 82cb88b1d7c141ab94563e74a693119b328920fc..39c3977ce138c172b990289dd34986d7c649519e 100644 --- 
a/src/operators/kernel/fpga/pool_kernel.cpp +++ b/src/operators/kernel/fpga/pool_kernel.cpp @@ -24,7 +24,7 @@ bool PoolKernel::Init(PoolParam *param) { auto *input = const_cast(param->Input()); auto input_ptr = input->data(); Tensor *output = param->Output(); - fpga::format_ofm(output); + fpga::format_fp16_ofm(output); auto output_ptr = output->mutable_data(); vector ksize = param->Ksize(); vector strides = param->Strides(); diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp index fd84cb8e46c974d23816c0dd4c99a545d996c409..0bc874c570248533447521d746dc653fc0e17114 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/softmax_kernel.cpp @@ -24,22 +24,23 @@ namespace operators { template <> bool SoftmaxKernel::Init(SoftmaxParam *param) { - const Tensor *input = param->InputX(); + auto input = const_cast(param->InputX()); auto input_ptr = input->data(); - auto output_ptr = param->Out(); - Tensor *floatInput = new Tensor(*input); + auto float_input = new Tensor(*input); + fpga::format_fp32_ofm(float_input); + fpga::BypassArgs args; args.input_layout_type = fpga::LAYOUT_HWC; args.output_layout_type = fpga::LAYOUT_CHW; args.input_data_type = fpga::DATA_TYPE_FP16; args.output_data_type = fpga::DATA_TYPE_FP32; - args.image.address = (void *)(input_ptr); - args.image.height = (uint32_t)input->dims()[0]; - args.image.width = (uint32_t)input->dims()[1]; - args.image.channels = 1; - args.output.address = (void *)floatInput->mutable_data(); + args.image.address = input_ptr; + args.image.height = 1; + args.image.width = 1; + args.image.channels = (uint32_t)input->dims()[1]; + args.output.address = float_input->mutable_data(); - param->SetFloatInput(floatInput); + param->SetFloatInput(float_input); param->SetFpgaArgs(args); return true; } @@ -47,17 +48,16 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { template <> void SoftmaxKernel::Compute( const SoftmaxParam &param) const { - DLOG << 
"======================================= FPGA SoftMAX " - "==============================================="; - const Tensor *in_x = param.FloatInput(); + Tensor *in_x = param.FloatInput(); Tensor *out = param.Out(); - fpga::fpga_flush((void *)in_x->data(), in_x->memory_size()); + fpga::PerformBypass(param.FpgaArgs()); - fpga::fpga_invalidate(out->data(), out->memory_size()); + fpga::fpga_invalidate( + (void *)in_x->data(), + (size_t)fpga::get_align_image_cw((int)in_x->dims()[1]) * sizeof(float)); - auto x_dims = in_x->dims(); - out->Resize(x_dims); math::SoftmaxFuntor()(in_x, out); + fpga::fpga_flush(out->data(), out->memory_size()); } } // namespace operators diff --git a/test/fpga/test_format_data.cpp b/test/fpga/test_format_data.cpp index a7b2e393ceae3cc55e4c453a5e4738e7ebff6883..1d67c3110ff86dc6fba2d49412edb70ab1c9c16d 100644 --- a/test/fpga/test_format_data.cpp +++ b/test/fpga/test_format_data.cpp @@ -71,7 +71,7 @@ void test_fill_conv_arg() { fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, 1001); DLOG << "format ofm"; - fpga::format_ofm(&out); + fpga::format_fp16_ofm(&out); DLOG << "Build arg"; fpga::WrapperConvArgs arg;