diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index 138906c790574a4a0201180b5d18cd67960a7e1d..725895ae6a3da161af545646c2a74bda16be532f 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -22,7 +22,7 @@ limitations under the License. */ #include "fpga/filter.h" #include "fpga/image.h" #define FPGA_TEST_MODE -#define PADDLE_MOBILE_OS_LINUX +//#define PADDLE_MOBILE_OS_LINUX namespace paddle_mobile { namespace fpga { @@ -125,6 +125,7 @@ float fp16_2_fp32(half fp16_num) { } int ComputeBasicConv(const struct ConvArgs &args) { +#ifdef FPGA_TEST_MODE DLOG << "======Compute Basic Conv======"; DLOG << " relu_enabled:" << args.relu_enabled << " sb_address:" << args.sb_address @@ -144,7 +145,7 @@ int ComputeBasicConv(const struct ConvArgs &args) { << " stride_w:" << args.kernel.stride_w; DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; - +#endif return do_ioctl(IOCTL_CONFIG_CONV, &args); } @@ -192,8 +193,9 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { int ComputeFpgaEWAdd(const struct EWAddArgs &args) { #ifdef FPGA_TEST_MODE DLOG << "=============ComputeFpgaEWAdd==========="; - DLOG << " relu_enabled:" << args.relu_enabled << " const0:" << args.const0 - << " const1:" << args.const1; + DLOG << " relu_enabled:" << args.relu_enabled + << " const0:" << fp16_2_fp32(short(args.const0)) + << " const1:" << fp16_2_fp32(short(args.const1)); DLOG << " image0_address:" << args.image0.address << " image0_scale_address:" << args.image0.scale_address << " image0_channels:" << args.image0.channels @@ -401,8 +403,8 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, arg->concat_arg.image_num = arg->split_num; arg->concat_arg.image_out = out_ptr; arg->concat_arg.scale_out = out->scale; - arg->concat_arg.height = (uint32_t)filter->dims()[2]; - arg->concat_arg.width = (uint32_t)filter->dims()[3]; + arg->concat_arg.height = (uint32_t)out->dims()[2]; + arg->concat_arg.width = (uint32_t)out->dims()[3]; int n = arg->split_num; arg->concat_arg.images_in = @@ -411,7 +413,6 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, (float **)fpga_malloc(n * sizeof(float *)); // NOLINT arg->concat_arg.channel_num = (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT - arg->concat_arg.image_out = out_ptr; auto channel = (int)out->dims()[1]; // NOLINT int filter_num_per_div = get_filter_num_per_div(filter, group_num); diff --git a/src/fpga/bias_scale.cpp b/src/fpga/bias_scale.cpp index 50f1ed03f0121b5afdc41d427e5b52675994bd1e..23889d5b1fee3d8cb9e4673f42b18574366411eb 100644 --- a/src/fpga/bias_scale.cpp +++ b/src/fpga/bias_scale.cpp @@ -27,6 +27,9 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; int num_per_div_after_alignment = align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); + if (num_per_div_before_alignment == num_per_div_after_alignment) { + return; + } int num_element = 2 * div_num * num_per_div_after_alignment; // including bias & scale float *ptr_aligned = diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp index 34e0ad6f18f8e80d636e42630e03650c018a8825..c824b446ce3a4c3f13ad788780997a3920a1484c 100644 --- a/src/fpga/filter.cpp +++ b/src/fpga/filter.cpp @@ -210,12 +210,12 @@ void format_filter(float **data_in, int num, int channel, int height, int width, align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * div_num; - + int residual = num % num_per_div_before_alignment; + int num_after_alignment = num_per_div_after_alignment * + ((residual == 0) ? div_num : (div_num - 1)) + + align_to_x(residual, FILTER_NUM_ALIGNMENT); quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_to_hwc(quantize_data, num, channel, height, width); align_element(quantize_data, num, chw); align_num(quantize_data, num_per_div_before_alignment, num, chw); diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 904dd8a1da9e67d0c1283806e766d3a25dc27309..7c7bceaaee82617122da9c0fd2a5fa6b688f1153 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -44,6 +44,7 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { int width = (uint32_t)input_x->dims()[3]; int filter_channel = chw / height / width; + out->Resize(framework::make_ddim({1, channel, 1, 1})); filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); fpga::format_fc_filter(filter, max_value); diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index 46dae1b2a076add9f17e4e5bc6d3a99ad583fb50..d543e1ea46bea09ee7331d03760633ee240454d5 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -45,6 +45,7 @@ bool FusionFcKernel::Init(FusionFcParam *param) { int width = (uint32_t)input_x->dims()[3]; int filter_channel = chw / height / width; + out->Resize(framework::make_ddim({1, channel, 1, 1})); filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); fpga::format_fc_filter(filter, max_value); diff --git a/src/operators/kernel/fpga/mul_kernel.cpp b/src/operators/kernel/fpga/mul_kernel.cpp index 07aa4bcc43d28805ab0660bf89149c5ec5f1c732..9e282bd27b744cb48fccdc8e4602ae2fc9a1ad79 100644 --- a/src/operators/kernel/fpga/mul_kernel.cpp +++ b/src/operators/kernel/fpga/mul_kernel.cpp @@ -44,6 +44,7 @@ bool MulKernel::Init(MulParam *param) { int width = (uint32_t)input_x->dims()[3]; int filter_channel = chw / height / width; + out->Resize(framework::make_ddim({1, channel, 1, 1})); filter->Resize(framework::make_ddim({num, filter_channel, height, width})); float max_value = fpga::filter_find_max(filter); fpga::format_fc_filter(filter, max_value); diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp index dba555708f505eb9bdf81d6f4487227c88f0a616..e36db57f4b4f18712df50b2b132cdd1032a41921 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/softmax_kernel.cpp @@ -27,7 +27,7 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { auto input = const_cast(param->InputX()); auto input_ptr = input->data(); auto float_input = new Tensor; - float_input->mutable_data(input->dims()); + float_input->mutable_data({1, input->dims()[1]}); fpga::format_fp32_ofm(float_input); fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; @@ -56,7 +56,6 @@ void SoftmaxKernel::Compute( fpga::fpga_invalidate( (void *)in_x->data(), // NOLINT fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float)); - math::SoftmaxFuntor()(in_x, out); fpga::fpga_flush(out->data(), out->memory_size()); }