diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp
index 6acddc893f8f6340149170cb8803010a3c54454e..d6994dc7443150ea2c8e4d05499a3b2fac3db579 100644
--- a/src/fpga/api.cpp
+++ b/src/fpga/api.cpp
@@ -14,11 +14,9 @@ limitations under the License. */
 
 #include "api.h"
 #include
-#include
-#include
 #include
 #include
-#include
+#include
 #include "bias_scale.h"
 #include "filter.h"
 #include "image.h"
@@ -48,6 +46,7 @@ int open_device() {
 // memory management;
 void *fpga_malloc(size_t size) {
+  DLOG << size << " bytes allocated";
 #ifdef PADDLE_MOBILE_OS_LINUX
   return reinterpret_cast<void *>(
       mmap64(NULL, size, PROT_READ | PROT_WRITE, MAP_SHARED, fd, 0));
@@ -181,10 +180,19 @@ void format_image(framework::Tensor *image_tensor) {
 
 void format_ofm(framework::Tensor *ofm_tensor) {
   auto dims = ofm_tensor->dims();
-  auto channel = dims[1], height = dims[2], width = dims[3];
-  size_t memory_size =
-      height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
-  ofm_tensor->reset_data_ptr(fpga_malloc(memory_size));
+  size_t memory_size = 0;
+  if (dims.size() == 4) {
+    auto channel = dims[1], height = dims[2], width = dims[3];
+    memory_size =
+        height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
+  } else if (dims.size() == 2) {
+    memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
+  } else {
+    DLOG << "Wrong ofm dimension";
+  }
+  auto p = fpga_malloc(memory_size);
+  memset(p, 0, memory_size);
+  ofm_tensor->reset_data_ptr(p);
 }
 
 float filter_find_max(framework::Tensor *filter_tensor) {
@@ -200,7 +208,7 @@ int get_plit_num(framework::Tensor *filter_tensor) {
   return filter::calc_split_num(num, div_capacity);
 }
 
-int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) {
+int get_filter_num_per_div(framework::Tensor *filter_tensor, int group_num) {
   auto dims = filter_tensor->dims();
   auto chw = dims[1] * dims[2] * dims[3];
   auto num = dims[0];
@@ -279,7 +287,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
   arg->concat_arg.image_out = out_ptr;
 
   const int channel = (int)out->dims()[1];
-  int element_num_per_div = fpga::get_element_num_per_div(filter, group_num);
+  int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num);
   int element_num = fpga::get_aligned_filter_element_num(
       filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
 
@@ -297,12 +305,14 @@
     arg->conv_args[i].image.scale_address = input->scale;
     arg->conv_args[i].image.pad_height = (uint32_t)padding_h;
     arg->conv_args[i].image.pad_width = (uint32_t)padding_w;
-    arg->conv_args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
-    arg->conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
+    arg->conv_args[i].filter_scale_address = filter->scale;
+    arg->conv_args[i].filter_address =
+        &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
+    arg->conv_args[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
     arg->conv_args[i].filter_num =
         (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
-                                    channel - (n - 1) * element_num_per_div)
-                              : element_num_per_div);
+                                    channel - (n - 1) * filter_num_per_div)
+                              : filter_num_per_div);
 
     if (n > 1) {
       arg->conv_args[i].output.scale_address =
diff --git a/src/fpga/api.h b/src/fpga/api.h
index 4dfd80f318f07e75644c21e50d6b5e691908245a..ea50c54b7f6da746aea24f13c66324aea1d7f5b3 100644
--- a/src/fpga/api.h
+++ b/src/fpga/api.h
@@ -74,6 +74,7 @@ struct ConvArgs {
   bool relu_enabled;
   void* sb_address;  // scale and bias are interlaced;
   void* filter_address;
+  float* filter_scale_address;
   uint32_t filter_num;
   uint32_t group_num;
 
@@ -200,7 +201,7 @@ void format_image(framework::Tensor* image_tensor);
 void format_ofm(framework::Tensor* ofm_tensor);  // only allocate memory
 
 float filter_find_max(framework::Tensor* filter_tensor);
-int get_element_num_per_div(framework::Tensor* filter_tensor, int group_num);
+int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num);
 int get_plit_num(framework::Tensor* filter_tensor);
 int get_aligned_filter_element_num(int chw);
 int get_aligned_filter_num(int num);
diff --git a/src/fpga/filter.cpp b/src/fpga/filter.cpp
index 5f1a16d2339f3859f4cd85408c965d8d2634a55f..f3672182959ff21bff5c4cb264bc53ab6de53738 100644
--- a/src/fpga/filter.cpp
+++ b/src/fpga/filter.cpp
@@ -101,7 +101,6 @@ void align_element(char **data_in, int num, int chw) {
   int j = 0;
   int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
   if (align_chw != chw) {
-    printf("align %d \n", align_chw);
     char *tmp = *data_in;
     char *data_tmp = (char *)fpga_malloc(num * align_chw * sizeof(char));
 
diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
index 84b9d6b0ddd9a1577ee37d095cabed2a8a2fe5a2..58d1717dce08fc9065449a657d68c4c3756c300f 100644
--- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp
@@ -23,7 +23,7 @@ template <>
 bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
   bool relu_enabled = false;
   auto input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
+
   auto bias = param->Bias();
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<Tensor *>(param->Filter());
@@ -62,7 +62,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam *param) {
   fpga::format_filter(filter, max_value, param->Groups());
 
   int element_num_per_div =
-      fpga::get_element_num_per_div(filter, param->Groups());
+      fpga::get_filter_num_per_div(filter, param->Groups());
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
   fpga::format_ofm(out);
@@ -80,7 +80,6 @@ void ConvAddBNKernel<FPGA, float>::Compute(
     const FusionConvAddBNParam &param) const {
   fpga::ComputeFpgaConv(param.FpgaArgs());
 }
-template class ConvAddBNKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
index e38ae9240534b17e97d7ee1c68bffb25a8aedf71..00bfa9101b6d5c464fb6603d8fde13ce2885a630 100644
--- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp
@@ -24,7 +24,6 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
     FusionConvAddBNReluParam *param) {
   bool relu_enabled = true;
   auto input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<Tensor *>(param->Filter());
@@ -58,14 +57,12 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, param->Groups());
-  auto filter_ptr = filter->data<float>();
 
   int element_num_per_div =
-      fpga::get_element_num_per_div(filter, param->Groups());
+      fpga::get_filter_num_per_div(filter, param->Groups());
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
   fpga::format_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
@@ -80,7 +77,6 @@ void ConvAddBNReluKernel<FPGA, float>::Compute(
     const FusionConvAddBNReluParam &param) const {
   fpga::ComputeFpgaConv(param.FpgaArgs());
 }
-template class ConvAddBNReluKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
index 31f28df5103942750758040ab983e2c0298a8cfd..71f0420b6ab264fd893c7e818e3cf9ac0f9341e5 100644
--- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp
@@ -23,7 +23,6 @@ template <>
 bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam *param) {
   bool relu_enabled = true;
   auto input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
   const Tensor *bias = param->Bias();
   auto bias_ptr = bias->data<float>();
   auto filter = const_cast<Tensor *>(param->Filter());
@@ -40,14 +39,12 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam *param) {
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, param->Groups());
-  auto filter_ptr = filter->data<float>();
 
   int element_num_per_div =
-      fpga::get_element_num_per_div(filter, param->Groups());
+      fpga::get_filter_num_per_div(filter, param->Groups());
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
   fpga::format_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
@@ -62,7 +59,6 @@ void ConvAddReluKernel<FPGA, float>::Compute(
     const FusionConvAddReluParam &param) const {
   fpga::ComputeFpgaConv(param.FpgaArgs());
 }
-template class ConvAddReluKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp
index 8818e98c376ab4e33d399bdf429e5b01928672e2..007561911231cfe25c199c0a9bd7238c58dc85e8 100644
--- a/src/operators/kernel/fpga/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp
@@ -24,7 +24,6 @@ template <>
 bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam *param) {
   bool relu_enabled = false;
   auto input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
   auto filter = const_cast<Tensor *>(param->Filter());
   auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
@@ -55,14 +54,12 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam *param) {
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, param->Groups());
-  auto filter_ptr = filter->data<float>();
 
   int element_num_per_div =
-      fpga::get_element_num_per_div(filter, param->Groups());
+      fpga::get_filter_num_per_div(filter, param->Groups());
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
   fpga::format_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
@@ -77,7 +74,6 @@ void ConvBNKernel<FPGA, float>::Compute(
     const FusionConvBNParam &param) const {
   fpga::ComputeFpgaConv(param.FpgaArgs());
 }
-template class ConvBNKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
index 8fe4425a23de2b4b16b241bf65d893d10132cc2e..4c62888b95c08b0198255124ebeff5a265274871 100644
--- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp
@@ -23,7 +23,6 @@ template <>
 bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam *param) {
   bool relu_enabled = true;
   auto input = const_cast<Tensor *>(param->Input());
-  auto input_ptr = input->data<float>();
   auto filter = const_cast<Tensor *>(param->Filter());
   auto out = param->Output();
   auto bn_mean_ptr = param->InputMean()->data<float>();
@@ -52,27 +51,12 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam *param) {
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, param->Groups());
-  auto filter_ptr = filter->data<float>();
 
   int element_num_per_div =
-      fpga::get_element_num_per_div(filter, param->Groups());
+      fpga::get_filter_num_per_div(filter, param->Groups());
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
 
   fpga::format_ofm(out);
-  auto out_ptr = out->mutable_data<float>();
-
-  fpga::WrapperConvArgs convArgs;
-  convArgs.group_num = (uint32_t)param->Groups();
-  convArgs.split_num = (uint32_t)fpga::get_plit_num(filter);
-  convArgs.filter_num = (uint32_t)filter->dims()[0];
-  convArgs.output.address = out_ptr;
-  convArgs.output.scale_address = out->scale;
-  convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
-      convArgs.split_num * sizeof(fpga::ConvArgs));
-  param->SetFpgaArgs(convArgs);
-
-  int element_num = fpga::get_aligned_filter_element_num(
-      filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
@@ -87,7 +71,6 @@ void ConvBNReluKernel<FPGA, float>::Compute(
     const FusionConvBNReluParam &param) const {
   fpga::ComputeFpgaConv(param.FpgaArgs());
 }
-template class ConvBNReluKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/dropout_kernel.cpp b/src/operators/kernel/fpga/dropout_kernel.cpp
index 3a4dd216d481322a9228cfd247bf6f0d0098177e..b0981c4254060996a16f4ae5beabb7c22edd6d34 100644
--- a/src/operators/kernel/fpga/dropout_kernel.cpp
+++ b/src/operators/kernel/fpga/dropout_kernel.cpp
@@ -27,13 +27,7 @@ bool DropoutKernel<FPGA, float>::Init(DropoutParam *param) {
 
 template <>
 void DropoutKernel<FPGA, float>::Compute(
-    const DropoutParam &param) const {
-  // auto *input_x = param.InputX();
-  // auto *out = param.Out();
-  // auto input_x_ptr = input_x->data<float>();
-  // auto out_ptr = out->mutable_data<float>();
-  // out_ptr = const_cast<float *>(input_x_ptr);
-}
+    const DropoutParam &param) const {}
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp
index 48d7425fcb7a3c630165fe4a7d26875a4f4a0a9d..5b4c95af9f8844ac2242887d9d0233ab1b83460d 100644
--- a/src/operators/kernel/fpga/fc_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp
@@ -21,7 +21,6 @@ template <>
 bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam *param) {
   bool relu_enabled = true;
   auto input_x = const_cast<Tensor *>(param->InputX());
-  auto input_x_ptr = input_x->data<float>();
   auto filter = const_cast<Tensor *>(param->InputY());
   auto input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
@@ -47,12 +46,10 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam *param) {
   filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, 1);
-  auto filter_ptr = filter->data<float>();
 
-  int element_num_per_div = fpga::get_element_num_per_div(filter, 1);
+  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-
-  auto out_ptr = out->mutable_data<float>();
+  fpga::format_ofm(out);
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
index ccc6009700c98f1f94835a7e21a83de1faade1f0..5681fcc7a7108bce971d0aa82733dbc7595e29cc 100644
--- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp
+++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp
@@ -22,7 +22,6 @@ template <>
 bool FusionFcKernel<FPGA, float>::Init(FusionFcParam *param) {
   bool relu_enabled = false;
   auto input_x = const_cast<Tensor *>(param->InputX());
-  auto input_x_ptr = input_x->data<float>();
   auto filter = const_cast<Tensor *>(param->InputY());
   const Tensor *input_z = param->InputZ();
   auto input_z_ptr = input_z->data<float>();
@@ -48,12 +47,10 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam *param) {
   filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
   float max_value = fpga::filter_find_max(filter);
   fpga::format_filter(filter, max_value, 1);
-  auto filter_ptr = filter->data<float>();
 
-  int element_num_per_div = fpga::get_element_num_per_div(filter, 1);
+  int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
   fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
-
-  auto out_ptr = out->mutable_data<float>();
+  fpga::format_ofm(out);
 
   fpga::WrapperConvArgs conv_arg;
   fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/pool_kernel.cpp
index d3df951dbc340814d766f76e8720c3aaef2f3539..82cb88b1d7c141ab94563e74a693119b328920fc 100644
--- a/src/operators/kernel/fpga/pool_kernel.cpp
+++ b/src/operators/kernel/fpga/pool_kernel.cpp
@@ -50,9 +50,7 @@ bool PoolKernel<FPGA, float>::Init(PoolParam *param) {
 
 template <>
 void PoolKernel<FPGA, float>::Compute(const PoolParam &param) const {
-#ifdef PADDLE_MOBILE_FPGA
   fpga::ComputeFpgaPool(param.FpgaArgs());
-#endif
 }
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp
index 077f7d3c8c870ea8be5f102bf23ec837b32117ac..7c784ce474bbb2588dcf78ecded740777445fc80 100644
--- a/src/operators/kernel/fpga/softmax_kernel.cpp
+++ b/src/operators/kernel/fpga/softmax_kernel.cpp
@@ -55,7 +55,6 @@ void SoftmaxKernel<FPGA, float>::Compute(
   math::SoftmaxFuntor<CPU, float>()(in_x, out);
 }
-template class SoftmaxKernel<FPGA, float>;
 
 }  // namespace operators
 }  // namespace paddle_mobile
diff --git a/test/fpga/test_format_data.cpp b/test/fpga/test_format_data.cpp
index 0fa3c23d2af6220959d434a6805adc9a7ae984a5..a7b2e393ceae3cc55e4c453a5e4738e7ebff6883 100644
--- a/test/fpga/test_format_data.cpp
+++ b/test/fpga/test_format_data.cpp
@@ -22,7 +22,7 @@ namespace fpga = paddle_mobile::fpga;
 using std::cout;
 using std::endl;
 
-int main() {
+void test_format_image() {
   std::vector<int> dims{1, 1, 3, 3};
   std::vector<float> elements{1, 2, 3, 4, 5, 6, 7, 8, 9};
   frame::DDim ddim = frame::make_ddim(dims);
@@ -44,6 +44,50 @@ int main() {
   cout << endl;
   auto dd = image.dims();
   cout << dims[0] << dims[1] << dims[2] << dims[3] << endl;
+}
+
+void test_fill_conv_arg() {
+  Tensor input, out, filter;
+  DLOG << "Setup input";
+  SetupTensor<float>(&input, {1, 250, 32, 30}, static_cast<float>(0),
+                     static_cast<float>(1));
+
+  DLOG << "Setup filter";
+  SetupTensor<float>(&filter, {1001, 250, 3, 3}, static_cast<float>(0),
+                     static_cast<float>(1));
+
+  DLOG << "Setup output";
+  SetupTensor<float>(&out, {1, 1001, 32, 30}, static_cast<float>(0),
+                     static_cast<float>(1));
+
+  auto bs_ptr = (float *)fpga::fpga_malloc(2 * 1001 * sizeof(float));
+
+  DLOG << "find max";
+  float max_value = fpga::filter_find_max(&filter);
+  DLOG << "format filter";
+  fpga::format_filter(&filter, max_value, 1);
+
+  DLOG << "format bs_ptr";
+  int element_num_per_div = fpga::get_filter_num_per_div(&filter, 1);
+  fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, 1001);
+  DLOG << "format ofm";
+  fpga::format_ofm(&out);
+  DLOG << "Build arg";
+
+  fpga::WrapperConvArgs arg;
+  fpga::fill_conv_arg(&arg, &input, &out, &filter, true, 1, 1, 1, 1, 1, bs_ptr);
+  DLOG << "splitNum: " << arg.split_num << " group_num:" << arg.group_num
+       << " filter_num:" << arg.filter_num;
+
+  for (int i = 0; i < arg.split_num; i++) {
+    DLOG << arg.conv_args[i].filter_num << " " << arg.conv_args[i].sb_address
+         << " " << arg.conv_args[i].filter_address << " "
+         << arg.conv_args[i].filter_scale_address;
+  }
+}
+
+int main() {
+  test_format_image();
+  test_fill_conv_arg();
   return 0;
 }
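Note on the new per-split addressing in `fill_conv_arg`: each division now advances the filter pointer by whole filters rather than by a single filter's element count, and `sb_address` advances two floats per filter because scale and bias are interlaced. The standalone sketch below reproduces that arithmetic with made-up values; `n`, `channel`, `filter_num_per_div`, and `element_num` are hypothetical stand-ins for the results of `get_plit_num`, `out->dims()[1]`, `get_filter_num_per_div`, and `get_aligned_filter_element_num` (the patch additionally rounds the tail count with `get_aligned_filter_num`, omitted here).

```cpp
#include <cstddef>
#include <cstdio>

int main() {
  // Made-up stand-ins for the values fill_conv_arg derives from the tensors.
  const int n = 3;                     // split_num
  const int channel = 1001;            // output channels == total filter count
  const int filter_num_per_div = 384;  // filters per division
  const int element_num = 2304;        // aligned int8 elements per filter

  for (int i = 0; i < n; i++) {
    // filter_address: each division holds filter_num_per_div whole filters,
    // so the offset grows by element_num * filter_num_per_div per split.
    size_t filter_offset = (size_t)i * element_num * filter_num_per_div;
    // sb_address: scale and bias are interlaced, two floats per filter.
    size_t sb_offset = (size_t)i * filter_num_per_div * 2;
    // The last division takes whatever filters remain.
    int filter_num = i == n - 1 ? channel - (n - 1) * filter_num_per_div
                                : filter_num_per_div;
    printf("split %d: filter offset %zu, sb offset %zu, %d filters\n", i,
           filter_offset, sb_offset, filter_num);
  }
  return 0;
}
```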
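Likewise, the reworked `format_ofm` sizing can be checked in isolation. A minimal sketch, assuming `IMAGE_ALIGNMENT` is 16, `align_to_x` rounds up to a multiple (both are stand-ins for the definitions in `src/fpga/api.h`), and a 2-byte half type:

```cpp
#include <cstddef>
#include <cstdio>
#include <vector>

// Assumed stand-ins for the alignment helper and constant used by format_ofm.
static const size_t IMAGE_ALIGNMENT = 16;
static size_t align_to_x(size_t num, size_t x) { return (num + x - 1) / x * x; }

// Mirrors the patched sizing: 4-D tensors align each channel*width row,
// 2-D tensors align the single row, anything else is rejected.
size_t ofm_memory_size(const std::vector<size_t> &dims) {
  const size_t sizeof_half = 2;
  if (dims.size() == 4) {
    size_t channel = dims[1], height = dims[2], width = dims[3];
    return height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof_half;
  } else if (dims.size() == 2) {
    return align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof_half;
  }
  return 0;  // "Wrong ofm dimension" in the patch
}

int main() {
  printf("{1, 1001, 32, 30} -> %zu bytes\n",
         ofm_memory_size({1, 1001, 32, 30}));
  printf("{1, 1001}         -> %zu bytes\n", ofm_memory_size({1, 1001}));
  return 0;
}
```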