diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index a9ca711bdb95472664c64cffe886d7b5c32aae7e..0aefa45953dff86a3ccec35e44dd8e072008df75 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -89,8 +89,14 @@ DLOG << " kernel_height:" << args.kernel.height DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address;*/ #endif + int split_num = args.split_num; + for (int i = 0; i < split_num; i++) { + do_ioctl(IOCTL_CONFIG_CONV, &args.conv_args[i]); + } - return do_ioctl(IOCTL_CONFIG_CONV, &args); + if (split_num > 1) { + ComputeFPGAConcat(args.concat_arg); + } } int ComputeFpgaPool(const struct PoolingArgs &args) { @@ -155,9 +161,16 @@ int PerformBypass(const struct BypassArgs &args) { return do_ioctl(IOCTL_CONFIG_BYPASS, &args); } +int ComputeFPGAConcat(const struct ConcatArgs &args) { + image::concat_images(args.images_in, args.scales_in, args.image_out, + args.scale_out, args.image_num, args.channel_num, + args.height, args.width); + return 0; +} + void format_image(framework::Tensor *image_tensor) { auto dims = image_tensor->dims(); - int channel = dims[1], height = dims[2], width = dims[3]; + auto channel = dims[1], height = dims[2], width = dims[3]; auto data_ptr = image_tensor->mutable_data(); size_t memory_size = channel * height * width * sizeof(float); float *new_data = (float *)fpga_malloc(memory_size); @@ -168,7 +181,7 @@ void format_image(framework::Tensor *image_tensor) { void format_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); - int channel = dims[1], height = dims[2], width = dims[3]; + auto channel = dims[1], height = dims[2], width = dims[3]; size_t memory_size = height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); ofm_tensor->reset_data_ptr(fpga_malloc(memory_size)); @@ -181,16 +194,16 @@ float filter_find_max(framework::Tensor *filter_tensor) { int get_plit_num(framework::Tensor *filter_tensor) { auto dims = filter_tensor->dims(); - int chw = dims[1] * dims[2] * dims[3]; - int num = dims[0]; + auto chw = dims[1] * dims[2] * dims[3]; + auto num = dims[0]; int div_capacity = filter::calc_division_capacity(chw); return filter::calc_split_num(num, div_capacity); } int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) { auto dims = filter_tensor->dims(); - int chw = dims[1] * dims[2] * dims[3]; - int num = dims[0]; + auto chw = dims[1] * dims[2] * dims[3]; + auto num = dims[0]; int div_capacity = filter::calc_division_capacity(chw); return filter::calc_num_per_div(num, group_num, div_capacity); } @@ -206,25 +219,10 @@ int get_aligned_filter_num(int num) { void format_filter(framework::Tensor *filter_tensor, float max_value, int group_num) { auto dims = filter_tensor->dims(); - int num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; + auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; auto data_ptr = filter_tensor->mutable_data(); size_t memory_size = num * channel * height * width * sizeof(float); - float *new_data = (float *)fpga_malloc(memory_size); - fpga_copy(new_data, data_ptr, memory_size); - filter::format_filter(&new_data, num, channel, height, width, group_num, - max_value); - filter_tensor->reset_data_ptr(new_data); -} - -void format_fc_matrix(framework::Tensor *filter_tensor, float max_value, - int group_num, int height, int width) { - auto dims = filter_tensor->dims(); - PADDLE_MOBILE_ENFORCE(height == 1 && width == 1, - "IFM should be flattened for FC"); - int num = dims[1], channel = dims[0] / height / width; - auto data_ptr = filter_tensor->mutable_data(); - size_t memory_size = num * channel * height * width * sizeof(float); - float *new_data = (float *)fpga_malloc(memory_size); + auto new_data = (float *)fpga_malloc(memory_size); fpga_copy(new_data, data_ptr, memory_size); filter::format_filter(&new_data, num, channel, height, width, group_num, max_value); @@ -237,5 +235,19 @@ void format_bias_scale_array(float **bias_scale_array, element_num_per_division, num); } +void format_concat_output(framework::Tensor *out, int height, int width, + int image_num, uint32_t *channel_num) { + int sum_channel = 0, sum_cw = 0; + for (int i = 0; i < image_num; i++) { + sum_channel += channel_num[i]; + } + + sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT); + auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half)); + auto ddim = framework::make_ddim({-1, sum_channel, height, width}); + out->Resize(ddim); + out->reset_data_ptr(data_ptr); +} + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/api.h b/src/fpga/api.h index aa13b09178ad2e63222041c0d432d341640b3847..3feae5c86a1133bbfc9001112565d8bdd79b7d34 100644 --- a/src/fpga/api.h +++ b/src/fpga/api.h @@ -92,12 +92,24 @@ struct ConvArgs { struct ImageOutputArgs output; }; +struct ConcatArgs { + uint32_t image_num; + half** images_in; + float** scales_in; + void* image_out; + float* scale_out; + uint32_t* channel_num; + uint32_t height; + uint32_t width; +}; + struct WrapperConvArgs { uint32_t split_num; uint32_t group_num; uint32_t filter_num; struct ImageOutputArgs output; - struct ConvArgs* args; + struct ConvArgs* conv_args; + struct ConcatArgs concat_arg; }; struct PoolingArgs { @@ -176,6 +188,7 @@ int PerformBypass(const struct BypassArgs& args); int ComputeFpgaConv(const struct WrapperConvArgs& args); int ComputeFpgaPool(const struct PoolingArgs& args); int ComputeFpgaEWAdd(const struct EWAddArgs& args); +int ComputeFPGAConcat(const struct ConcatArgs& args); static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } void format_image(framework::Tensor* image_tensor); @@ -188,10 +201,10 @@ int get_aligned_filter_num(int num); void format_filter(framework::Tensor* filter_tensor, float max_value, int group_num); -void format_fc_matrix(framework::Tensor* filter_tensor, float max_value, - int group_num, int height = 1, int width = 1); void format_bias_scale_array(float** bias_scale_array, int element_num_per_division, int num); +void format_concat_output(framework::Tensor* out, int height, int width, + int image_num, uint32_t* channel_num); } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/image.cpp b/src/fpga/image.cpp index 88168ee2125619ed0ae509d16e4fa81e5730d766..c6c150df75dbc0c4389bdec1f77b984098de72eb 100644 --- a/src/fpga/image.cpp +++ b/src/fpga/image.cpp @@ -62,6 +62,10 @@ void format_image(float **data_in, int channel, int height, int width) { align_element_conv(data_in, height, channel * width); } +void concat_images(int16_t **images_in, float **scales_in, void *image_out, + float *scale_out, int image_num, uint32_t *channel_num, + int height, int width) {} + } // namespace image } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/image.h b/src/fpga/image.h index 83ba5bc4d04ce4facaf9441cebe15534bf200f91..7e004916118ae97d60d24e798300d66a98191211 100644 --- a/src/fpga/image.h +++ b/src/fpga/image.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. */ #pragma once + +#include + #define IMAGE_ALIGNMENT 16 // Aligned to 16 namespace paddle_mobile { namespace fpga { @@ -21,6 +24,10 @@ namespace image { void convert_to_hwc(float** data_in, int channel, int height, int width); void align_element_conv(float** data_in, int height, int cw); void format_image(float** data_in, int channel, int height, int width); +void concat_images(int16_t** images_in, float** scales_in, void* image_out, + float* scale_out, int image_num, uint32_t* channel_num, + int height, + int width); // Concat featuremaps along channel direction } // namespace image } // namespace fpga } // namespace paddle_mobile diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp index 8bf7b20a224743f4395cd862d27f1882a847812a..9de1511746f70c225e2d978a43b43cb34ad9143f 100644 --- a/src/operators/kernel/fpga/concat_kernel.cpp +++ b/src/operators/kernel/fpga/concat_kernel.cpp @@ -21,31 +21,44 @@ namespace operators { template <> bool ConcatKernel::Init(ConcatParam *param) { + auto inputs = param->Inputs(); + auto out = param->Out(); + auto image_num = inputs.size(); + auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(int *)); + auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *)); + auto channel_num = + (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); + + auto height = inputs[0]->dims()[2]; + auto width = inputs[0]->dims()[3]; + for (int i = 0; i < image_num; i++) { + auto input = inputs[i]; + PADDLE_MOBILE_ENFORCE( + input->dims()[2] == height && input->dims()[3] == width, + "Image height & width should be unified"); + images_in[i] = (half *)input->data(); + channel_num[i] = (uint32_t)inputs[i]->dims()[1]; + scales_in[i] = input->scale; + } + fpga::format_concat_output(out, (int)height, (int)width, (int)image_num, + channel_num); + + fpga::ConcatArgs concatArgs; + concatArgs.image_num = (uint32_t)image_num; + concatArgs.images_in = images_in; + concatArgs.scales_in = scales_in; + concatArgs.image_out = (half *)out->mutable_data(); + concatArgs.scale_out = out->scale; + concatArgs.channel_num = channel_num; + concatArgs.height = (uint32_t)height; + concatArgs.width = (uint32_t)width; + param->SetFpgaArgs(concatArgs); return true; } template <> void ConcatKernel::Compute(const ConcatParam ¶m) const { - auto inputs = param.Inputs(); - auto *out = param.Out(); - int64_t axis = param.Axis(); - out->mutable_data(); - - DDim out_dim = out->dims(); - int pixels = out_dim[1] * out_dim[2]; - auto out_channel = out_dim[3]; - - auto out_offset = 0; - for (int i = 0; i < inputs.size(); ++i) { - auto input = inputs[i]; - auto channels = input->dims()[3]; - out_offset += channels; - auto src = input->data(); - for (int j = 0; j < pixels; ++j) { - auto dst = out->mutable_data() + out_offset; - memory::Copy(dst, src, sizeof(half)); - } - } + ComputeFPGAConcat(param.FpgaArgs()); } template class ConcatKernel; diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp index 2803a3de456e355d1044bec6beaaa3dad8a4e312..9597cf3178ca3d6758f140eec7e7b6281606ad80 100644 --- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp @@ -22,13 +22,13 @@ namespace operators { template <> bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { bool relu_enabled = false; - auto *input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto input_ptr = input->data(); - const Tensor *bias = param->Bias(); + auto bias = param->Bias(); auto bias_ptr = bias->data(); - auto *filter = const_cast(param->Filter()); + auto filter = const_cast(param->Filter()); - Tensor *out = param->Output(); + auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); @@ -40,10 +40,10 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - auto *bs_ptr = + auto bs_ptr = reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); - auto *new_scale = new Tensor(); - auto *new_bias = new Tensor(); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = new_bias->mutable_data({channel}); @@ -75,35 +75,68 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { convArgs.filter_num = (uint32_t)filter->dims()[0]; convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; - convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * - sizeof(fpga::ConvArgs)); + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); + + convArgs.concat_arg.image_num = convArgs.split_num; + convArgs.concat_arg.image_out = out_ptr; + convArgs.concat_arg.scale_out = out->scale; + convArgs.concat_arg.height = (uint32_t)filter->dims()[2]; + convArgs.concat_arg.width = (uint32_t)filter->dims()[3]; + + int n = convArgs.split_num; + convArgs.concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *)); + convArgs.concat_arg.scales_in = + (float **)fpga::fpga_malloc(n * sizeof(float *)); + convArgs.concat_arg.channel_num = + (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t)); + convArgs.concat_arg.image_out = out_ptr; + param->SetFpgaArgs(convArgs); int element_num = fpga::get_aligned_filter_element_num( filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); - int n = convArgs.split_num; for (int i = 0; i < n; i++) { - convArgs.args[i].relu_enabled = relu_enabled; - convArgs.args[i].group_num = (uint32_t)param->Groups(); - convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; - convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; - convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2]; - convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3]; - convArgs.args[i].image.address = input_ptr; - convArgs.args[i].image.channels = (uint32_t)input->dims()[1]; - convArgs.args[i].image.height = (uint32_t)input->dims()[2]; - convArgs.args[i].image.width = (uint32_t)input->dims()[3]; - convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0]; - convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1]; - convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num]; - convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; - convArgs.args[i].filter_num = + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = (uint32_t)param->Groups(); + convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; + convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3]; + convArgs.conv_args[i].image.scale_address = input->scale; + convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0]; + convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1]; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( channel - (n - 1) * element_num_per_div) : element_num_per_div); - convArgs.args[i].image.scale_address = - (float *)fpga::fpga_malloc(2 * sizeof(float)); + + if (n > 1) { + convArgs.conv_args[i].output.scale_address = + (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].output.address = + fpga::fpga_malloc(input->dims()[2] * input->dims()[3] * + convArgs.conv_args[i].filter_num * sizeof(half)); + } + + else { + convArgs.conv_args[i].output.scale_address = out->scale; + convArgs.conv_args[i].output.address = out_ptr; + } + + convArgs.concat_arg.images_in[i] = + (half *)convArgs.conv_args[i].output.address; + convArgs.concat_arg.scales_in[i] = + (float *)convArgs.conv_args[i].sb_address; + convArgs.concat_arg.channel_num[i] = convArgs.conv_args[i].filter_num; } return true; } diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp index b96e6669998287588af5fbbde27ca8a87fa30b90..c8f7292f89b7a98b290bdccde1139a2df2d10182 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -23,12 +23,12 @@ template <> bool ConvAddBNReluKernel::Init( FusionConvAddBNReluParam *param) { bool relu_enabled = true; - Tensor *input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - Tensor *filter = const_cast(param->Filter()); - Tensor *out = param->Output(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -39,9 +39,9 @@ bool ConvAddBNReluKernel::Init( "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); - Tensor *new_scale = new Tensor(); - Tensor *new_bias = new Tensor(); + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = new_bias->mutable_data({channel}); @@ -73,8 +73,8 @@ bool ConvAddBNReluKernel::Init( convArgs.filter_num = (uint32_t)filter->dims()[0]; convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; - convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * - sizeof(fpga::ConvArgs)); + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); int element_num = fpga::get_aligned_filter_element_num( @@ -82,26 +82,28 @@ bool ConvAddBNReluKernel::Init( int n = convArgs.split_num; for (int i = 0; i < n; i++) { - convArgs.args[i].relu_enabled = relu_enabled; - convArgs.args[i].group_num = (uint32_t)param->Groups(); - convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; - convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; - convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2]; - convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3]; - convArgs.args[i].image.address = input_ptr; - convArgs.args[i].image.channels = (uint32_t)input->dims()[1]; - convArgs.args[i].image.height = (uint32_t)input->dims()[2]; - convArgs.args[i].image.width = (uint32_t)input->dims()[3]; - convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0]; - convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1]; - convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num]; - convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; - convArgs.args[i].filter_num = + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = (uint32_t)param->Groups(); + convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; + convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3]; + convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0]; + convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1]; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( channel - (n - 1) * element_num_per_div) : element_num_per_div); - convArgs.args[i].image.scale_address = + convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input->scale; } return true; return true; diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp index 9bc041a4d960a1bacc74e1caffce3190b3659363..4b0c877376c44fa5079d52eefd33e1025a60f1c5 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -22,17 +22,17 @@ namespace operators { template <> bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { bool relu_enabled = true; - Tensor *input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - auto *filter = const_cast(param->Filter()); - Tensor *out = param->Output(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; - auto *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = bias_ptr[i]; @@ -55,8 +55,8 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { convArgs.filter_num = (uint32_t)filter->dims()[0]; convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; - convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * - sizeof(fpga::ConvArgs)); + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); int element_num = fpga::get_aligned_filter_element_num( @@ -64,26 +64,28 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { int n = convArgs.split_num; for (int i = 0; i < n; i++) { - convArgs.args[i].relu_enabled = relu_enabled; - convArgs.args[i].group_num = (uint32_t)param->Groups(); - convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; - convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; - convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2]; - convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3]; - convArgs.args[i].image.address = input_ptr; - convArgs.args[i].image.channels = (uint32_t)input->dims()[1]; - convArgs.args[i].image.height = (uint32_t)input->dims()[2]; - convArgs.args[i].image.width = (uint32_t)input->dims()[3]; - convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0]; - convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1]; - convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num]; - convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; - convArgs.args[i].filter_num = + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = (uint32_t)param->Groups(); + convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; + convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3]; + convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0]; + convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1]; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( channel - (n - 1) * element_num_per_div) : element_num_per_div); - convArgs.args[i].image.scale_address = + convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input->scale; } return true; } diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp index 0c25ca0e8c9acef2caeeb2f712efe9db3eb21012..60e562f4ed9357e1279992cd1691aed516e06138 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -23,10 +23,10 @@ namespace operators { template <> bool ConvBNKernel::Init(FusionConvBNParam *param) { bool relu_enabled = false; - Tensor *input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto input_ptr = input->data(); - Tensor *filter = const_cast(param->Filter()); - Tensor *out = param->Output(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -36,10 +36,10 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - float *bs_ptr = + auto bs_ptr = reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); - Tensor *new_scale = new Tensor(); - Tensor *new_bias = new Tensor(); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = new_bias->mutable_data({channel}); @@ -70,8 +70,8 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { convArgs.filter_num = (uint32_t)filter->dims()[0]; convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; - convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * - sizeof(fpga::ConvArgs)); + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); int element_num = fpga::get_aligned_filter_element_num( @@ -79,26 +79,28 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { int n = convArgs.split_num; for (int i = 0; i < n; i++) { - convArgs.args[i].relu_enabled = relu_enabled; - convArgs.args[i].group_num = (uint32_t)param->Groups(); - convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; - convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; - convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2]; - convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3]; - convArgs.args[i].image.address = input_ptr; - convArgs.args[i].image.channels = (uint32_t)input->dims()[1]; - convArgs.args[i].image.height = (uint32_t)input->dims()[2]; - convArgs.args[i].image.width = (uint32_t)input->dims()[3]; - convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0]; - convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1]; - convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num]; - convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; - convArgs.args[i].filter_num = + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = (uint32_t)param->Groups(); + convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; + convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3]; + convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0]; + convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1]; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( channel - (n - 1) * element_num_per_div) : element_num_per_div); - convArgs.args[i].image.scale_address = + convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input->scale; } return true; } diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp index 2a493fc07cadeaca54bd2d8d37727a56017e9455..95775f30e667cec5c561e466b22a52d5f1dd44e3 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -22,10 +22,10 @@ namespace operators { template <> bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { bool relu_enabled = true; - Tensor *input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto input_ptr = input->data(); - Tensor *filter = const_cast(param->Filter()); - Tensor *out = param->Output(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -34,9 +34,9 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); - Tensor *new_scale = new Tensor(); - Tensor *new_bias = new Tensor(); + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = new_bias->mutable_data({channel}); @@ -67,8 +67,8 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { convArgs.filter_num = (uint32_t)filter->dims()[0]; convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; - convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * - sizeof(fpga::ConvArgs)); + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); int element_num = fpga::get_aligned_filter_element_num( @@ -76,26 +76,28 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { int n = convArgs.split_num; for (int i = 0; i < n; i++) { - convArgs.args[i].relu_enabled = relu_enabled; - convArgs.args[i].group_num = (uint32_t)param->Groups(); - convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; - convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; - convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2]; - convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3]; - convArgs.args[i].image.address = input_ptr; - convArgs.args[i].image.channels = (uint32_t)input->dims()[1]; - convArgs.args[i].image.height = (uint32_t)input->dims()[2]; - convArgs.args[i].image.width = (uint32_t)input->dims()[3]; - convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0]; - convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1]; - convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num]; - convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; - convArgs.args[i].filter_num = + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = (uint32_t)param->Groups(); + convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; + convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3]; + convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0]; + convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1]; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( channel - (n - 1) * element_num_per_div) : element_num_per_div); - convArgs.args[i].image.scale_address = + convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input->scale; } return true; } diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index c188831e1303e365275283adb54f55d571aca52d..75e680199156b5e315e0d59f4010e21e0b23907a 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -20,16 +20,16 @@ namespace operators { template <> bool FusionFcReluKernel::Init(FusionFcReluParam *param) { bool relu_enabled = true; - auto *input_x = const_cast(param->InputX()); + auto input_x = const_cast(param->InputX()); auto input_x_ptr = input_x->data(); - auto *filter = const_cast(param->InputY()); - const Tensor *input_z = param->InputZ(); + auto filter = const_cast(param->InputY()); + auto input_z = param->InputZ(); auto input_z_ptr = input_z->data(); - Tensor *out = param->Out(); + auto out = param->Out(); PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], "Image channel should be equal to weight number"); int channel = (uint32_t)out->dims()[1]; - auto *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = input_z_ptr[i]; @@ -60,8 +60,8 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { convArgs.filter_num = (uint32_t)filter->dims()[0]; convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; - convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * - sizeof(fpga::ConvArgs)); + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); int element_num = fpga::get_aligned_filter_element_num( @@ -69,26 +69,28 @@ bool FusionFcReluKernel::Init(FusionFcReluParam *param) { int n = convArgs.split_num; for (int i = 0; i < n; i++) { - convArgs.args[i].relu_enabled = relu_enabled; - convArgs.args[i].group_num = 1; - convArgs.args[i].kernel.stride_h = 1; - convArgs.args[i].kernel.stride_w = 1; - convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2]; - convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3]; - convArgs.args[i].image.address = input_x_ptr; - convArgs.args[i].image.channels = (uint32_t)input_x->dims()[1]; - convArgs.args[i].image.height = (uint32_t)input_x->dims()[2]; - convArgs.args[i].image.width = (uint32_t)input_x->dims()[3]; - convArgs.args[i].image.pad_height = 0; - convArgs.args[i].image.pad_width = 0; - convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num]; - convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; - convArgs.args[i].filter_num = + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = 1; + convArgs.conv_args[i].kernel.stride_h = 1; + convArgs.conv_args[i].kernel.stride_w = 1; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_x_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3]; + convArgs.conv_args[i].image.pad_height = 0; + convArgs.conv_args[i].image.pad_width = 0; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( channel - (n - 1) * element_num_per_div) : element_num_per_div); - convArgs.args[i].image.scale_address = + convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input_x->scale; } return true; } diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index 6821e135085ce2a57ea58e4657549c54a1246342..3fe0457dffdea83ef59738a2c436bf558ab9635f 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -21,17 +21,17 @@ namespace operators { template <> bool FusionFcKernel::Init(FusionFcParam *param) { bool relu_enabled = false; - auto *input_x = const_cast(param->InputX()); + auto input_x = const_cast(param->InputX()); auto input_x_ptr = input_x->data(); - auto *filter = const_cast(param->InputY()); + auto filter = const_cast(param->InputY()); const Tensor *input_z = param->InputZ(); auto input_z_ptr = input_z->data(); - Tensor *out = param->Out(); + auto out = param->Out(); PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], "Image channel should be equal to weight number"); int channel = (uint32_t)out->dims()[1]; - auto *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = input_z_ptr[i]; @@ -61,8 +61,8 @@ bool FusionFcKernel::Init(FusionFcParam *param) { convArgs.filter_num = (uint32_t)filter->dims()[0]; convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; - convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num * - sizeof(fpga::ConvArgs)); + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); int element_num = fpga::get_aligned_filter_element_num( @@ -70,26 +70,28 @@ bool FusionFcKernel::Init(FusionFcParam *param) { int n = convArgs.split_num; for (int i = 0; i < n; i++) { - convArgs.args[i].relu_enabled = relu_enabled; - convArgs.args[i].group_num = 1; - convArgs.args[i].kernel.stride_h = 1; - convArgs.args[i].kernel.stride_w = 1; - convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2]; - convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3]; - convArgs.args[i].image.address = input_x_ptr; - convArgs.args[i].image.channels = (uint32_t)input_x->dims()[1]; - convArgs.args[i].image.height = (uint32_t)input_x->dims()[2]; - convArgs.args[i].image.width = (uint32_t)input_x->dims()[3]; - convArgs.args[i].image.pad_height = 0; - convArgs.args[i].image.pad_width = 0; - convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num]; - convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; - convArgs.args[i].filter_num = + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = 1; + convArgs.conv_args[i].kernel.stride_h = 1; + convArgs.conv_args[i].kernel.stride_w = 1; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_x_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3]; + convArgs.conv_args[i].image.pad_height = 0; + convArgs.conv_args[i].image.pad_width = 0; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( channel - (n - 1) * element_num_per_div) : element_num_per_div); - convArgs.args[i].image.scale_address = + convArgs.conv_args[i].output.scale_address = (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input_x->scale; } return true; } diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index fd9fdda58a0d193729a40f1ff2a23f5d5cade948..38e596c3ef8706a2598befa96108b7252b908cca 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -734,7 +734,7 @@ void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a, #endif } } - WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias); + // WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias); } void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b, diff --git a/src/operators/op_param.h b/src/operators/op_param.h index e7b02d0fc3a15ade9ca758e2eb47fb2d6d7c517a..5ceca937efe85e8ff06d5f1320f1018014a5bbc9 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -483,6 +483,15 @@ class ConcatParam : public OpParam { vector inputs_; GType *out_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::ConcatArgs fpga_concat_args; + + public: + const fpga::ConcatArgs &FpgaArgs() const { return fpga_concat_args; } + void SetFpgaArgs(const fpga::ConcatArgs &args) { fpga_concat_args = args; } +#endif }; #endif