diff --git a/CMakeLists.txt b/CMakeLists.txt index 8438ad53b026df19beb9f49becee379a2e45a795..8c388d8b2a6374c68aecf86b215c8e8462b13c2b 100644 --- a/CMakeLists.txt +++ b/CMakeLists.txt @@ -1,9 +1,9 @@ cmake_minimum_required(VERSION 3.6) -option(USE_OPENMP "openmp support" ON) +option(USE_OPENMP "openmp support" OFF) project(paddle-mobile) -option(DEBUGING "enable debug mode" OFF) +option(DEBUGING "enable debug mode" ON) option(USE_EXCEPTION "use std exception" OFF) option(LOG_PROFILE "log profile" OFF) # select the platform to build @@ -94,6 +94,8 @@ else() endif() if(FPGA) + set(DEBUGING ON) + add_definitions(-DPADDLE_MOBILE_DEBUG) add_definitions(-DPADDLE_MOBILE_FPGA) else() file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/*.cpp src/operators/kernel/fpga/*.cc) @@ -140,7 +142,12 @@ set(CMAKE_LIBRARY_OUTPUT_DIRECTORY build) set(CMAKE_RUNTIME_OUTPUT_DIRECTORY build) # NET default -set(NET "default" CACHE STRING "select net type") +if (FPGA) + set(NET "FPGAnets" CACHE STRING "select net type") +else() + set(NET "default" CACHE STRING "select net type") +endif() + set_property(CACHE NET PROPERTY STRINGS "default" "googlenet" "mobilenet" "yolo" "squeezenet" "FPGAnets" "NLP") include("${CMAKE_CURRENT_LIST_DIR}/tools/op.cmake") diff --git a/src/fpga/api.cpp b/src/fpga/api.cpp index 9d8a1c7f6780b74481d51d2c9b3097df86f6817d..0aefa45953dff86a3ccec35e44dd8e072008df75 100644 --- a/src/fpga/api.cpp +++ b/src/fpga/api.cpp @@ -68,29 +68,35 @@ void fpga_copy(void *dest, const void *src, size_t num) { memcpy(dest, src, num); } -int ComputeFpgaConv(const struct ConvArgs &args) { +int ComputeFpgaConv(const struct WrapperConvArgs &args) { #ifdef FPGA_TEST_MODE - DLOG << " relu_enabled:" << args.relu_enabled - << " sb_address:" << args.sb_address - << " filter_address:" << args.filter_address - << " filter_num:" << args.filter_num - << " group_num:" << args.group_num; - DLOG << " image_address:" << args.image.address - << " image_scale_address:" << args.image.scale_address - << " 
image_channels:" << args.image.channels - << " image_height:" << args.image.height - << " image_width:" << args.image.width - << " pad_height:" << args.image.pad_height - << " pad_width:" << args.image.pad_width; - DLOG << " kernel_height:" << args.kernel.height - << " kernel_width:" << args.kernel.width - << " stride_h:" << args.kernel.stride_h - << " stride_w:" << args.kernel.stride_w; - DLOG << " out_address:" << args.output.address - << " out_scale_address:" << args.output.scale_address; +/*DLOG << " relu_enabled:" << args.relu_enabled + << " sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; +DLOG << " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; +DLOG << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; +DLOG << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address;*/ #endif + int split_num = args.split_num; + for (int i = 0; i < split_num; i++) { + do_ioctl(IOCTL_CONFIG_CONV, &args.conv_args[i]); + } - return do_ioctl(IOCTL_CONFIG_CONV, &args); + if (split_num > 1) { + ComputeFPGAConcat(args.concat_arg); + } } int ComputeFpgaPool(const struct PoolingArgs &args) { @@ -155,9 +161,16 @@ int PerformBypass(const struct BypassArgs &args) { return do_ioctl(IOCTL_CONFIG_BYPASS, &args); } +int ComputeFPGAConcat(const struct ConcatArgs &args) { + image::concat_images(args.images_in, args.scales_in, args.image_out, + args.scale_out, args.image_num, args.channel_num, + args.height, args.width); + return 0; +} + void format_image(framework::Tensor 
*image_tensor) { auto dims = image_tensor->dims(); - int channel = dims[1], height = dims[2], width = dims[3]; + auto channel = dims[1], height = dims[2], width = dims[3]; auto data_ptr = image_tensor->mutable_data(); size_t memory_size = channel * height * width * sizeof(float); float *new_data = (float *)fpga_malloc(memory_size); @@ -168,7 +181,7 @@ void format_image(framework::Tensor *image_tensor) { void format_ofm(framework::Tensor *ofm_tensor) { auto dims = ofm_tensor->dims(); - int channel = dims[1], height = dims[2], width = dims[3]; + auto channel = dims[1], height = dims[2], width = dims[3]; size_t memory_size = height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half); ofm_tensor->reset_data_ptr(fpga_malloc(memory_size)); @@ -178,38 +191,38 @@ float filter_find_max(framework::Tensor *filter_tensor) { auto filter_ptr = filter_tensor->data(); return filter::find_max(filter_ptr, filter_tensor->numel()); } + +int get_plit_num(framework::Tensor *filter_tensor) { + auto dims = filter_tensor->dims(); + auto chw = dims[1] * dims[2] * dims[3]; + auto num = dims[0]; + int div_capacity = filter::calc_division_capacity(chw); + return filter::calc_split_num(num, div_capacity); +} + int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) { auto dims = filter_tensor->dims(); - PADDLE_MOBILE_ENFORCE(dims.size() == 4 || dims.size() == 2, - "Filter order should be 4 or 2"); - int chw = dims.size() == 4 ? dims[1] * dims[2] * dims[3] : dims[1]; - int num = dims.size() == 4 ? 
dims[0] : dims[1]; + auto chw = dims[1] * dims[2] * dims[3]; + auto num = dims[0]; int div_capacity = filter::calc_division_capacity(chw); return filter::calc_num_per_div(num, group_num, div_capacity); } -void format_filter(framework::Tensor *filter_tensor, float max_value, - int group_num) { - auto dims = filter_tensor->dims(); - int num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; - auto data_ptr = filter_tensor->mutable_data(); - size_t memory_size = num * channel * height * width * sizeof(float); - float *new_data = (float *)fpga_malloc(memory_size); - fpga_copy(new_data, data_ptr, memory_size); - filter::format_filter(&new_data, num, channel, height, width, group_num, - max_value); - filter_tensor->reset_data_ptr(new_data); +int get_aligned_filter_element_num(int chw) { + return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); +} + +int get_aligned_filter_num(int num) { + return align_to_x(num, FILTER_NUM_ALIGNMENT); } -void format_fc_matrix(framework::Tensor *filter_tensor, float max_value, - int group_num, int height, int width) { +void format_filter(framework::Tensor *filter_tensor, float max_value, + int group_num) { auto dims = filter_tensor->dims(); - PADDLE_MOBILE_ENFORCE(height == 1 && width == 1, - "IFM should be flattened for FC"); - int num = dims[1], channel = dims[0] / height / width; + auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3]; auto data_ptr = filter_tensor->mutable_data(); size_t memory_size = num * channel * height * width * sizeof(float); - float *new_data = (float *)fpga_malloc(memory_size); + auto new_data = (float *)fpga_malloc(memory_size); fpga_copy(new_data, data_ptr, memory_size); filter::format_filter(&new_data, num, channel, height, width, group_num, max_value); @@ -222,5 +235,19 @@ void format_bias_scale_array(float **bias_scale_array, element_num_per_division, num); } +void format_concat_output(framework::Tensor *out, int height, int width, + int image_num, uint32_t *channel_num) { + int 
sum_channel = 0, sum_cw = 0; + for (int i = 0; i < image_num; i++) { + sum_channel += channel_num[i]; + } + + sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT); + auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half)); + auto ddim = framework::make_ddim({-1, sum_channel, height, width}); + out->Resize(ddim); + out->reset_data_ptr(data_ptr); +} + } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/api.h b/src/fpga/api.h index f7010e56ad4caf7bc49f5d37301009e226780da5..3feae5c86a1133bbfc9001112565d8bdd79b7d34 100644 --- a/src/fpga/api.h +++ b/src/fpga/api.h @@ -92,6 +92,26 @@ struct ConvArgs { struct ImageOutputArgs output; }; +struct ConcatArgs { + uint32_t image_num; + half** images_in; + float** scales_in; + void* image_out; + float* scale_out; + uint32_t* channel_num; + uint32_t height; + uint32_t width; +}; + +struct WrapperConvArgs { + uint32_t split_num; + uint32_t group_num; + uint32_t filter_num; + struct ImageOutputArgs output; + struct ConvArgs* conv_args; + struct ConcatArgs concat_arg; +}; + struct PoolingArgs { struct KernelArgs kernel; struct ImageInputArgs image; // input image; @@ -165,21 +185,26 @@ enum FPGA_ERR_TYPE { //============================== API ============================= int PerformBypass(const struct BypassArgs& args); -int ComputeFpgaConv(const struct ConvArgs& args); +int ComputeFpgaConv(const struct WrapperConvArgs& args); int ComputeFpgaPool(const struct PoolingArgs& args); int ComputeFpgaEWAdd(const struct EWAddArgs& args); +int ComputeFPGAConcat(const struct ConcatArgs& args); static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } void format_image(framework::Tensor* image_tensor); void format_ofm(framework::Tensor* ofm_tensor); // only allocate memory float filter_find_max(framework::Tensor* filter_tensor); int get_element_num_per_div(framework::Tensor* filter_tensor, int group_num); +int get_plit_num(framework::Tensor* filter_tensor); +int 
get_aligned_filter_element_num(int chw); +int get_aligned_filter_num(int num); + void format_filter(framework::Tensor* filter_tensor, float max_value, int group_num); -void format_fc_matrix(framework::Tensor* filter_tensor, float max_value, - int group_num, int height = 1, int width = 1); void format_bias_scale_array(float** bias_scale_array, int element_num_per_division, int num); +void format_concat_output(framework::Tensor* out, int height, int width, + int image_num, uint32_t* channel_num); } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/image.cpp b/src/fpga/image.cpp index 88168ee2125619ed0ae509d16e4fa81e5730d766..c6c150df75dbc0c4389bdec1f77b984098de72eb 100644 --- a/src/fpga/image.cpp +++ b/src/fpga/image.cpp @@ -62,6 +62,10 @@ void format_image(float **data_in, int channel, int height, int width) { align_element_conv(data_in, height, channel * width); } +void concat_images(int16_t **images_in, float **scales_in, void *image_out, + float *scale_out, int image_num, uint32_t *channel_num, + int height, int width) {} + } // namespace image } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/image.h b/src/fpga/image.h index 83ba5bc4d04ce4facaf9441cebe15534bf200f91..7e004916118ae97d60d24e798300d66a98191211 100644 --- a/src/fpga/image.h +++ b/src/fpga/image.h @@ -13,6 +13,9 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #pragma once + +#include + #define IMAGE_ALIGNMENT 16 // Aligned to 16 namespace paddle_mobile { namespace fpga { @@ -21,6 +24,10 @@ namespace image { void convert_to_hwc(float** data_in, int channel, int height, int width); void align_element_conv(float** data_in, int height, int cw); void format_image(float** data_in, int channel, int height, int width); +void concat_images(int16_t** images_in, float** scales_in, void* image_out, + float* scale_out, int image_num, uint32_t* channel_num, + int height, + int width); // Concat featuremaps along channel direction } // namespace image } // namespace fpga } // namespace paddle_mobile diff --git a/src/framework/dim.h b/src/framework/dim.h index 0d3e86e92289da155843e1a9959d5ea67a73c060..85e86076e1de53fa80b75f56237901da49e22eb9 100644 --- a/src/framework/dim.h +++ b/src/framework/dim.h @@ -15,6 +15,7 @@ limitations under the License. */ #pragma once #include +#include #include "common/enforce.h" namespace paddle_mobile { namespace framework { diff --git a/src/operators/feed_op.h b/src/operators/feed_op.h index 7982735030690a9d3fe75cbadeb45f0f70a78836..e1f8fdf63ff508d9afc59e2230406c46f2c9e4d0 100644 --- a/src/operators/feed_op.h +++ b/src/operators/feed_op.h @@ -49,7 +49,7 @@ class FeedOp : public framework::OperatorBase { } void RunImpl() const { - Tensor *input = const_cast(param_.InputX()); + auto input = (Tensor *)const_cast(param_.InputX()); auto input_ptr = input->data(); fpga::format_image(input); Tensor *output = param_.Out(); diff --git a/src/operators/kernel/arm/dropout_kernel.cpp b/src/operators/kernel/arm/dropout_kernel.cpp index 348b74cd26c79f5d0f2e65012a5760ab7eb58707..4578ac6607d87c316853f6201f02f8204bc41de1 100644 --- a/src/operators/kernel/arm/dropout_kernel.cpp +++ b/src/operators/kernel/arm/dropout_kernel.cpp @@ -27,7 +27,11 @@ bool DropoutKernel::Init(DropoutParam *para) { template struct DropoutFunctor { - inline T operator()(T in) const { return in; } + DropoutFunctor(T drop_pro) : 
dropout_pro_(drop_pro) {} + inline T operator()(T in) const { return (1 - dropout_pro_) * in; } + + private: + T dropout_pro_; }; template <> @@ -36,8 +40,8 @@ void DropoutKernel::Compute(const DropoutParam &param) const { auto *input_x_ptr = input_x->data(); auto *out = param.Out(); auto *out_ptr = out->mutable_data(); - - DropoutFunctor func_; + const float dropoutProb = param.DropoutProb(); + DropoutFunctor func_(dropoutProb); math::Transform trans; trans(input_x_ptr, input_x_ptr + input_x->numel(), out_ptr, func_); } diff --git a/src/operators/kernel/fpga/concat_kernel.cpp b/src/operators/kernel/fpga/concat_kernel.cpp index 8bf7b20a224743f4395cd862d27f1882a847812a..9de1511746f70c225e2d978a43b43cb34ad9143f 100644 --- a/src/operators/kernel/fpga/concat_kernel.cpp +++ b/src/operators/kernel/fpga/concat_kernel.cpp @@ -21,31 +21,44 @@ namespace operators { template <> bool ConcatKernel::Init(ConcatParam *param) { + auto inputs = param->Inputs(); + auto out = param->Out(); + auto image_num = inputs.size(); + auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(int *)); + auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *)); + auto channel_num = + (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t)); + + auto height = inputs[0]->dims()[2]; + auto width = inputs[0]->dims()[3]; + for (int i = 0; i < image_num; i++) { + auto input = inputs[i]; + PADDLE_MOBILE_ENFORCE( + input->dims()[2] == height && input->dims()[3] == width, + "Image height & width should be unified"); + images_in[i] = (half *)input->data(); + channel_num[i] = (uint32_t)inputs[i]->dims()[1]; + scales_in[i] = input->scale; + } + fpga::format_concat_output(out, (int)height, (int)width, (int)image_num, + channel_num); + + fpga::ConcatArgs concatArgs; + concatArgs.image_num = (uint32_t)image_num; + concatArgs.images_in = images_in; + concatArgs.scales_in = scales_in; + concatArgs.image_out = (half *)out->mutable_data(); + concatArgs.scale_out = out->scale; + 
concatArgs.channel_num = channel_num; + concatArgs.height = (uint32_t)height; + concatArgs.width = (uint32_t)width; + param->SetFpgaArgs(concatArgs); return true; } template <> void ConcatKernel::Compute(const ConcatParam &param) const { - auto inputs = param.Inputs(); - auto *out = param.Out(); - int64_t axis = param.Axis(); - out->mutable_data(); - - DDim out_dim = out->dims(); - int pixels = out_dim[1] * out_dim[2]; - auto out_channel = out_dim[3]; - - auto out_offset = 0; - for (int i = 0; i < inputs.size(); ++i) { - auto input = inputs[i]; - auto channels = input->dims()[3]; - out_offset += channels; - auto src = input->data(); - for (int j = 0; j < pixels; ++j) { - auto dst = out->mutable_data() + out_offset; - memory::Copy(dst, src, sizeof(half)); - } - } + ComputeFPGAConcat(param.FpgaArgs()); } template class ConcatKernel; diff --git a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp index 74080e6b0541a068956f031f984eea9ac0160b2d..9597cf3178ca3d6758f140eec7e7b6281606ad80 100644 --- a/src/operators/kernel/fpga/conv_add_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_kernel.cpp @@ -15,7 +15,6 @@ limitations under the License. 
*/ #ifdef FUSION_CONVADDBN_OP #include "operators/kernel/conv_add_bn_kernel.h" -#include "fpga/api.h" namespace paddle_mobile { namespace operators { @@ -23,13 +22,13 @@ namespace operators { template <> bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { bool relu_enabled = false; - Tensor *input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto input_ptr = input->data(); - const Tensor *bias = param->Bias(); + auto bias = param->Bias(); auto bias_ptr = bias->data(); - Tensor *filter = param->Filter(); + auto filter = const_cast(param->Filter()); - Tensor *out = param->Output(); + auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); @@ -41,10 +40,10 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - float *bs_ptr = + auto bs_ptr = reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); - Tensor *new_scale = new Tensor(); - Tensor *new_bias = new Tensor(); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = new_bias->mutable_data({channel}); @@ -70,27 +69,75 @@ bool ConvAddBNKernel::Init(FusionConvAddBNParam *param) { fpga::format_ofm(out); auto out_ptr = out->mutable_data(); - fpga::ConvArgs convArgs; - convArgs.relu_enabled = relu_enabled; - convArgs.filter_address = (void *)filter_ptr; - convArgs.filter_num = filter->dims()[0]; - convArgs.group_num = param->Groups(); - convArgs.sb_address = (void *)bs_ptr; - convArgs.kernel.stride_h = param->Strides()[0]; - convArgs.kernel.stride_w = param->Strides()[1]; - convArgs.kernel.height = filter->dims()[2]; - convArgs.kernel.width = filter->dims()[3]; - convArgs.image.address = (void *)input_ptr; - convArgs.image.channels = input->dims()[1]; - convArgs.image.height = input->dims()[2]; - convArgs.image.width = 
input->dims()[3]; - convArgs.image.pad_height = param->Paddings()[0]; - convArgs.image.pad_width = param->Paddings()[1]; - convArgs.image.scale_address = input->scale; - convArgs.output.address = (void *)out_ptr; + fpga::WrapperConvArgs convArgs; + convArgs.group_num = (uint32_t)param->Groups(); + convArgs.split_num = (uint32_t)fpga::get_plit_num(filter); + convArgs.filter_num = (uint32_t)filter->dims()[0]; + convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); + + convArgs.concat_arg.image_num = convArgs.split_num; + convArgs.concat_arg.image_out = out_ptr; + convArgs.concat_arg.scale_out = out->scale; + convArgs.concat_arg.height = (uint32_t)filter->dims()[2]; + convArgs.concat_arg.width = (uint32_t)filter->dims()[3]; + + int n = convArgs.split_num; + convArgs.concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *)); + convArgs.concat_arg.scales_in = + (float **)fpga::fpga_malloc(n * sizeof(float *)); + convArgs.concat_arg.channel_num = + (uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t)); + convArgs.concat_arg.image_out = out_ptr; + param->SetFpgaArgs(convArgs); + int element_num = fpga::get_aligned_filter_element_num( + filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); + + for (int i = 0; i < n; i++) { + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = (uint32_t)param->Groups(); + convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; + convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2]; + 
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3]; + convArgs.conv_args[i].image.scale_address = input->scale; + convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0]; + convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1]; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = + (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( + channel - (n - 1) * element_num_per_div) + : element_num_per_div); + + if (n > 1) { + convArgs.conv_args[i].output.scale_address = + (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].output.address = + fpga::fpga_malloc(input->dims()[2] * input->dims()[3] * + convArgs.conv_args[i].filter_num * sizeof(half)); + } + + else { + convArgs.conv_args[i].output.scale_address = out->scale; + convArgs.conv_args[i].output.address = out_ptr; + } + + convArgs.concat_arg.images_in[i] = + (half *)convArgs.conv_args[i].output.address; + convArgs.concat_arg.scales_in[i] = + (float *)convArgs.conv_args[i].sb_address; + convArgs.concat_arg.channel_num[i] = convArgs.conv_args[i].filter_num; + } return true; } diff --git a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp index c5d3c7ae5da758e7ca8ae30bc9c9d7352c007260..c8f7292f89b7a98b290bdccde1139a2df2d10182 100644 --- a/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_bn_relu_kernel.cpp @@ -23,12 +23,12 @@ template <> bool ConvAddBNReluKernel::Init( FusionConvAddBNReluParam *param) { bool relu_enabled = true; - Tensor *input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - Tensor *filter = param->Filter(); - Tensor *out = param->Output(); + auto filter = 
const_cast(param->Filter()); + auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -39,9 +39,9 @@ bool ConvAddBNReluKernel::Init( "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); - Tensor *new_scale = new Tensor(); - Tensor *new_bias = new Tensor(); + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = new_bias->mutable_data({channel}); @@ -67,26 +67,45 @@ bool ConvAddBNReluKernel::Init( fpga::format_ofm(out); auto out_ptr = out->mutable_data(); - fpga::ConvArgs convArgs; - convArgs.relu_enabled = relu_enabled; - convArgs.filter_address = (void *)filter_ptr; - convArgs.filter_num = filter->dims()[0]; - convArgs.group_num = param->Groups(); - convArgs.sb_address = (void *)bs_ptr; - convArgs.kernel.stride_h = param->Strides()[0]; - convArgs.kernel.stride_w = param->Strides()[1]; - convArgs.kernel.height = filter->dims()[2]; - convArgs.kernel.width = filter->dims()[3]; - convArgs.image.address = (void *)input_ptr; - convArgs.image.channels = input->dims()[1]; - convArgs.image.height = input->dims()[2]; - convArgs.image.width = input->dims()[3]; - convArgs.image.pad_height = param->Paddings()[0]; - convArgs.image.pad_width = param->Paddings()[1]; - convArgs.image.scale_address = input->scale; - convArgs.output.address = (void *)out_ptr; + fpga::WrapperConvArgs convArgs; + convArgs.group_num = (uint32_t)param->Groups(); + convArgs.split_num = (uint32_t)fpga::get_plit_num(filter); + convArgs.filter_num = (uint32_t)filter->dims()[0]; + convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + 
convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); + + int element_num = fpga::get_aligned_filter_element_num( + filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); + + int n = convArgs.split_num; + for (int i = 0; i < n; i++) { + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = (uint32_t)param->Groups(); + convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; + convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3]; + convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0]; + convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1]; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = + (uint32_t)(i == n - 1 ? 
fpga::get_aligned_filter_num( + channel - (n - 1) * element_num_per_div) + : element_num_per_div); + convArgs.conv_args[i].output.scale_address = + (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input->scale; + } + return true; return true; } diff --git a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp index 7c2e627ab2b64cec5a66b57f84ca6de448b4c37d..4b0c877376c44fa5079d52eefd33e1025a60f1c5 100644 --- a/src/operators/kernel/fpga/conv_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_add_relu_kernel.cpp @@ -22,17 +22,17 @@ namespace operators { template <> bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { bool relu_enabled = true; - Tensor *input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto input_ptr = input->data(); const Tensor *bias = param->Bias(); auto bias_ptr = bias->data(); - Tensor *filter = param->Filter(); - Tensor *out = param->Output(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0], "Output channel should be equal to bias number"); int channel = out->dims()[1]; - float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = bias_ptr[i]; @@ -49,27 +49,44 @@ bool ConvAddReluKernel::Init(FusionConvAddReluParam *param) { fpga::format_ofm(out); auto out_ptr = out->mutable_data(); - fpga::ConvArgs convArgs; - convArgs.relu_enabled = relu_enabled; - convArgs.filter_address = (void *)filter_ptr; - convArgs.filter_num = filter->dims()[0]; - convArgs.group_num = param->Groups(); - convArgs.sb_address = (void *)bs_ptr; - convArgs.kernel.stride_h = param->Strides()[0]; - convArgs.kernel.stride_w = param->Strides()[1]; - convArgs.kernel.height = filter->dims()[2]; - 
convArgs.kernel.width = filter->dims()[3]; - convArgs.image.address = (void *)input_ptr; - convArgs.image.channels = input->dims()[1]; - convArgs.image.height = input->dims()[2]; - convArgs.image.width = input->dims()[3]; - - convArgs.image.pad_height = param->Paddings()[0]; - convArgs.image.pad_width = param->Paddings()[1]; - convArgs.image.scale_address = input->scale; - convArgs.output.address = (void *)out_ptr; + fpga::WrapperConvArgs convArgs; + convArgs.group_num = (uint32_t)param->Groups(); + convArgs.split_num = (uint32_t)fpga::get_plit_num(filter); + convArgs.filter_num = (uint32_t)filter->dims()[0]; + convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); + + int element_num = fpga::get_aligned_filter_element_num( + filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); + + int n = convArgs.split_num; + for (int i = 0; i < n; i++) { + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = (uint32_t)param->Groups(); + convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; + convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3]; + convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0]; + convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1]; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + 
convArgs.conv_args[i].filter_num = + (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( + channel - (n - 1) * element_num_per_div) + : element_num_per_div); + convArgs.conv_args[i].output.scale_address = + (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input->scale; + } return true; } diff --git a/src/operators/kernel/fpga/conv_bn_kernel.cpp b/src/operators/kernel/fpga/conv_bn_kernel.cpp index 9a296244bf1de110b655ae4ebd3bde36cee5fe0d..60e562f4ed9357e1279992cd1691aed516e06138 100644 --- a/src/operators/kernel/fpga/conv_bn_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_kernel.cpp @@ -23,11 +23,10 @@ namespace operators { template <> bool ConvBNKernel::Init(FusionConvBNParam *param) { bool relu_enabled = false; - Tensor *input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto input_ptr = input->data(); - Tensor *filter = param->Filter(); - - Tensor *out = param->Output(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -37,10 +36,10 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - float *bs_ptr = + auto bs_ptr = reinterpret_cast(fpga::fpga_malloc(2 * channel * sizeof(float))); - Tensor *new_scale = new Tensor(); - Tensor *new_bias = new Tensor(); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = new_bias->mutable_data({channel}); @@ -65,27 +64,44 @@ bool ConvBNKernel::Init(FusionConvBNParam *param) { fpga::format_ofm(out); auto out_ptr = out->mutable_data(); - fpga::ConvArgs convArgs; - convArgs.relu_enabled = relu_enabled; - convArgs.filter_address = (void *)filter_ptr; - convArgs.filter_num = filter->dims()[0]; - convArgs.group_num 
= param->Groups(); - convArgs.sb_address = (void *)bs_ptr; - convArgs.kernel.stride_h = param->Strides()[0]; - convArgs.kernel.stride_w = param->Strides()[1]; - convArgs.kernel.height = filter->dims()[2]; - convArgs.kernel.width = filter->dims()[3]; - convArgs.image.address = (void *)input_ptr; - convArgs.image.channels = input->dims()[1]; - convArgs.image.height = input->dims()[2]; - convArgs.image.width = input->dims()[3]; - convArgs.image.pad_height = param->Paddings()[0]; - convArgs.image.pad_width = param->Paddings()[1]; - convArgs.image.scale_address = input->scale; - convArgs.output.address = (void *)out_ptr; + fpga::WrapperConvArgs convArgs; + convArgs.group_num = (uint32_t)param->Groups(); + convArgs.split_num = (uint32_t)fpga::get_plit_num(filter); + convArgs.filter_num = (uint32_t)filter->dims()[0]; + convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); + int element_num = fpga::get_aligned_filter_element_num( + filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); + + int n = convArgs.split_num; + for (int i = 0; i < n; i++) { + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = (uint32_t)param->Groups(); + convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; + convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3]; + convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0]; + convArgs.conv_args[i].image.pad_width = 
(uint32_t)param->Paddings()[1]; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = + (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( + channel - (n - 1) * element_num_per_div) + : element_num_per_div); + convArgs.conv_args[i].output.scale_address = + (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input->scale; + } return true; } diff --git a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp index 68a9202f2b9a8a965277807bbdce48c3d07c2c43..95775f30e667cec5c561e466b22a52d5f1dd44e3 100644 --- a/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp +++ b/src/operators/kernel/fpga/conv_bn_relu_kernel.cpp @@ -22,10 +22,10 @@ namespace operators { template <> bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { bool relu_enabled = true; - Tensor *input = const_cast(param->Input()); + auto input = const_cast(param->Input()); auto input_ptr = input->data(); - Tensor *filter = param->Filter(); - Tensor *out = param->Output(); + auto filter = const_cast(param->Filter()); + auto out = param->Output(); auto bn_mean_ptr = param->InputMean()->data(); auto bn_var_ptr = param->InputVariance()->data(); auto bn_scale_ptr = param->InputScale()->data(); @@ -34,9 +34,9 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0], "Output channel should be equal to bias number"); const int channel = out->dims()[1]; - float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); - Tensor *new_scale = new Tensor(); - Tensor *new_bias = new Tensor(); + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + auto new_scale = new Tensor(); + auto new_bias = new Tensor(); auto new_scale_ptr = new_scale->mutable_data({channel}); auto new_bias_ptr = 
new_bias->mutable_data({channel}); @@ -61,26 +61,44 @@ bool ConvBNReluKernel::Init(FusionConvBNReluParam *param) { fpga::format_ofm(out); auto out_ptr = out->mutable_data(); - fpga::ConvArgs convArgs; - convArgs.relu_enabled = relu_enabled; - convArgs.filter_address = (void *)filter_ptr; - convArgs.filter_num = filter->dims()[0]; - convArgs.group_num = param->Groups(); - convArgs.sb_address = (void *)bs_ptr; - convArgs.kernel.stride_h = param->Strides()[0]; - convArgs.kernel.stride_w = param->Strides()[1]; - convArgs.kernel.height = filter->dims()[2]; - convArgs.kernel.width = filter->dims()[3]; - convArgs.image.address = (void *)input_ptr; - convArgs.image.channels = input->dims()[1]; - convArgs.image.height = input->dims()[2]; - convArgs.image.width = input->dims()[3]; - convArgs.image.pad_height = param->Paddings()[0]; - convArgs.image.pad_width = param->Paddings()[1]; - convArgs.image.scale_address = input->scale; - convArgs.output.address = (void *)out_ptr; + fpga::WrapperConvArgs convArgs; + convArgs.group_num = (uint32_t)param->Groups(); + convArgs.split_num = (uint32_t)fpga::get_plit_num(filter); + convArgs.filter_num = (uint32_t)filter->dims()[0]; + convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); + + int element_num = fpga::get_aligned_filter_element_num( + filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); + + int n = convArgs.split_num; + for (int i = 0; i < n; i++) { + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = (uint32_t)param->Groups(); + convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0]; + convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1]; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + 
convArgs.conv_args[i].image.address = input_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3]; + convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0]; + convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1]; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = + (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( + channel - (n - 1) * element_num_per_div) + : element_num_per_div); + convArgs.conv_args[i].output.scale_address = + (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input->scale; + } return true; } diff --git a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp index 5323796080073dd97b1eb06a0a0c7d8e5d8d824e..9840f495e89a3e63990bf5f10c65cf4afe8d0854 100644 --- a/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp +++ b/src/operators/kernel/fpga/elementwise_add_relu_kernel.cpp @@ -22,9 +22,9 @@ template <> bool ElementwiseAddReluKernel::Init( ElementwiseAddReluParam *param) { bool relu_enabled = true; - Tensor *input_x = const_cast(param->InputX()); - Tensor *input_y = const_cast(param->InputY()); - Tensor *out = param->Out(); + auto *input_x = const_cast(param->InputX()); + auto *input_y = const_cast(param->InputY()); + auto *out = param->Out(); auto input_x_ptr = input_x->data(); auto input_y_ptr = input_y->data(); fpga::format_ofm(out); @@ -34,22 +34,22 @@ bool ElementwiseAddReluKernel::Init( ewaddArgs.relu_enabled = relu_enabled; ewaddArgs.const0 = 1; ewaddArgs.const1 = 1; - ewaddArgs.image0.address = (void *)input_x_ptr; - ewaddArgs.image0.channels = input_x->dims()[1]; + ewaddArgs.image0.address = 
input_x_ptr; + ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1]; ewaddArgs.image0.scale_address = input_x->scale; - ewaddArgs.image0.height = input_x->dims()[2]; - ewaddArgs.image0.width = input_x->dims()[3]; + ewaddArgs.image0.height = (uint32_t)input_x->dims()[2]; + ewaddArgs.image0.width = (uint32_t)input_x->dims()[3]; ewaddArgs.image0.pad_height = 0; ewaddArgs.image0.pad_width = 0; - ewaddArgs.image1.address = (void *)input_y_ptr; - ewaddArgs.image1.channels = input_y->dims()[1]; + ewaddArgs.image1.address = input_y_ptr; + ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1]; ewaddArgs.image1.scale_address = input_y->scale; - ewaddArgs.image1.height = input_y->dims()[2]; - ewaddArgs.image1.width = input_y->dims()[3]; + ewaddArgs.image1.height = (uint32_t)input_y->dims()[2]; + ewaddArgs.image1.width = (uint32_t)input_y->dims()[3]; ewaddArgs.image1.pad_height = 0; ewaddArgs.image1.pad_width = 0; ewaddArgs.output.scale_address = out->scale; - ewaddArgs.output.address = (void *)out_ptr; + ewaddArgs.output.address = out_ptr; param->SetFpgaArgs(ewaddArgs); return true; } diff --git a/src/operators/kernel/fpga/fc_relu_kernel.cpp b/src/operators/kernel/fpga/fc_relu_kernel.cpp index 57db757d734bab9fceb1af2845936170b41d185c..75e680199156b5e315e0d59f4010e21e0b23907a 100644 --- a/src/operators/kernel/fpga/fc_relu_kernel.cpp +++ b/src/operators/kernel/fpga/fc_relu_kernel.cpp @@ -14,71 +14,84 @@ limitations under the License. 
*/ #ifdef FUSION_FCRELU_OP #include "operators/kernel/fc_relu_kernel.h" -#include "fpga/api.h" - namespace paddle_mobile { namespace operators { template <> bool FusionFcReluKernel::Init(FusionFcReluParam *param) { bool relu_enabled = true; - Tensor *input_x = const_cast(param->InputX()); + auto input_x = const_cast(param->InputX()); auto input_x_ptr = input_x->data(); - Tensor *input_y = param->InputY(); - const Tensor *input_z = param->InputZ(); + auto filter = const_cast(param->InputY()); + auto input_z = param->InputZ(); auto input_z_ptr = input_z->data(); - Tensor *out = param->Out(); - - PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0], + auto out = param->Out(); + PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], "Image channel should be equal to weight number"); - int channel = out->dims()[1]; - float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + int channel = (uint32_t)out->dims()[1]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; bs_ptr[i] = input_z_ptr[i]; } - int num = input_y->dims()[1]; - int chw = input_y->dims()[0]; + int num = (uint32_t)filter->dims()[1]; + int chw = (uint32_t)filter->dims()[0]; PADDLE_MOBILE_ENFORCE( chw == input_x->numel(), "Filter element num should be equal to IFM element num"); - int height = input_x->dims()[2]; - int width = input_x->dims()[3]; + int height = (uint32_t)input_x->dims()[2]; + int width = (uint32_t)input_x->dims()[3]; int filter_channel = chw / height / width; - input_y->Resize(framework::make_ddim({num, filter_channel, height, width})); - float max_value = fpga::filter_find_max(input_y); - fpga::format_filter(input_y, max_value, 1); - auto input_y_ptr = input_y->data(); + filter->Resize(framework::make_ddim({num, filter_channel, height, width})); + float max_value = fpga::filter_find_max(filter); + fpga::format_filter(filter, max_value, 1); + auto filter_ptr = filter->data(); 
- int element_num_per_div = fpga::get_element_num_per_div(input_y, 1); + int element_num_per_div = fpga::get_element_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); - fpga::format_ofm(out); auto out_ptr = out->mutable_data(); - fpga::ConvArgs convArgs; - convArgs.relu_enabled = relu_enabled; - convArgs.filter_address = (void *)input_y_ptr; - convArgs.filter_num = out->dims()[1]; + fpga::WrapperConvArgs convArgs; convArgs.group_num = 1; - convArgs.sb_address = (void *)bs_ptr; - convArgs.kernel.stride_w = 1; - convArgs.kernel.stride_h = 1; - convArgs.kernel.height = input_x->dims()[2]; - convArgs.kernel.width = input_x->dims()[3]; - convArgs.image.address = (void *)input_x_ptr; - convArgs.image.channels = input_x->dims()[1]; - convArgs.image.height = input_x->dims()[2]; - convArgs.image.width = input_x->dims()[3]; - convArgs.image.pad_height = 0; - convArgs.image.pad_width = 0; - convArgs.image.scale_address = input_x->scale; - convArgs.output.address = (void *)out_ptr; + convArgs.split_num = (uint32_t)fpga::get_plit_num(filter); + convArgs.filter_num = (uint32_t)filter->dims()[0]; + convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); + int element_num = fpga::get_aligned_filter_element_num( + filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); + + int n = convArgs.split_num; + for (int i = 0; i < n; i++) { + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = 1; + convArgs.conv_args[i].kernel.stride_h = 1; + convArgs.conv_args[i].kernel.stride_w = 1; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_x_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1]; + 
convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3]; + convArgs.conv_args[i].image.pad_height = 0; + convArgs.conv_args[i].image.pad_width = 0; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = + (uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num( + channel - (n - 1) * element_num_per_div) + : element_num_per_div); + convArgs.conv_args[i].output.scale_address = + (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input_x->scale; + } return true; } template <> diff --git a/src/operators/kernel/fpga/fusion_fc_kernel.cpp b/src/operators/kernel/fpga/fusion_fc_kernel.cpp index 3254b8cf5dd84ab339a373233278fae4101a15cf..3fe0457dffdea83ef59738a2c436bf558ab9635f 100644 --- a/src/operators/kernel/fpga/fusion_fc_kernel.cpp +++ b/src/operators/kernel/fpga/fusion_fc_kernel.cpp @@ -21,58 +21,78 @@ namespace operators { template <> bool FusionFcKernel::Init(FusionFcParam *param) { bool relu_enabled = false; - Tensor *input_x = const_cast(param->InputX()); + auto input_x = const_cast(param->InputX()); auto input_x_ptr = input_x->data(); - Tensor *input_y = param->InputY(); + auto filter = const_cast(param->InputY()); const Tensor *input_z = param->InputZ(); auto input_z_ptr = input_z->data(); - Tensor *out = param->Out(); + auto out = param->Out(); - PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == input_y->dims()[0], + PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0], "Image channel should be equal to weight number"); - int channel = out->dims()[1]; - float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); + int channel = (uint32_t)out->dims()[1]; + auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float)); for (int i = 0; i < channel; i++) { bs_ptr[i + channel] = 1; 
bs_ptr[i] = input_z_ptr[i]; } - - int num = input_y->dims()[1]; - int chw = input_y->dims()[0]; + int num = (uint32_t)filter->dims()[1]; + int chw = (uint32_t)filter->dims()[0]; PADDLE_MOBILE_ENFORCE( chw == input_x->numel(), "Filter element num should be equal to IFM element num"); - int height = input_x->dims()[2]; - int width = input_x->dims()[3]; + int height = (uint32_t)input_x->dims()[2]; + int width = (uint32_t)input_x->dims()[3]; int filter_channel = chw / height / width; - input_y->Resize(framework::make_ddim({num, filter_channel, height, width})); - float max_value = fpga::filter_find_max(input_y); - fpga::format_filter(input_y, max_value, 1); - auto input_y_ptr = input_y->data(); - int element_num_per_div = fpga::get_element_num_per_div(input_y, 1); + + filter->Resize(framework::make_ddim({num, filter_channel, height, width})); + float max_value = fpga::filter_find_max(filter); + fpga::format_filter(filter, max_value, 1); + auto filter_ptr = filter->data(); + + int element_num_per_div = fpga::get_element_num_per_div(filter, 1); fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel); + auto out_ptr = out->mutable_data(); - fpga::ConvArgs convArgs; - convArgs.relu_enabled = relu_enabled; - convArgs.filter_address = (void *)input_y_ptr; - convArgs.filter_num = out->dims()[1]; + fpga::WrapperConvArgs convArgs; convArgs.group_num = 1; - convArgs.sb_address = (void *)bs_ptr; - convArgs.kernel.stride_w = 1; - convArgs.kernel.stride_h = 1; - convArgs.kernel.height = input_x->dims()[2]; - convArgs.kernel.width = input_x->dims()[3]; - convArgs.image.address = (void *)input_x_ptr; - convArgs.image.channels = input_x->dims()[1]; - convArgs.image.height = input_x->dims()[2]; - convArgs.image.width = input_x->dims()[3]; - convArgs.image.pad_height = 0; - convArgs.image.pad_width = 0; - convArgs.image.scale_address = input_x->scale; - convArgs.output.address = (void *)out_ptr; + convArgs.split_num = (uint32_t)fpga::get_plit_num(filter); + 
convArgs.filter_num = (uint32_t)filter->dims()[0]; + convArgs.output.address = out_ptr; convArgs.output.scale_address = out->scale; + convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc( + convArgs.split_num * sizeof(fpga::ConvArgs)); param->SetFpgaArgs(convArgs); + + int element_num = fpga::get_aligned_filter_element_num( + filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); + + int n = convArgs.split_num; + for (int i = 0; i < n; i++) { + convArgs.conv_args[i].relu_enabled = relu_enabled; + convArgs.conv_args[i].group_num = 1; + convArgs.conv_args[i].kernel.stride_h = 1; + convArgs.conv_args[i].kernel.stride_w = 1; + convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2]; + convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3]; + convArgs.conv_args[i].image.address = input_x_ptr; + convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1]; + convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2]; + convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3]; + convArgs.conv_args[i].image.pad_height = 0; + convArgs.conv_args[i].image.pad_width = 0; + convArgs.conv_args[i].filter_address = + &((int8_t *)filter_ptr)[i * element_num]; + convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num]; + convArgs.conv_args[i].filter_num = + (uint32_t)(i == n - 1 ? 
fpga::get_aligned_filter_num( + channel - (n - 1) * element_num_per_div) + : element_num_per_div); + convArgs.conv_args[i].output.scale_address = + (float *)fpga::fpga_malloc(2 * sizeof(float)); + convArgs.conv_args[i].image.scale_address = input_x->scale; + } return true; } diff --git a/src/operators/kernel/fpga/pool_kernel.cpp b/src/operators/kernel/fpga/pool_kernel.cpp index e8c40086c459998eab0e1997125dc9c64574c8f2..d3df951dbc340814d766f76e8720c3aaef2f3539 100644 --- a/src/operators/kernel/fpga/pool_kernel.cpp +++ b/src/operators/kernel/fpga/pool_kernel.cpp @@ -21,7 +21,7 @@ namespace operators { template <> bool PoolKernel::Init(PoolParam *param) { - Tensor *input = const_cast(param->Input()); + auto *input = const_cast(param->Input()); auto input_ptr = input->data(); Tensor *output = param->Output(); fpga::format_ofm(output); @@ -31,19 +31,19 @@ bool PoolKernel::Init(PoolParam *param) { vector paddings = param->Paddings(); fpga::PoolingArgs poolArgs; - poolArgs.image.address = (void *)input_ptr; - poolArgs.image.channels = input->dims()[1]; - poolArgs.image.height = input->dims()[2]; - poolArgs.image.width = input->dims()[3]; - poolArgs.image.pad_height = paddings[0]; - poolArgs.image.pad_width = paddings[1]; + poolArgs.image.address = input_ptr; + poolArgs.image.channels = (uint32_t)input->dims()[1]; + poolArgs.image.height = (uint32_t)input->dims()[2]; + poolArgs.image.width = (uint32_t)input->dims()[3]; + poolArgs.image.pad_height = (uint32_t)paddings[0]; + poolArgs.image.pad_width = (uint32_t)paddings[1]; poolArgs.image.scale_address = input->scale; poolArgs.output.address = output_ptr; poolArgs.output.scale_address = input->scale; - poolArgs.kernel.height = ksize[0]; - poolArgs.kernel.width = ksize[1]; - poolArgs.kernel.stride_h = strides[0]; - poolArgs.kernel.stride_w = strides[1]; + poolArgs.kernel.height = (uint32_t)ksize[0]; + poolArgs.kernel.width = (uint32_t)ksize[1]; + poolArgs.kernel.stride_h = (uint32_t)strides[0]; + poolArgs.kernel.stride_w = 
(uint32_t)strides[1]; param->SetFpgaArgs(poolArgs); return true; } diff --git a/src/operators/kernel/fpga/softmax_kernel.cpp b/src/operators/kernel/fpga/softmax_kernel.cpp index d8159acf1ca0420db8b26656571826be30538e80..20c86a5c73bc9c35b8f8fd430013bb97d269fb4a 100644 --- a/src/operators/kernel/fpga/softmax_kernel.cpp +++ b/src/operators/kernel/fpga/softmax_kernel.cpp @@ -33,8 +33,8 @@ bool SoftmaxKernel::Init(SoftmaxParam *param) { args.convert_type = fpga::DATA_FP16_TO_FP32; args.layout_type = fpga::LAYOUT_NO_CONVERT; args.image.address = (void *)(input_ptr); - args.image.height = input->dims()[0]; - args.image.width = input->dims()[1]; + args.image.height = (uint32_t)input->dims()[0]; + args.image.width = (uint32_t)input->dims()[1]; args.image.channels = 1; args.output.address = output_ptr; param->SetFpgaArgs(args); diff --git a/src/operators/math/gemm.cpp b/src/operators/math/gemm.cpp index e5dc220f4c50922c2918251720619de8b4df7b98..e3966d3290fac1d736bfa778635e2f943dfd9398 100644 --- a/src/operators/math/gemm.cpp +++ b/src/operators/math/gemm.cpp @@ -13,7 +13,7 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "operators/math/gemm.h" -#include +#include #include "common/log.h" #include "memory/t_malloc.h" #if __ARM_NEON @@ -2985,6 +2985,8 @@ void WriteWithBn(int mc, int nc, float *c, float *C, int ldc, float *new_scale, void WriteWithBnRelu(int mc, int nc, float *c, float *C, int ldc, float *new_scale, float *new_bias) {} +void WriteWithBnAddRelu(int mc, int nc, float *c, float *C, int ldc, + float *new_scale, float *new_bias, float *bias1) {} #endif // __ARM_NEON diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 91a3941880a8ecb4d11d4589ae860c185d5ed37a..1728e6a6cc778ec223c3f14c971404ba3a5cc0f7 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -489,6 +489,15 @@ class ConcatParam : public OpParam { vector inputs_; GType *out_; int axis_; +#ifdef PADDLE_MOBILE_FPGA + + private: + fpga::ConcatArgs fpga_concat_args; + + public: + const fpga::ConcatArgs &FpgaArgs() const { return fpga_concat_args; } + void SetFpgaArgs(const fpga::ConcatArgs &args) { fpga_concat_args = args; } +#endif }; #endif @@ -1238,11 +1247,7 @@ class FusionFcParam : public OpParam { } const GType *InputX() const { return input_x_; } -#ifdef PADDLE_MOBILE_FPGA - RType *InputY() const { return input_y_; } -#else const RType *InputY() const { return input_y_; } -#endif const RType *InputZ() const { return input_z_; } @@ -1265,11 +1270,11 @@ class FusionFcParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::ConvArgs fpga_conv_args; + fpga::WrapperConvArgs fpga_conv_args; public: - const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } #endif }; @@ -1303,11 +1308,7 @@ class FusionConvAddParam : public OpParam { const RType *Input() const { return input_; } -#ifdef PADDLE_MOBILE_FPGA - RType *Filter() 
const { return filter_; } -#else const RType *Filter() const { return filter_; } -#endif RType *Output() const { return output_; } @@ -1332,11 +1333,11 @@ class FusionConvAddParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::ConvArgs fpga_conv_args; + fpga::WrapperConvArgs fpga_conv_args; public: - const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } #endif }; @@ -1385,11 +1386,7 @@ class FusionConvAddPReluParam : public OpParam { const RType *Input() const { return input_; } -#ifdef PADDLE_MOBILE_FPGA - RType *Filter() const { return filter_; } -#else const RType *Filter() const { return filter_; } -#endif RType *Output() const { return output_; } @@ -1416,11 +1413,11 @@ class FusionConvAddPReluParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::ConvArgs fpga_conv_args; + fpga::WrapperConvArgs fpga_conv_args; public: - const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1467,11 +1464,7 @@ class FusionConvAddAddPReluParam : public OpParam { const RType *Input() const { return input_; } -#ifdef PADDLE_MOBILE_FPGA - RType *Filter() const { return filter_; } -#else const RType *Filter() const { return filter_; } -#endif RType *Output() const { return output_; } @@ -1502,11 +1495,11 @@ class FusionConvAddAddPReluParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::ConvArgs fpga_conv_args; + fpga::WrapperConvArgs fpga_conv_args; public: - const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } - void 
SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1544,11 +1537,7 @@ class FusionConvAddBNReluParam : public OpParam { const RType *Input() const { return input_; } -#ifdef PADDLE_MOBILE_FPGA - RType *Filter() const { return filter_; } -#else const RType *Filter() const { return filter_; } -#endif RType *Output() const { return output_; } @@ -1604,11 +1593,11 @@ class FusionConvAddBNReluParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::ConvArgs fpga_conv_args; + fpga::WrapperConvArgs fpga_conv_args; public: - const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1654,11 +1643,7 @@ class FusionConvBNAddReluParam : public OpParam { const RType *Input() const { return input_; } -#ifdef PADDLE_MOBILE_FPGA - RType *Filter() const { return filter_; } -#else const RType *Filter() const { return filter_; } -#endif RType *Output() const { return output_; } @@ -1717,11 +1702,11 @@ class FusionConvBNAddReluParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::ConvArgs fpga_conv_args; + fpga::WrapperConvArgs fpga_conv_args; public: - const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1754,11 +1739,8 @@ class FusionConvBNParam : public OpParam { const RType *Input() const { return input_; } -#ifdef PADDLE_MOBILE_FPGA - RType *Filter() 
const { return filter_; } -#else const RType *Filter() const { return filter_; } -#endif + RType *Output() const { return output_y_; } const vector &Strides() const { return strides_; } @@ -1811,11 +1793,11 @@ class FusionConvBNParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::ConvArgs fpga_conv_args; + fpga::WrapperConvArgs fpga_conv_args; public: - const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -1853,11 +1835,8 @@ class FusionConvAddBNParam : public OpParam { const RType *Input() const { return input_; } -#ifdef PADDLE_MOBILE_FPGA - RType *Filter() const { return filter_; } -#else const RType *Filter() const { return filter_; } -#endif + RType *Output() const { return output_y_; } const vector &Strides() const { return strides_; } @@ -1912,11 +1891,11 @@ class FusionConvAddBNParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::ConvArgs fpga_conv_args; + fpga::WrapperConvArgs fpga_conv_args; public: - const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -2033,11 +2012,7 @@ class FusionConvBNReluParam : public OpParam { const RType *Input() const { return input_; } -#ifdef PADDLE_MOBILE_FPGA - RType *Filter() const { return filter_; } -#else const RType *Filter() const { return filter_; } -#endif RType *Output() const { return output_; } @@ -2091,11 +2066,11 @@ class FusionConvBNReluParam : public OpParam { #ifdef PADDLE_MOBILE_FPGA private: - fpga::ConvArgs fpga_conv_args; + fpga::WrapperConvArgs 
fpga_conv_args; public: - const fpga::ConvArgs &FpgaArgs() const { return fpga_conv_args; } - void SetFpgaArgs(const fpga::ConvArgs &args) { fpga_conv_args = args; } + const fpga::WrapperConvArgs &FpgaArgs() const { return fpga_conv_args; } + void SetFpgaArgs(const fpga::WrapperConvArgs &args) { fpga_conv_args = args; } #endif }; #endif @@ -2147,15 +2122,20 @@ class DropoutParam : public OpParam { const AttributeMap &attrs, const Scope &scope) { input_x_ = InputXFrom(inputs, scope); out_ = OutFrom(outputs, scope); + + dropout_prob_ = GetAttr("dropout_prob", attrs); } const RType *InputX() const { return input_x_; } RType *Out() const { return out_; } + float DropoutProb() const { return dropout_prob_; } + private: RType *input_x_; RType *out_; + float dropout_prob_; }; #endif diff --git a/test/CMakeLists.txt b/test/CMakeLists.txt index 4c6cd6f55bbc3db749e4e78de200bbe9b779968a..d9dd2634770fbcfce22f1c35790b0b81ac4fa346 100644 --- a/test/CMakeLists.txt +++ b/test/CMakeLists.txt @@ -208,6 +208,14 @@ else () target_link_libraries(test-gru-op paddle-mobile) # gen test + + ADD_EXECUTABLE(test-inceptionv4 net/test_inceptionv4.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-inceptionv4 paddle-mobile) + + # gen test + ADD_EXECUTABLE(test-alexnet net/test_alexnet.cpp test_helper.h test_include.h executor_for_test.h) + target_link_libraries(test-alexnet paddle-mobile) + ADD_EXECUTABLE(test-googlenetv1 net/test_googlenetv1_combine.cpp test_helper.h test_include.h) target_link_libraries(test-googlenetv1 paddle-mobile) @@ -215,10 +223,13 @@ else () ADD_EXECUTABLE(test-fssd net/test_mobilenet_025_fssd.cpp test_helper.h test_include.h) target_link_libraries(test-fssd paddle-mobile) + #add_library(test-lib-size SHARED common/test_lib_size.h common/test_lib_size.cpp) + + endif() # if(FPGA) diff --git a/test/net/test_alexnet.cpp b/test/net/test_alexnet.cpp new file mode 100644 index 
0000000000000000000000000000000000000000..50053fe82f95177fd786c1c8f8f5c9b7a521b888 --- /dev/null +++ b/test/net/test_alexnet.cpp @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); + auto time1 = time(); + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", + // std::string(g_mobilenet_detect) + "/params", true); + + auto isok = paddle_mobile.Load(g_alexnet, true); + if (isok) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224_banana, &input, dims); + + auto vec_result = paddle_mobile.Predict(input, dims); + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + + // Warm up with ten untimed runs + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + DLOG << vec_result; + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + <<
std::endl; + } + + std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " + "是否存在?" + << std::endl; + return 0; +} diff --git a/test/net/test_inceptionv4.cpp b/test/net/test_inceptionv4.cpp new file mode 100644 index 0000000000000000000000000000000000000000..fbbc9dd39e64f7a8ea745cf7489e46f00ffe1413 --- /dev/null +++ b/test/net/test_inceptionv4.cpp @@ -0,0 +1,59 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#include +#include "../test_helper.h" +#include "../test_include.h" + +int main() { + paddle_mobile::PaddleMobile paddle_mobile; + paddle_mobile.SetThreadNum(4); + auto time1 = time(); + // auto isok = paddle_mobile.Load(std::string(g_mobilenet_detect) + "/model", + // std::string(g_mobilenet_detect) + "/params", true); + + auto isok = paddle_mobile.Load(g_inceptionv4, true); + if (isok) { + auto time2 = time(); + std::cout << "load cost :" << time_diff(time1, time2) << "ms" << std::endl; + + std::vector input; + std::vector dims{1, 3, 224, 224}; + GetInput(g_test_image_1x3x224x224_banana, &input, dims); + + auto vec_result = paddle_mobile.Predict(input, dims); + std::vector::iterator biggest = + std::max_element(std::begin(vec_result), std::end(vec_result)); + std::cout << " Max element is " << *biggest << " at position " + << std::distance(std::begin(vec_result), biggest) << std::endl; + + // Warm up with ten untimed runs + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + }
auto time3 = time(); + for (int i = 0; i < 10; ++i) { + auto vec_result = paddle_mobile.Predict(input, dims); + } + // DLOG << vec_result; + auto time4 = time(); + std::cout << "predict cost :" << time_diff(time3, time4) / 10 << "ms" + << std::endl; + } + + std::cout << "如果结果Nan请查看: test/images/g_test_image_1x3x224x224_banana " + "是否存在?" + << std::endl; + return 0; +} diff --git a/test/test_helper.h b/test/test_helper.h index b6ebdbc04f1212c16a905732084a02f01e540d3c..7581405c3d9f14e7e997e73be91cb624ad6d9798 100644 --- a/test/test_helper.h +++ b/test/test_helper.h @@ -34,6 +34,8 @@ static const char *g_mobilenet_detect = "../models/mobilenet-detect"; static const char *g_squeezenet = "../models/squeezenet"; static const char *g_googlenet = "../models/googlenet"; static const char *g_mobilenet = "../models/mobilenet"; +static const char *g_alexnet = "../models/alexnet"; +static const char *g_inceptionv4 = "../models/inceptionv4"; static const char *g_nlp = "../models/nlp"; static const char *g_resnet_50 = "../models/resnet_50"; static const char *g_resnet = "../models/resnet";