diff --git a/src/common/types.cpp b/src/common/types.cpp index ef2d4ed1fc68bcb96fd1cdea10b654ba3bb05ffd..cf2c4dc87613b4641d7c1126e22d2e4a45ff9594 100644 --- a/src/common/types.cpp +++ b/src/common/types.cpp @@ -72,6 +72,8 @@ const char *G_OP_TYPE_SUM = "sum"; const char *G_OP_TYPE_QUANTIZE = "quantize"; const char *G_OP_TYPE_DEQUANTIZE = "dequantize"; +const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN = "fusion_dequant_add_bn"; +const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU = "fusion_dequant_bn_relu"; const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU = "fusion_dequant_add_bn_relu"; const char *G_OP_TYPE_TANH = "tanh"; @@ -138,6 +140,8 @@ std::unordered_map< {G_OP_TYPE_ELEMENTWISE_MUL, {{"X", "Y"}, {"Out"}}}, {G_OP_TYPE_QUANTIZE, {{"X"}, {"Out", "OutScale"}}}, {G_OP_TYPE_DEQUANTIZE, {{"X", "Scale"}, {"Out"}}}, + {G_OP_TYPE_FUSION_DEQUANT_ADD_BN, {{"X", "Scale"}, {"Y"}}}, + {G_OP_TYPE_FUSION_DEQUANT_BN_RELU, {{"X", "Scale"}, {"Out"}}}, {G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU, {{"X", "Scale"}, {"Out"}}}, {G_OP_TYPE_TANH, {{"X"}, {"Out"}}}, {G_OP_TYPE_FUSION_DECONV_RELU, {{"Input"}, {"Out"}}}, diff --git a/src/common/types.h b/src/common/types.h index b84f802cb81678c76da8ca29ce36f43e13618c23..a63d2efd23ebdef1ebb0b6d40d356c33574b3818 100644 --- a/src/common/types.h +++ b/src/common/types.h @@ -139,6 +139,8 @@ extern const char *G_OP_TYPE_ELEMENTWISE_MUL; extern const char *G_OP_TYPE_QUANTIZE; extern const char *G_OP_TYPE_DEQUANTIZE; +extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN; +extern const char *G_OP_TYPE_FUSION_DEQUANT_BN_RELU; extern const char *G_OP_TYPE_FUSION_DEQUANT_ADD_BN_RELU; extern const char *G_OP_TYPE_TANH; diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 7c1f15f7c90e0b1ebc15a9ec8f3f6333ff173978..ef7d5c13dce50706d69b28830d8c23748266b216 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -196,19 +196,35 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; arg->conv_arg[i].filter_scale_address = filter->scale; - arg->conv_arg[i].filter_address = &( - (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT - arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + // arg->conv_arg[i].filter_address = &( + // (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // + // NOLINT + // arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_arg[i].filter_num = (uint32_t)( i == n - 1 ? 
channel - (n - 1) * filter_num_per_div // NOLINT : filter_num_per_div); + size_t filter_size = + element_num * arg->conv_arg[i].filter_num * sizeof(int8_t); + auto filter_head = + &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; + arg->conv_arg[i].filter_address = fpga_malloc(filter_size); + memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); + fpga_flush(arg->conv_arg[i].filter_address, filter_size); + + size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float); + auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_arg[i].sb_address = fpga_malloc(bs_size); + memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); + fpga_flush(arg->conv_arg[i].sb_address, bs_size); + if (n > 1) { arg->conv_arg[i].output.scale_address = (float *)fpga_malloc(2 * sizeof(float)); // NOLINT arg->conv_arg[i].output.address = - fpga_malloc(input->dims()[2] * - align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num, + fpga_malloc(out->dims()[2] * + align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num, IMAGE_ALIGNMENT) * sizeof(half)); } else { @@ -221,6 +237,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; } + filter->reset_data_ptr(nullptr); + fpga_free(bs_ptr); } } // namespace fpga diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp index 157ac90a60262cadacb648173cbc5ba6c01e674e..d67c9fdc18ac8f715a5389625b6d76b71281e349 100644 --- a/src/fpga/V1/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -137,24 +137,23 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num, int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); int num_per_div_after_alignment = align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - if (num_per_div_after_alignment != num_per_div_before_alignment) { - char *tmp = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - memset(data_tmp, 0, num_element * sizeof(char)); + char *tmp = *data_in; + int div_num = + (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; + int num_element = div_num * num_per_div_after_alignment * align_chw; + char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - for (i = 0; i < div_num; i++) { - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * align_chw * i, - num_per_div_before_alignment * align_chw); - } + memset(data_tmp, 0, num_element * sizeof(char)); - *data_in = data_tmp; - fpga_free(tmp); + for (i = 0; i < div_num; i++) { + memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, + *data_in + num_per_div_before_alignment * align_chw * i, + num_per_div_before_alignment * align_chw); } + + *data_in = data_tmp; + fpga_free(tmp); } void reorder(char **data_in, int num_after_alignment, int chw) { @@ -223,7 +222,10 @@ void format_filter(float **data_in, int num, int channel, int height, int width, char **quantize_data = (char **)data_in; // NOLINT convert_to_hwc(quantize_data, num, channel, height, width); align_element(quantize_data, num, chw); - align_num(quantize_data, num_per_div_before_alignment, num, chw); + if (num_after_alignment != num) { + align_num(quantize_data, num_per_div_before_alignment, num, chw); + } + 
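// Note on the alignment math used here: align_to_x(n, x) rounds n up to the
// next multiple of x, i.e. ((n + x - 1) / x) * x (see fpga_common.h), so with
// an alignment of 32, e.g. 50 filters per division are padded to 64.
// align_num() performs that per-division padding, which is why it is now
// invoked only when num_after_alignment differs from num.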
reorder(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw); fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * @@ -254,15 +256,18 @@ void format_fc_filter(float **data_in, int num, int channel, int height, align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * div_num; + int residual = num % num_per_div_before_alignment; + int num_after_alignment = num_per_div_after_alignment * + ((residual == 0) ? div_num : (div_num - 1)) + + align_to_x(residual, FILTER_NUM_ALIGNMENT); quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_fc_filter(quantize_data, num, chw); align_element(quantize_data, num, chw); - align_num(quantize_data, num_per_div_before_alignment, num, chw); + if (num_after_alignment != num) { + align_num(quantize_data, num_per_div_before_alignment, num, chw); + } reorder(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw); fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index 1f0e5768a7017a4c7f928fea86b8f4ef3cdbae3d..d62f015e66c9bfb7c1ee07c349a307563a4581f2 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -13,16 +13,172 @@ See the License for the specific language governing permissions and limitations under the License. */ #include "fpga/common/pe.h" +#include +#include +#include +#include "common/types.h" #include "fpga/V1/filter.h" #include "fpga/V1/image.h" #include "fpga/common/config.h" #include "fpga/common/driver.h" +using namespace std; +using namespace paddle_mobile::fpga::driver; // NOLINT namespace paddle_mobile { namespace fpga { +#define IMAGE_ALIGN 16 +#define FILTER_ALIGN 16 +#define FILTER_NUM_ALIGN 32 +#define USE_RELU 1 +#define USE_BIAS 2 + +// bypass cmd +#define CMD_FP16_TO_FP16 0 +#define CMD_FP16_TO_FP32 1 +#define CMD_FP32_TO_FP16 2 +#define CMD_FP32_TO_FP32 3 + +// bypass macro +#define SIZE_FP16 2 +#define SIZE_FP32 4 + +#define PE_IRQ_TIMEOUT 1000000 + +/* Interrupt bit-set offset*/ +#define INTERRUPT_RSVD 0x0001 +#define INTERRUPT_BYPASS 0x0002 +#define INTERRUPT_CONV 0x0004 +#define INTERRUPT_POOLING 0x0008 +#define INTERRUPT_EW 0x0010 +//#define INTERRUPT_RESIZE 0x0020 + +/* Register offset */ +#define REG_INTERRUPT 0x000 +#define REG_VERSION 0x008 +#define REG_TEMPERATURE 0x010 +#define REG_FPGA_RESET 0x018 +#define REG_TEST_REGISTER 0x048 +#define REG_HARDWARE_STATUS 0x050 + +#define REG_TIMER_COUNTER 0x070 + +#define REG_SCALE_PARAMETER 0x080 + +#define REG_FLASH_CMD 0x200 +#define REG_FLASH_DATA 0x208 +#define REG_FLASH_CONFIG 0x210 +#define REG_FLASH_STATUS 0x218 +#define REG_SN 0x220 + +//#define REG_READ_SCALE +//#define REG_WRITE_SCALE + +/*bypass*/ +#define REG_CONVERT_CMD 0x400 +#define REG_CONVERT_SRC_ADDR 0x408 +#define REG_CONVERT_DST_ADDR 0x410 +#define REG_CONVERT_LENGTH 0x418 + +/*resize*/ +#define REG_RESIZE_CMD 0x600 +#define REG_RESIZE_CHANNEL_NUMBER 0x608 +#define REG_RESIZE_INPUT_IMAGE_PIXEL 0x610 +#define REG_RESIZE_OUTPUT_IMAGE_PIXEL 0x618 +#define REG_RESIZE_INPUT_BASE_ADDR 0x620 +#define REG_RESIZE_WEIGHT_BASE_ADDR 0x628 +#define REG_RESIZE_SRC_POS_BASE_ADDR 0x630 +#define REG_RESIZE_OUTPUT_BASE_ADDR 0x638 + +/*pooling*/ +#define REG_POOLING_CMD 0x800 +#define REG_POOLING_IMAGE_BASE_ADDR 0x808 +#define REG_POOLING_RESULT_BASE_ADDR 0x810 +#define 
REG_POOLING_IMAGE_PIXEL 0x818 +#define REG_POOLING_WINDOW_SIZE 0x820 +#define REG_POOLING_RESULT_PIXEL 0x828 +#define REG_POOLING_PAD_PIXEL 0x830 +#define REG_POOLING_STEP_PIXEL 0x838 +#define REG_POOLING_CHANNEL_NUMBER 0x840 +#define REG_POOLING_IMAGE_AMOUNT_PER_ROW 0x848 +#define REG_POOLING_IMAGE_ONE_PAD_PER_ROW 0x850 +#define REG_POOLING_IMAGE_TWO_PAD_PER_ROW 0x858 +#define REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT 0x860 +#define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 +#define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880 +#define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 +#define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 +#define REG_POOLING_MODE_RECIPROCAL 0x890 + +/*conv*/ +#define REG_CONV_CMD 0xC00 +#define REG_CONV_IMAGE_BASE_ADDR 0xC08 +#define REG_CONV_FILTER_BASE_ADDR 0xC10 +#define REG_CONV_SB_BASE_ADDR 0xC18 +#define REG_CONV_RESULT_BASE_ADDR 0xC20 +#define REG_CONV_IMAGE_PIXEL 0xC28 +#define REG_CONV_FILTER_PIXEL 0xC30 +#define REG_CONV_RESULT_PIXEL 0xC38 +#define REG_CONV_PAD_PIXEL 0xC40 +#define REG_CONV_STEP_PIXEL 0xC48 +#define REG_CONV_GROUP_NUMBER 0xC50 +#define REG_CONV_FILTER_NUMBER 0xC58 +#define REG_CONV_CHANNEL_NUMBER 0xC60 +#define REG_CONV_FILTER_PER_GROUP 0xC68 +#define REG_CONV_CHANNEL_PER_GROUP 0xC70 +#define REG_CONV_IMAGE_AMOUNT_PER_ROW 0xC78 +#define REG_CONV_IMAGE_ONE_PAD_PER_ROW 0xC80 +#define REG_CONV_IMAGE_TWO_PAD_PER_ROW 0xC88 +#define REG_CONV_FILTER_AMOUNT_ALL 0xC90 +#define REG_CONV_RESULT_AMOUNT_PER_ROW 0xC98 +#define REG_CONV_RESULT_LAST_VALID 0xCA0 + +#define REG_CONV_BLOCK_AMOUNT_PER_ROW 0xCA8 +#define REG_CONV_FILTER_PAD_WIDTH_MUL_CH 0xCB0 +#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN_F 0xCB8 +#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN 0xCC0 +#define REG_CONV_IMAGE_BLOCK_NUM 0xCC8 +#define REG_CONV_IMAGE_BLOCK_LEN 0xCD0 +#define REG_CONV_IMAGE_BLOCK_LEN_LAST 0xCD8 +#define REG_CONV_IMAGE_WIN_CNT 0xCE0 +#define REG_CONV_IMAGE_WIN_CNT_LAST 0xCE8 +#define REG_CONV_RES_ROW_DATA_ALIGN4_PAD 0xCF8 +#define REG_CONV_PROG_FULL_CNT 0xD08 +#define REG_CONV_POST_PROG_FULL_CNT 0xD10 +#define REG_CONV_FPGA_BIAS_SCALE_LEN 0xD20 + +#define REG_CONV_IMAGE_SCALE 0xD28 +#define REG_CONV_FILTER_SCALE 0xD30 + +/*ew*/ +#define REG_EW_CMD 0x0F00 +#define REG_EW_IMAGE0_BASE_ADDR 0x0F08 +#define REG_EW_IMAGE1_BASE_ADDR 0x0F10 +#define REG_EW_RESULT_BASE_ADDR 0x0F18 +#define REG_EW_DATA_LEN 0x0F20 +#define REG_EW_COEFFICIENT 0x0F28 +#define REG_EW_IMAGE_PIXEL 0x0F30 +#define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38 + int ComputeFpgaConv(const struct SplitConvArgs &args) { - ComputeBasicConv(args.conv_arg[0]); +// ComputeBasicConv(args.conv_arg[0]); +#ifdef FPGA_PRINT_MODE + DLOG << "=============ComputeFPGAConv==========="; + DLOG << " filter_num:" << args.filter_num + << " group_num:" << args.group_num + << " split_num:" << args.split_num; +#endif + + int split_num = args.split_num; + for (int i = 0; i < split_num; i++) { + ComputeBasicConv(args.conv_arg[i]); + } + + if (split_num > 1) { + ComputeFPGAConcat(args.concat_arg); + } } int ComputeBasicConv(const struct ConvArgs &args) { @@ -47,9 +203,237 @@ int ComputeBasicConv(const struct ConvArgs &args) { DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; #endif + cout << " relu_enabled:" << args.relu_enabled + << " sb_address:" << args.sb_address + << " filter_address:" << args.filter_address + << " filter_num:" << args.filter_num + << " group_num:" << args.group_num; + cout 
<< " image_address:" << args.image.address + << " image_scale_address:" << args.image.scale_address + << " image_channels:" << args.image.channels + << " image_height:" << args.image.height + << " image_width:" << args.image.width + << " pad_height:" << args.image.pad_height + << " pad_width:" << args.image.pad_width; + cout << " kernel_height:" << args.kernel.height + << " kernel_width:" << args.kernel.width + << " stride_h:" << args.kernel.stride_h + << " stride_w:" << args.kernel.stride_w; + cout << " out_address:" << args.output.address + << " out_scale_address:" << args.output.scale_address; -#ifndef PADDLE_MOBILE_ZU5 - return 0; +#ifdef PADDLE_MOBILE_ZU5 + DLOG << "Conv"; + // return 0; + uint64_t timer_cnt; + uint64_t output_scale; + uint64_t image_scale; + uint64_t filter_scale; + uint64_t image_address_phy = 0; + uint64_t sb_address_phy = 0; + uint64_t filter_address_phy = 0; + uint64_t output_address_phy = 0; + int ret = 0; + + fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float)); + fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float)); + + cout << "image_scale :" << hex << (image_scale) << endl; + cout << "filter_scale :" << hex << (filter_scale) << endl; + + uint64_t filterlen = (uint64_t)args.kernel.width * + (uint64_t)args.kernel.height * + (uint64_t)args.image.channels; + filterlen = align_to_x(filterlen, FILTER_ALIGN); + filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGN); + uint64_t fpga_bias_scale_len = + align_to_x(args.filter_num / args.group_num, 8) * args.group_num; + + uint64_t output_height = + (args.image.height + args.image.pad_height * 2 - args.kernel.height) / + args.kernel.stride_h + + 1; + uint64_t output_width = + (args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + + 1; + uint64_t output_size = + output_height * output_width * (uint64_t)args.filter_num; + + uint64_t filter_per_group = (uint64_t)(args.filter_num / args.group_num); + uint64_t channel_per_group = (uint64_t)(args.image.channels / args.group_num); + + uint64_t image_row_count = ((uint64_t)args.image.width) * + ((uint64_t)args.image.channels); // without align + uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGN); + uint64_t image_one_pad_per_row = + align_to_x(image_row_count, IMAGE_ALIGN) + + ((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels); + uint64_t filter_amount_all = + align_to_x(((uint64_t)args.kernel.height) * + ((uint64_t)args.kernel.width) * channel_per_group, + FILTER_ALIGN); + + uint64_t output_amount_per_row = + align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGN); + + // find the opt partition strategy + uint64_t res_win; + uint64_t res_fit = 0; + for (res_win = 1; res_win <= output_width; res_win = res_win + 1) { + if ((align_to_x( + (args.image.channels * + (args.kernel.width + (res_win - 1) * args.kernel.stride_w)), + IMAGE_ALIGN) / + 16 + + 1) * + args.kernel.height > + 2048) { + break; + } + } + + if (res_win != output_width) { + res_win -= 1; + } + + if (((res_win % 2) != 0) && (res_win != 1)) { + res_win = res_win - 1; + } + res_fit = res_win; + + uint64_t block_num = (output_width + res_fit - 1) / res_fit; + uint64_t block_len = res_fit; + uint64_t block_last = output_width - res_fit * (block_num - 1); + + uint64_t res_amount_per_row = output_width * args.filter_num; + uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; + + uint64_t image_block_amount_per_row = + args.kernel.stride_w * 
(res_fit)*args.image.channels; + uint64_t filter_pad_width_mul_channel = + args.image.pad_width * args.image.channels; + uint64_t image_amount_per_row_multi_win_first = + image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height); + uint64_t image_amount_per_row_multi_win = + image_amount_per_row * (4 * args.kernel.stride_h); + + uint64_t image_block_num = block_num; + uint64_t image_block_len = + align_to_x((args.image.channels * + (args.kernel.width + (block_len - 1) * args.kernel.stride_w)), + IMAGE_ALIGN) / + 16 + + 1; + uint64_t image_block_len_last = + align_to_x( + (args.image.channels * + (args.kernel.width + (block_last - 1) * args.kernel.stride_w)), + IMAGE_ALIGN) / + 16 + + 1; + uint64_t image_win_cnt = block_len; + uint64_t image_win_cnt_last = block_last; + uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8; + uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1; + if (prog_full_cnt == 1023) { + prog_full_cnt--; + } + uint64_t post_prog_full_cnt = + (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) + ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) + : 0; + + image_address_phy = vaddr_to_paddr(args.image.address); + sb_address_phy = vaddr_to_paddr(args.sb_address); + filter_address_phy = vaddr_to_paddr(args.filter_address); + output_address_phy = vaddr_to_paddr(args.output.address); + + /*SDK刷Cache保证数据一致性*/ + uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; + + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { + ret = -EIO; + DLOG << "Conv Status Error!"; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } + + /*restart scale*/ + reg_writeq(output_scale, REG_SCALE_PARAMETER); + + reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR); + reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR); + reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR); + reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR); + + reg_writeq( + ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), + REG_CONV_IMAGE_PIXEL); + reg_writeq( + ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), + REG_CONV_FILTER_PIXEL); + reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL); + reg_writeq(((uint64_t)args.image.pad_height) | + (((uint64_t)args.image.pad_width) << 32), + REG_CONV_PAD_PIXEL); + reg_writeq(((uint64_t)args.kernel.stride_h) | + (((uint64_t)args.kernel.stride_w) << 32), + REG_CONV_STEP_PIXEL); + + reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER); + reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER); + reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER); + + reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP); + reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP); + + reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW); + reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW); + reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL); + reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW); + + reg_writeq(image_block_amount_per_row, 0xca8); + reg_writeq(filter_pad_width_mul_channel, 0xcb0); + reg_writeq(image_amount_per_row_multi_win_first, 0xcb8); + reg_writeq(image_amount_per_row_multi_win, 0xcc0); + reg_writeq(image_block_num, 0xcc8); + reg_writeq(image_block_len, 0xcd0); + reg_writeq(image_block_len_last, 0xcd8); + reg_writeq(image_win_cnt, 0xce0); + reg_writeq(image_win_cnt_last, 0xce8); + 
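// The raw offsets used in these reg_writeq calls map to the register macros
// defined at the top of this file: 0xca8-0xce8 are the REG_CONV_BLOCK_* /
// REG_CONV_IMAGE_* registers; 0xcf8, 0xd08, 0xd10 and 0xd20 are
// REG_CONV_RES_ROW_DATA_ALIGN4_PAD, REG_CONV_PROG_FULL_CNT,
// REG_CONV_POST_PROG_FULL_CNT and REG_CONV_FPGA_BIAS_SCALE_LEN.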
reg_writeq(res_row_data_align4_pad, 0xcf8); + reg_writeq(prog_full_cnt, 0xd08); + reg_writeq(post_prog_full_cnt, 0xd10); + reg_writeq(fpga_bias_scale_len / 4, 0xd20); + + /*write scale*/ + reg_writeq(image_scale, REG_CONV_IMAGE_SCALE); + reg_writeq(filter_scale, REG_CONV_FILTER_SCALE); + + reg_writeq(cmd, REG_CONV_CMD); + + DLOG << "before reg poll"; + if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { + g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; + ret = -EIO; + DLOG << "Conv Wait Irq Timeout!"; + } + DLOG << "after reg poll"; + usleep(40); + + /*SDK 无效 Cache保证数据一致性*/ + + output_scale = reg_readq(REG_SCALE_PARAMETER); + output_scale = (output_scale << 32) | (output_scale >> 32); + fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + cout << "output_scale :" << hex << (output_scale) << endl; + + //*(args.output.scale_address) = output_scale; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + + return ret; #endif return 0; @@ -74,8 +458,135 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; #endif -#ifndef PADDLE_MOBILE_ZU5 - return 0; +#ifdef PADDLE_MOBILE_ZU5 + DLOG << "Polling"; + // return 0; + uint64_t output_scale = 0; + uint64_t timer_cnt = 0; + int ret = 0; + uint64_t cmd = 0; + + uint64_t image_physical_address = 0; + uint64_t output_physical_address = 0; + + image_physical_address = vaddr_to_paddr(args.image.address); + output_physical_address = vaddr_to_paddr(args.output.address); + + uint32_t output_height = (uint32_t)( + (args.image.height + args.image.pad_height * 2 - args.kernel.height) / + args.kernel.stride_h + + 1); + uint32_t output_width = (uint32_t)( + (args.image.width + args.image.pad_width * 2 - args.kernel.width) / + args.kernel.stride_w + + 1); + + uint64_t image_amount_per_row = align_to_x( + (uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGN); + uint64_t image_one_pad_per_row = + align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels, + FILTER_ALIGN) + + (uint64_t)args.image.pad_width * (uint64_t)args.image.channels; + uint64_t image_two_pad_per_row = align_to_x( + ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) * + (uint64_t)args.image.channels, + IMAGE_ALIGN); + uint64_t image_row_mul_pooling_hight = + image_amount_per_row * (uint64_t)args.kernel.height; + uint64_t image_row_mul_pad_hight = + image_amount_per_row * (uint64_t)args.image.pad_height; + uint64_t image_row_mul_step_hight = + image_amount_per_row * (uint64_t)args.kernel.stride_h; + uint64_t result_amount_align_32 = align_to_x( + (uint64_t)output_width * (uint64_t)args.image.channels, FILTER_ALIGN); + uint64_t result_amount_align_64 = align_to_x( + (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGN); + uint64_t image_calcu_height = + (uint64_t)args.kernel.height + + ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h; + + uint64_t image_pad_left = args.image.channels * args.image.pad_width; + uint64_t image_skip_window = args.image.channels * args.kernel.stride_w; + + uint64_t image_padleft_skipwindow = + (image_skip_window << 32) | image_pad_left; + + uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 | + (((uint64_t)args.kernel_reciprocal)); + + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { + ret = -EIO; + DLOG << "Conv Status Error!"; + 
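// Pooling PE is in an error state: release the mutex and return -EIO (the log
// message above still reads "Conv", but this branch guards PE_IDX_POOLING).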
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } + + /*restart scale*/ + reg_writeq(output_scale, REG_SCALE_PARAMETER); + + reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR); + reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR); + + reg_writeq( + ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), + REG_POOLING_IMAGE_PIXEL); + reg_writeq( + ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), + REG_POOLING_WINDOW_SIZE); + + reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32), + REG_POOLING_RESULT_PIXEL); + + reg_writeq(((uint64_t)args.image.pad_height) | + (((uint64_t)args.image.pad_width) << 32), + REG_POOLING_PAD_PIXEL); + reg_writeq(((uint64_t)args.kernel.stride_h) | + (((uint64_t)args.kernel.stride_w) << 32), + REG_POOLING_STEP_PIXEL); + + reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER); + + reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW); + reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW); + reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW); + + reg_writeq(image_row_mul_pooling_hight, + REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT); + reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT); + reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT); + + reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32); + reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64); + + reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT); + + reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW); + reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL); + + /*SDK刷Cache保证数据一致性*/ + + reg_writeq(cmd, REG_POOLING_CMD); + + DLOG << "before reg poll"; + if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { + g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; + ret = -EIO; + DLOG << "Pooling Wait Irq Timeout!"; + } + DLOG << "after reg poll"; + usleep(40); + + /*SDK 无效 Cache保证数据一致性*/ + + // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); + output_scale = reg_readq(REG_SCALE_PARAMETER); + output_scale = (output_scale << 32) | (output_scale >> 32); + fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + + return ret; #endif return 0; } @@ -103,8 +614,73 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; #endif -#ifndef PADDLE_MOBILE_ZU5 - return 0; +#ifdef PADDLE_MOBILE_ZU5 + DLOG << "Conv"; + // return 0; + int ret = 0; + uint64_t output_scale = 0; + uint64_t timer_cnt = 0; + uint64_t image0_address_phy = 0; + uint64_t image1_address_phy = 0; + uint64_t output_address_phy = 0; + + uint64_t cmd = args.relu_enabled ? 
USE_RELU : 0; + uint64_t datalen = (uint64_t)args.image0.width * + (uint64_t)args.image0.height * + (uint64_t)args.image0.channels; + uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; + + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) { + ret = -EIO; + DLOG << "Conv Status Error!"; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } + + image0_address_phy = vaddr_to_paddr(args.image0.address); + image1_address_phy = vaddr_to_paddr(args.image1.address); + output_address_phy = vaddr_to_paddr(args.output.address); + + uint64_t image_amount_per_row = + align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels, + IMAGE_ALIGN); + uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | + ((uint64_t)args.image0.width << 16) | + (uint64_t)args.image0.height; + + /*SDK刷Cache保证数据一致性*/ + + /*restart scale*/ + reg_writeq(output_scale, REG_SCALE_PARAMETER); + + reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); + reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); + reg_writeq(datalen, REG_EW_DATA_LEN); + reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL); + reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); + + reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR); + reg_writeq(coefficient, REG_EW_COEFFICIENT); + + reg_writeq(cmd, REG_EW_CMD); + + if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { + g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; + ret = -EIO; + DLOG << "EW Wait Irq Timeout!"; + } + usleep(40); + + /*SDK 无效 Cache保证数据一致性*/ + output_scale = reg_readq(REG_SCALE_PARAMETER); + output_scale = (output_scale << 32) | (output_scale >> 32); + fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + + //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); + //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; #endif return 0; } @@ -126,8 +702,117 @@ int PerformBypass(const struct BypassArgs &args) { DLOG << " out_address:" << args.output.address << " out_scale_address:" << args.output.scale_address; #endif -#ifndef PADDLE_MOBILE_ZU5 - return 0; +#ifdef PADDLE_MOBILE_ZU5 + DLOG << "Bypass"; + // return 0; + struct fpga_pe *pe; + uint64_t output_scale = 0; + uint64_t timer_cnt = 0; + uint64_t cmd = 0; + uint64_t datalen = 0; + uint64_t input_address_phy = 0; + uint64_t output_address_phy = 0; + uint8_t data_cell_in = 0; + uint8_t data_cell_out = 0; + + int ret = 0; + + datalen = (uint64_t)args.image.width * (uint64_t)args.image.height * + (uint64_t)args.image.channels; + datalen = align_to_x(datalen, 16); + + input_address_phy = vaddr_to_paddr(args.image.address); + output_address_phy = vaddr_to_paddr(args.output.address); + DLOG << "input_phy:" << input_address_phy; + DLOG << "output_phy:" << output_address_phy; + + switch (args.input_data_type) { + case DATA_TYPE_FP16: { + switch (args.output_data_type) { + case DATA_TYPE_FP16: + data_cell_in = SIZE_FP16; + data_cell_out = SIZE_FP16; + cmd = CMD_FP16_TO_FP16; + break; + + case DATA_TYPE_FP32: + data_cell_in = SIZE_FP16; + data_cell_out = SIZE_FP32; + cmd = CMD_FP16_TO_FP32; + break; + + default: + break; + } + } break; + + case DATA_TYPE_FP32: { + switch (args.output_data_type) { + case DATA_TYPE_FP16: + data_cell_in = SIZE_FP32; + data_cell_out = SIZE_FP16; + cmd = CMD_FP32_TO_FP16; + break; + + case DATA_TYPE_FP32: + data_cell_in = SIZE_FP32; + 
data_cell_out = SIZE_FP32; + cmd = CMD_FP32_TO_FP32; + break; + + default: + break; + } + } break; + + default: + break; + } + if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 && + cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32) { + return -EFAULT; + } + if ((data_cell_in != SIZE_FP16 && data_cell_in != SIZE_FP32) || + (data_cell_out != SIZE_FP16 && data_cell_out != SIZE_FP32)) { + return -EFAULT; + } + + pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); + if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status) { + ret = -EIO; + DLOG << "Bypass Status Error!"; + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; + } + + /*restart scale*/ + reg_writeq(output_scale, REG_SCALE_PARAMETER); + + reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR); + reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR); + reg_writeq(datalen, REG_CONVERT_LENGTH); + + /*SDK刷Cache保证数据一致性*/ + reg_writeq(cmd, REG_CONVERT_CMD); + + DLOG << "before reg poll"; + if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) { + g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR; + ret = -EIO; + DLOG << "BYPASS Wait Irq Timeout!"; + } + DLOG << "after reg poll"; + usleep(40); + + /*SDK 无效 Cache保证数据一致性*/ + output_scale = reg_readq(REG_SCALE_PARAMETER); + output_scale = (output_scale << 32) | (output_scale >> 32); + fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); + + //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); + //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); + pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); + return ret; #endif return 0; @@ -138,11 +823,14 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) { DLOG << "=============ComputeFpgaConcat==========="; DLOG << " Image_num: " << args.image_num << " out_address:" << args.image_out - << " out_scale_address:" << args.scale_out; + << " out_scale_address:" << args.scale_out + << " out_channel:" << args.out_channel; DLOG << " image_height:" << args.height << " image_width:" << args.width; for (int i = 0; i < args.image_num; i++) { DLOG << " " << i << "th: "; - DLOG << " channel_num:" << args.channel_num[i] + DLOG << " channel_num:" + << args.channel_num[i] + // << " aligned_channel_num:" << args.aligned_channel_num[i] << " image_address:" << args.images_in[i] << " image_scale_address:" << args.scales_in[i]; } @@ -154,6 +842,82 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) { return 0; } +void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel, + int sub_height, int sub_width, int omit_size) { + int origin_h = sub_height * sub_conv_n; + int origin_w = sub_width * sub_conv_n; + int align_origin_w = align_to_x(origin_w * channel, 16); + int deconv_h = origin_h - 2 * omit_size; + int deconv_w = origin_w - 2 * omit_size; + int deconv_row_len = deconv_w * channel; + int align_deconv_row_len = align_to_x(deconv_row_len, 16); + half *ptr_tmp = *data_in; + half *ptr_deconv = + (half *)fpga_malloc(num * align_deconv_row_len * deconv_h * sizeof(half)); + memset(ptr_deconv, 0, num * align_deconv_row_len * deconv_h * sizeof(half)); + int deconv_idx = 0; + for (int nn = 0; nn < num; ++nn) { + for (int hh = 0; hh < origin_h; ++hh) { + int hx = (hh % sub_conv_n); + half *sub_t = ptr_tmp + hx * sub_height * align_origin_w; // sub(hx,:); + + int hi = (hh / sub_conv_n); + + if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue; + + // for (int ww = 0; ww < origin_w; ++ww){ + + // if((ww < omit_size) )// || (ww >= (origin_w-omit_size)) + // continue; 
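// Copy one deconv output row: sidx (below) skips the first omit_size pixels of
// the selected sub-conv row, deconv_row_len elements are copied, and rows are
// packed into the destination at an align_deconv_row_len stride.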
+ + int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w + + omit_size * channel); + + fpga_copy(ptr_deconv + deconv_idx, sub_t + sidx, + sizeof(half) * deconv_row_len); + deconv_idx += align_deconv_row_len; + //} + } + } + + *data_in = ptr_deconv; + fpga_free(ptr_tmp); +} +int ComputeFpgaDeconv(const struct DeconvArgs &args) { +#ifdef FPGA_TEST_MODE + DLOG << "=============ComputeFPGADeConv==========="; + DLOG << " filter_num:" << args.filter_num + << " group_num:" << args.group_num + << " sub_conv_num:" << args.sub_conv_num; +#endif + + int sub_conv_num = args.sub_conv_num; + + for (int i = 0; i < sub_conv_num; i++) { + //#if CPU_SIMULATE + + //#else + ComputeBasicConv(args.conv_args[i]); + //#endif + } + + if (sub_conv_num > 1) { + float max_scale = -1.0; + for (int i = 0; i < sub_conv_num; i++) { + float ptr_scale = (args.conv_args[i].output.scale_address)[0]; + if (ptr_scale > max_scale) { + args.output.scale_address[0] = ptr_scale; + args.output.scale_address[1] = + (args.conv_args[i].output.scale_address)[1]; + } + } + deconv_post_process((half **)(&(args.output.address)), args.sub_conv_num, 1, + args.filter_num, (args.sub_output_height), + (args.sub_output_width), args.omit_size); + } + return 0; +} + int ComputeFPGASplit(const struct SplitArgs &args) { #ifdef FPGA_PRINT_MODE DLOG << "=============ComputeFpgaSplit==========="; @@ -173,6 +937,5 @@ int ComputeFPGASplit(const struct SplitArgs &args) { args.height, args.width); return 0; } - } // namespace fpga } // namespace paddle_mobile diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp index 8c59ac14fb11282b29a837152194d873bd65d87d..2f592fe45d951230427595f2f8ff5b4a148c0276 100644 --- a/src/fpga/common/driver.cpp +++ b/src/fpga/common/driver.cpp @@ -137,11 +137,13 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { for (i = 0; i < timeout; i++) { if (val == reg_readq(reg)) { + std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg + << std::endl; break; } } - if (i <= timeout) { + if (i < timeout) { return 0; } else { return -1; @@ -153,6 +155,12 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); unsigned int nr = (unsigned int)_nr; int ret = 0; + DLOG << size; + DLOG << _nr; + DLOG << nr; + + uint64_t a_size = FPGA_PAGE_SIZE * nr; + DLOG << a_size; pthread_mutex_lock(&memory->mutex); @@ -166,6 +174,7 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { *addr = address_ofset; } else { + DLOG << "memory request failed!"; ret = -ENOMEM; } @@ -282,7 +291,7 @@ uint64_t vaddr_to_paddr(void *address) { if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { paddr = iter->second; } else { - DLOG << "Invalid pointer"; + DLOG << "Invalid pointer: " << address; } return paddr; @@ -348,6 +357,11 @@ void fpga_free_driver(void *ptr) { fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos, g_fpgainfo.memory_info->nr[pos]); pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex); + + auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr); + if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { + g_fpgainfo.fpga_vaddr2paddr_map.erase(iter); + } } else { DLOG << "Invalid pointer"; } diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h index 2dad07ec5206a7ca64449aa38ebe0603d72b71e3..c204370be7ecd3aca229b2c130ec7861116a3ef7 100644 --- a/src/fpga/common/driver.h +++ b/src/fpga/common/driver.h @@ -17,6 +17,7 @@ limitations under the License. 
*/ #include #include #include +#include #include #include @@ -44,7 +45,7 @@ const int PE_IDX_POOLING = 1; const int PE_IDX_EW = 2; const int PE_IDX_BYPASS = 3; -enum pe_status { IDLE = 0, BUSY = 1 }; +enum pe_status { IDLE = 0, BUSY = 1, ERROR = 2 }; struct MemoryCacheArgs { void *offset; @@ -58,7 +59,7 @@ struct MemoryCacheArgs { struct fpga_pe { char type_name[MAX_TYPE_NAME_LENTH + 1]; struct pe_data_s *outer; - pe_status status; // 0=idle 1=busy -1=fail + pe_status status; uint64_t interrupt_cnt; }; @@ -106,6 +107,8 @@ inline uint64_t reg_readq(uint32_t offset) { uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT offset); // NOLINT + // DLOG << "read end"; + usleep(10); return value; } @@ -114,6 +117,8 @@ inline void reg_writeq(uint64_t value, uint32_t offset) { // DLOG << "offset : " << offset << ", value : " << value; *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT offset) = value; + // DLOG << "write end"; + usleep(10); } int open_device_driver(); diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index b3f619f2f24aba47d99f7f427c4b67af8c0d430d..fdda65afda595d281a6f4db6f8132213f8f8d9e5 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -74,12 +74,21 @@ struct ConcatArgs { void* image_out; float* scale_out; uint32_t* channel_num; - // uint32_t* aligned_channel_num; - // uint32_t out_channel; + uint32_t* aligned_channel_num; + uint32_t out_channel; uint32_t height; uint32_t width; }; +struct SplitConvArgs { + uint32_t split_num; + uint32_t group_num; + uint32_t filter_num; + struct ImageOutputArgs output; + struct ConvArgs* conv_arg; + struct ConcatArgs concat_arg; +}; + struct SplitArgs { uint32_t image_num; int16_t* image_in; @@ -91,15 +100,6 @@ struct SplitArgs { uint32_t width; }; -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_arg; - struct ConcatArgs concat_arg; -}; - struct PoolingArgs { int16_t mode; // mode: 0:max, 1:avg int16_t kernel_reciprocal; @@ -127,7 +127,14 @@ struct BypassArgs { }; struct DeconvArgs { - struct ConvArgs conv_arg; + uint32_t sub_conv_num; + uint32_t group_num; + uint32_t filter_num; + uint32_t omit_size; + uint32_t sub_output_width; + uint32_t sub_output_height; + struct ImageOutputArgs output; + struct ConvArgs* conv_args; }; static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } diff --git a/src/framework/load_ops.h b/src/framework/load_ops.h index 135ef9083e42271fe63cdc29ee53e876f532c287..2534217d58674f912f0e5da741dfcae41827edf1 100644 --- a/src/framework/load_ops.h +++ b/src/framework/load_ops.h @@ -233,6 +233,14 @@ LOAD_OP1(quantize, CPU); #ifdef DEQUANT_OP LOAD_OP1(dequantize, CPU); #endif +#ifdef FUSION_DEQUANT_ADD_BN_OP +LOAD_OP1(fusion_dequant_add_bn, CPU); +LOAD_FUSION_MATCHER(fusion_dequant_add_bn); +#endif +#ifdef FUSION_DEQUANT_BN_RELU_OP +LOAD_OP1(fusion_dequant_bn_relu, CPU); +LOAD_FUSION_MATCHER(fusion_dequant_bn_relu); +#endif #ifdef FUSION_DEQUANT_ADD_BN_RELU_OP LOAD_OP1(fusion_dequant_add_bn_relu, CPU); LOAD_FUSION_MATCHER(fusion_dequant_add_bn_relu); diff --git a/src/io/ios_io/PaddleMobileCPU.mm b/src/io/ios_io/PaddleMobileCPU.mm index 2416c0d4e708813f8abf18c9dcb6e5d8b3c37a90..209022b64e90f700dc83c43d11f6e619c66673b6 100644 --- a/src/io/ios_io/PaddleMobileCPU.mm +++ b/src/io/ios_io/PaddleMobileCPU.mm @@ -95,7 +95,8 @@ static std::mutex shared_mutex; andModelParamsLen:(size_t)combinedParamsLen 
andCombinedParamsBuf:(const uint8_t *)combinedParamsBuf { pam_->SetThreadNum(2); - return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen, combinedParamsBuf); + return loaded_ = pam_->LoadCombinedMemory(modelLen, modelBuf, combinedParamsLen, + const_cast(combinedParamsBuf)); } - (BOOL)load:(NSString *)modelAndWeightPath{ diff --git a/src/operators/depthwise_conv_op.h b/src/operators/depthwise_conv_op.h index 102d65670d3e50acd15745e95b85d7b843994ed7..26253e0e0a7d3c52808a691d4257e7074e1da6e2 100644 --- a/src/operators/depthwise_conv_op.h +++ b/src/operators/depthwise_conv_op.h @@ -18,7 +18,7 @@ limitations under the License. */ #include #include "framework/operator.h" -#include "operators/kernel/depthwise_conv_kernel.h" +#include "operators/kernel/conv_kernel.h" namespace paddle_mobile { namespace operators { @@ -26,19 +26,16 @@ namespace operators { template class DepthwiseConvOp : public framework::OperatorWithKernel< DeviceType, ConvParam, - operators::DepthwiseConvKernel> { + operators::ConvKernel> { public: DepthwiseConvOp(const std::string &type, const VariableNameMap &inputs, const VariableNameMap &outputs, const framework::AttributeMap &attrs, std::shared_ptr scope) - : framework::OperatorWithKernel< - DeviceType, ConvParam, - operators::DepthwiseConvKernel>( + : framework::OperatorWithKernel, + operators::ConvKernel>( type, inputs, outputs, attrs, scope) {} void InferShape() const override; - - private: }; } // namespace operators diff --git a/src/operators/kernel/arm/depthwise_conv_kernel.cpp b/src/operators/fusion_dequant_add_bn_op.cpp similarity index 58% rename from src/operators/kernel/arm/depthwise_conv_kernel.cpp rename to src/operators/fusion_dequant_add_bn_op.cpp index 000d59baa8c804201cbd2e2a731c2077196b698f..4df50af22b0dc9e214b0cabe303bf70edf50c307 100644 --- a/src/operators/kernel/arm/depthwise_conv_kernel.cpp +++ b/src/operators/fusion_dequant_add_bn_op.cpp @@ -12,27 +12,27 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ -#ifdef DEPTHWISECONV_OP +#ifdef FUSION_DEQUANT_ADD_BN_OP -#include "operators/kernel/depthwise_conv_kernel.h" -#include "operators/kernel/central-arm-func/depthwise_conv_arm_func.h" +#include "operators/fusion_dequant_add_bn_op.h" namespace paddle_mobile { namespace operators { -template <> -bool DepthwiseConvKernel::Init(ConvParam *param) { - return true; +template +void FusionDequantAddBNOp::InferShape() const { + const auto& input_dims = this->param_.input_->dims(); + this->param_.output_->Resize(input_dims); } -template <> -void DepthwiseConvKernel::Compute(const ConvParam ¶m) { - DepthwiseConvCompute(param); -} - -template class DepthwiseConvKernel; - } // namespace operators } // namespace paddle_mobile +namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_dequant_add_bn, ops::FusionDequantAddBNMatcher); + +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_dequant_add_bn, ops::FusionDequantAddBNOp); +#endif + #endif diff --git a/src/operators/fusion_dequant_add_bn_op.h b/src/operators/fusion_dequant_add_bn_op.h new file mode 100644 index 0000000000000000000000000000000000000000..8c4f353a81705c41c75a5aff92f2637b92755a2c --- /dev/null +++ b/src/operators/fusion_dequant_add_bn_op.h @@ -0,0 +1,74 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. 
+ +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. */ + +#ifdef FUSION_DEQUANT_ADD_BN_OP + +#pragma once + +#include +#include +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/dequant_add_bn_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +class FusionDequantAddBNMatcher : public framework::FusionOpMatcher { + public: + FusionDequantAddBNMatcher() { + node_ = framework::Node(G_OP_TYPE_DEQUANTIZE); + node_ > std::make_shared(G_OP_TYPE_ELEMENTWISE_ADD) > + std::make_shared(G_OP_TYPE_BATCHNORM); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_ELEMENTWISE_ADD, {{"Y", "Y"}}}, + {G_OP_TYPE_BATCHNORM, + {{"Scale", "BNScale"}, + {"Mean", "BNMean"}, + {"Bias", "BNBias"}, + {"Variance", "BNVariance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_ADD_BN; } +}; + +template +class FusionDequantAddBNOp + : public framework::OperatorWithKernel< + DeviceType, FusionDequantAddBNParam, + operators::FusionDequantAddBNKernel> { + public: + FusionDequantAddBNOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDequantAddBNParam, + operators::FusionDequantAddBNKernel>( + type, inputs, outputs, attrs, scope) {} + // inference output shape + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/fusion_dequant_add_bn_relu_op.h b/src/operators/fusion_dequant_add_bn_relu_op.h index dbd9ad0de2ece751ffd4da05cb09f0091a5755aa..b33d3c210ca56f27b769789fee08023ebb8c80de 100644 --- a/src/operators/fusion_dequant_add_bn_relu_op.h +++ b/src/operators/fusion_dequant_add_bn_relu_op.h @@ -20,7 +20,7 @@ limitations under the License. */ #include #include "framework/operator.h" #include "framework/program/program-optimize/fusion_op_register.h" -#include "operators/kernel/dequant_add_bn_relu_kernel.h" +#include "operators/kernel/dequant_bn_relu_kernel.h" #include "operators/op_param.h" namespace paddle_mobile { diff --git a/src/operators/kernel/depthwise_conv_kernel.h b/src/operators/fusion_dequant_bn_relu_op.cpp similarity index 56% rename from src/operators/kernel/depthwise_conv_kernel.h rename to src/operators/fusion_dequant_bn_relu_op.cpp index 3ee5bf86e97baa3970239e32b7fd5fc341e09f92..c843889a61a128c86915b14b0229ed172df2325b 100644 --- a/src/operators/kernel/depthwise_conv_kernel.h +++ b/src/operators/fusion_dequant_bn_relu_op.cpp @@ -12,29 +12,28 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef DEPTHWISECONV_OP +#ifdef FUSION_DEQUANT_BN_RELU_OP -#pragma once - -#include "framework/operator.h" -#include "operators/math/im2col.h" -#include "operators/math/math_function.h" -#include "operators/math/vol2col.h" -#include "operators/op_param.h" +#include "operators/fusion_dequant_bn_relu_op.h" namespace paddle_mobile { namespace operators { -using framework::OpKernelBase; +template +void FusionDequantBNReluOp::InferShape() const { + const auto& input_dims = this->param_.input_->dims(); + this->param_.output_->Resize(input_dims); +} -template -class DepthwiseConvKernel - : public OpKernelBase> { - public: - void Compute(const ConvParam ¶m); - bool Init(ConvParam *param); -}; } // namespace operators } // namespace paddle_mobile +namespace ops = paddle_mobile::operators; +REGISTER_FUSION_MATCHER(fusion_dequant_bn_relu, + ops::FusionDequantBNReluMatcher); + +#ifdef PADDLE_MOBILE_CPU +REGISTER_OPERATOR_CPU(fusion_dequant_bn_relu, ops::FusionDequantBNReluOp); +#endif + #endif diff --git a/src/operators/fusion_dequant_bn_relu_op.h b/src/operators/fusion_dequant_bn_relu_op.h new file mode 100644 index 0000000000000000000000000000000000000000..b556df1e3707736be0eaf58eb8323cdbb64cbd74 --- /dev/null +++ b/src/operators/fusion_dequant_bn_relu_op.h @@ -0,0 +1,73 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/ + +#ifdef FUSION_DEQUANT_BN_RELU_OP + +#pragma once + +#include +#include +#include "framework/operator.h" +#include "framework/program/program-optimize/fusion_op_register.h" +#include "operators/kernel/dequant_bn_relu_kernel.h" +#include "operators/op_param.h" + +namespace paddle_mobile { +namespace operators { + +class FusionDequantBNReluMatcher : public framework::FusionOpMatcher { + public: + FusionDequantBNReluMatcher() { + node_ = framework::Node(G_OP_TYPE_DEQUANTIZE); + node_ > std::make_shared(G_OP_TYPE_BATCHNORM) > + std::make_shared(G_OP_TYPE_RELU); + } + + void FolderNodes( + framework::Node *node, + std::vector> *removed_nodes) { + node->Folder(node_.Depth(), Type(), + {{G_OP_TYPE_BATCHNORM, + {{"Scale", "BNScale"}, + {"Mean", "BNMean"}, + {"Bias", "BNBias"}, + {"Variance", "BNVariance"}}}}, + removed_nodes); + } + + std::string Type() { return G_OP_TYPE_FUSION_DEQUANT_BN_RELU; } +}; + +template +class FusionDequantBNReluOp + : public framework::OperatorWithKernel< + DeviceType, FusionDequantBNReluParam, + operators::FusionDequantBNReluKernel> { + public: + FusionDequantBNReluOp(const std::string &type, const VariableNameMap &inputs, + const VariableNameMap &outputs, + const framework::AttributeMap &attrs, + std::shared_ptr scope) + : framework::OperatorWithKernel< + DeviceType, FusionDequantBNReluParam, + operators::FusionDequantBNReluKernel>( + type, inputs, outputs, attrs, scope) {} + // inference output shape + void InferShape() const override; +}; + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/arm/conv_kernel.cpp b/src/operators/kernel/arm/conv_kernel.cpp index 840be6c67d2e350c914a7d8aa8e9a32acdd00fb1..5384faf2b8ae0e0fe6aed1b6c0cd7d4d16978ac9 100644 --- a/src/operators/kernel/arm/conv_kernel.cpp +++ b/src/operators/kernel/arm/conv_kernel.cpp @@ -22,41 +22,43 @@ namespace operators { template <> bool ConvKernel::Init(ConvParam *param) { + bool conv3x3 = param->Filter()->dims()[2] == param->Filter()->dims()[3] && + param->Filter()->dims()[2] == 3; + bool depth3x3 = conv3x3 && param->Groups() == param->Input()->dims()[1] && + param->Input()->dims()[1] == param->Output()->dims()[1]; if (param->Filter()->type() == typeid(int8_t)) { - if (param->Groups() == param->Input()->dims()[1] && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Filter()->dims()[2] == 3 && param->Strides()[0] < 3 && + if (depth3x3 && param->Strides()[0] < 3 && param->Strides()[0] == param->Strides()[1]) { param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_INT8; } else { param->ExecMode() = ConvParam::EXEC_GEMM_INT8; } } else { - if (param->Groups() == param->Input()->dims()[1] && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Filter()->dims()[2] == 3 && param->Strides()[0] == 1) { + if (depth3x3 && param->Strides()[0] == param->Strides()[1] && + param->Strides()[0] == 1 && param->Paddings()[0] == 1 && + param->Paddings()[0] == param->Paddings()[1]) { param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S1P1_FLOAT; - } else if (param->Groups() == param->Input()->dims()[1] && - param->Input()->dims()[1] == param->Output()->dims()[1] && - param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Filter()->dims()[2] == 3) { - param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3_FLOAT; + } else if (depth3x3 && param->Strides()[0] == param->Strides()[1] && + param->Strides()[0] == 2 && 
param->Paddings()[0] == 0 && + param->Paddings()[0] == param->Paddings()[1]) { + param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S2P0_FLOAT; + } else if (depth3x3 && param->Strides()[0] == param->Strides()[1] && + param->Strides()[0] == 2 && param->Paddings()[0] == 1 && + param->Paddings()[0] == param->Paddings()[1]) { + param->ExecMode() = ConvParam::EXEC_DEPTHWISE3x3S2P1_FLOAT; #ifndef __aarch64__ - } else if (param->Filter()->dims()[2] == param->Filter()->dims()[3] && - param->Strides()[0] == param->Strides()[1] && + } else if (conv3x3 && param->Strides()[0] == param->Strides()[1] && param->Dilations()[0] == param->Dilations()[1] && - param->Filter()->dims()[2] == 3 && param->Strides()[0] == 1 && - param->Dilations()[0] == 1 && param->Output()->dims()[1] >= 16 && + param->Strides()[0] == 1 && param->Dilations()[0] == 1 && + param->Output()->dims()[1] >= 16 && param->Input()->dims()[1] >= 16 && param->Input()->dims()[2] <= 140 /* refered from ncnn */) { param->ExecMode() = ConvParam::EXEC_WINOGRAD3X3_FLOAT; // transform weight - framework::Tensor *transformed_weight = new framework::Tensor; + framework::Tensor transformed_weight; operators::math::winograd_transform_weight<8, 3>(*param->Filter(), - transformed_weight); - param->Filter() = transformed_weight; + &transformed_weight); + framework::TensorCopy(transformed_weight, param->Filter()); #endif } else { param->ExecMode() = ConvParam::EXEC_GEMM_FLOAT; @@ -78,9 +80,13 @@ void ConvKernel::Compute(const ConvParam ¶m) { math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), nullptr, false); break; - case ConvParam::EXEC_DEPTHWISE3x3_FLOAT: - math::DepthwiseConv3x3(param.Input(), param.Strides(), param.Paddings(), - param.Filter(), nullptr, param.Output(), false); + case ConvParam::EXEC_DEPTHWISE3x3S2P1_FLOAT: + math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), + param.Output(), nullptr, false); + break; + case ConvParam::EXEC_DEPTHWISE3x3S2P0_FLOAT: + math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), + nullptr, false); break; case ConvParam::EXEC_WINOGRAD3X3_FLOAT: WinogradConv3x3<8, 3>(param); diff --git a/src/operators/kernel/arm/dequant_add_bn_relu_kernel.cpp b/src/operators/kernel/arm/dequant_add_bn_kernel.cpp similarity index 86% rename from src/operators/kernel/arm/dequant_add_bn_relu_kernel.cpp rename to src/operators/kernel/arm/dequant_add_bn_kernel.cpp index bfe1935c216f94d660997b1bfa42f18e63295992..65fb0190f76a34a584d065bd43841567e9658bb8 100644 --- a/src/operators/kernel/arm/dequant_add_bn_relu_kernel.cpp +++ b/src/operators/kernel/arm/dequant_add_bn_kernel.cpp @@ -12,9 +12,9 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
*/ -#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP +#ifdef FUSION_DEQUANT_ADD_BN_OP -#include "operators/kernel/dequant_add_bn_relu_kernel.h" +#include "operators/kernel/dequant_add_bn_kernel.h" #include #if defined(__ARM_NEON__) || defined(__ARM_NEON) #include @@ -24,8 +24,8 @@ namespace paddle_mobile { namespace operators { template <> -bool FusionDequantAddBNReluKernel::Init( - FusionDequantAddBNReluParam *param) { +bool FusionDequantAddBNKernel::Init( + FusionDequantAddBNParam *param) { // elementwise add params const Tensor *bias = param->bias_; // batch norm params @@ -49,8 +49,8 @@ bool FusionDequantAddBNReluKernel::Init( } template <> -void FusionDequantAddBNReluKernel::Compute( - const FusionDequantAddBNReluParam ¶m) { +void FusionDequantAddBNKernel::Compute( + const FusionDequantAddBNParam ¶m) { const int32_t *input = param.input_->data(); const float *bn_scale = param.bn_scale_->data(); const float *bn_bias = param.bn_bias_->data(); @@ -78,7 +78,6 @@ void FusionDequantAddBNReluKernel::Compute( remain = spatial_size & 0xF; float32x4_t __scale = vdupq_n_f32(scale); float32x4_t __bias = vdupq_n_f32(bias); - float32x4_t __zero = vdupq_n_f32(0.f); for (int k = 0; k < loop; ++k, x += 16, y += 16) { int32x4_t r0 = vld1q_s32(x); @@ -93,10 +92,6 @@ void FusionDequantAddBNReluKernel::Compute( f1 = vmlaq_f32(__bias, __scale, f1); f2 = vmlaq_f32(__bias, __scale, f2); f3 = vmlaq_f32(__bias, __scale, f3); - f0 = vmaxq_f32(__zero, f0); - f1 = vmaxq_f32(__zero, f1); - f2 = vmaxq_f32(__zero, f2); - f3 = vmaxq_f32(__zero, f3); vst1q_f32(y, f0); vst1q_f32(y + 4, f1); vst1q_f32(y + 8, f2); @@ -104,7 +99,7 @@ void FusionDequantAddBNReluKernel::Compute( } #endif // __ARM_NEON__ for (int k = 0; k < remain; ++k) { - y[k] = std::max(scale * x[k] + bias, 0.f); + y[k] = scale * x[k] + bias; } } } @@ -113,4 +108,4 @@ void FusionDequantAddBNReluKernel::Compute( } // namespace operators } // namespace paddle_mobile -#endif // FUSION_DEQUANT_ADD_BN_RELU_OP +#endif // FUSION_DEQUANT_ADD_BN_OP diff --git a/src/operators/kernel/arm/dequant_bn_relu_kernel.cpp b/src/operators/kernel/arm/dequant_bn_relu_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..4d656712c193aa81a8be11c53856c868e2b82483 --- /dev/null +++ b/src/operators/kernel/arm/dequant_bn_relu_kernel.cpp @@ -0,0 +1,150 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#include "operators/kernel/dequant_bn_relu_kernel.h"
+#include <cmath>
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+#include <arm_neon.h>
+#endif
+
+namespace paddle_mobile {
+namespace operators {
+
+#if defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
+void DequantBNReluCompute(const FusionDequantBNParam<CPU> *param) {
+  const int32_t *input = param->input_->data<int32_t>();
+  const float *bn_scale = param->bn_scale_->data<float>();
+  const float *bn_bias = param->bn_bias_->data<float>();
+  // dequantize params
+  const float activation_scale = param->activation_scale_->data<float>()[0];
+  const float weight_scale = param->weight_scale_;
+  const float dequant_scale = activation_scale / weight_scale;
+
+  float *output = param->output_->mutable_data<float>();
+  int batch_size = param->input_->dims()[0];
+  int channels = param->input_->dims()[1];
+  size_t spatial_size = param->input_->dims()[2] * param->input_->dims()[3];
+
+  #pragma omp parallel for collapse(2)
+  for (int batch = 0; batch < batch_size; ++batch) {
+    for (int c = 0; c < channels; ++c) {
+      float scale = bn_scale[c] * dequant_scale;
+      float bias = bn_bias[c];
+      size_t offset = (batch * channels + c) * spatial_size;
+      const int32_t *x = input + offset;
+      float *y = output + offset;
+      size_t remain = spatial_size;
+#if defined(__ARM_NEON__) || defined(__ARM_NEON)
+      int loop = spatial_size >> 4;
+      remain = spatial_size & 0xF;
+      float32x4_t __scale = vdupq_n_f32(scale);
+      float32x4_t __bias = vdupq_n_f32(bias);
+      float32x4_t __zero = vdupq_n_f32(0.f);
+
+      for (int k = 0; k < loop; ++k, x += 16, y += 16) {
+        int32x4_t r0 = vld1q_s32(x);
+        int32x4_t r1 = vld1q_s32(x + 4);
+        int32x4_t r2 = vld1q_s32(x + 8);
+        int32x4_t r3 = vld1q_s32(x + 12);
+        float32x4_t f0 = vcvtq_f32_s32(r0);
+        float32x4_t f1 = vcvtq_f32_s32(r1);
+        float32x4_t f2 = vcvtq_f32_s32(r2);
+        float32x4_t f3 = vcvtq_f32_s32(r3);
+        f0 = vmlaq_f32(__bias, __scale, f0);
+        f1 = vmlaq_f32(__bias, __scale, f1);
+        f2 = vmlaq_f32(__bias, __scale, f2);
+        f3 = vmlaq_f32(__bias, __scale, f3);
+        f0 = vmaxq_f32(__zero, f0);
+        f1 = vmaxq_f32(__zero, f1);
+        f2 = vmaxq_f32(__zero, f2);
+        f3 = vmaxq_f32(__zero, f3);
+        vst1q_f32(y, f0);
+        vst1q_f32(y + 4, f1);
+        vst1q_f32(y + 8, f2);
+        vst1q_f32(y + 12, f3);
+      }
+#endif // __ARM_NEON__
+      for (int k = 0; k < remain; ++k) {
+        y[k] = std::max(scale * x[k] + bias, 0.f);
+      }
+    }
+  }
+}
+#endif
+
+#ifdef FUSION_DEQUANT_BN_RELU_OP
+template <>
+bool FusionDequantBNReluKernel<CPU, float>::Init(
+    FusionDequantBNReluParam<CPU> *param) {
+  // batch norm params
+  const Tensor *bn_mean = param->bn_mean_;
+  const Tensor *bn_variance = param->bn_variance_;
+  Tensor *bn_scale = param->bn_scale_;
+  Tensor *bn_bias = param->bn_bias_;
+  const float epsilon = param->epsilon_;
+
+  const float *mean_ptr = bn_mean->data<float>();
+  const float *var_ptr = bn_variance->data<float>();
+  float *bn_scale_ptr = bn_scale->mutable_data<float>();
+  float *bn_bias_ptr = bn_bias->mutable_data<float>();
+  for (int c = 0; c < bn_scale->numel(); ++c) {
+    float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon));
+    bn_scale_ptr[c] = inv_scale;
+    bn_bias_ptr[c] = bn_bias_ptr[c] - inv_scale * mean_ptr[c];
+  }
+  return true;
+}
+
+template <>
+void FusionDequantBNReluKernel<CPU, float>::Compute(
+    const FusionDequantBNReluParam<CPU> &param) {
+  DequantBNReluCompute(&param);
+}
+#endif // FUSION_DEQUANT_BN_RELU_OP
+
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+template <>
+bool FusionDequantAddBNReluKernel<CPU, float>::Init(
+    FusionDequantAddBNReluParam<CPU> *param) {
+  // elementwise add params
+  const Tensor *bias = param->bias_;
+  // batch norm params
+  const Tensor *bn_mean =
param->bn_mean_; + const Tensor *bn_variance = param->bn_variance_; + Tensor *bn_scale = param->bn_scale_; + Tensor *bn_bias = param->bn_bias_; + const float epsilon = param->epsilon_; + + const float *bias_ptr = bias->data(); + const float *mean_ptr = bn_mean->data(); + const float *var_ptr = bn_variance->data(); + float *bn_scale_ptr = bn_scale->mutable_data(); + float *bn_bias_ptr = bn_bias->mutable_data(); + for (int c = 0; c < bn_scale->numel(); ++c) { + float inv_scale = bn_scale_ptr[c] / (std::sqrt(var_ptr[c] + epsilon)); + bn_scale_ptr[c] = inv_scale; + bn_bias_ptr[c] = inv_scale * (bias_ptr[c] - mean_ptr[c]) + bn_bias_ptr[c]; + } + return true; +} + +template <> +void FusionDequantAddBNReluKernel::Compute( + const FusionDequantAddBNReluParam ¶m) { + DequantBNReluCompute(¶m); +} +#endif // FUSION_DEQUANT_ADD_BN_RELU_OP + +} // namespace operators +} // namespace paddle_mobile diff --git a/src/operators/kernel/arm/quantize_kernel.cpp b/src/operators/kernel/arm/quantize_kernel.cpp index 1e7623436a1a73644aca61e4634a7cd405bd64ad..ca3fa71f98f778752ac9dd7728385f5525696a02 100644 --- a/src/operators/kernel/arm/quantize_kernel.cpp +++ b/src/operators/kernel/arm/quantize_kernel.cpp @@ -20,6 +20,9 @@ limitations under the License. */ #if defined(__ARM_NEON__) || defined(__ARM_NEON) #include +namespace paddle_mobile { +namespace operators { + #ifndef __aarch64__ inline float32_t vmaxvq_f32(float32x4_t r) { float32x2_t v = vmax_f32(vget_high_f32(r), vget_low_f32(r)); @@ -27,9 +30,13 @@ inline float32_t vmaxvq_f32(float32x4_t r) { } #endif -inline int32x4_t vrnd_towards_zero(float32x4_t r) { return vcvtq_s32_f32(r); } +template +inline int32x4_t vround_f32(float32x4_t r) { + return vcvtq_s32_f32(r); +} -inline int32x4_t vrnd_away_zero(float32x4_t r) { +template <> +inline int32x4_t vround_f32(float32x4_t r) { float32x4_t plus = vdupq_n_f32(0.5); float32x4_t minus = vdupq_n_f32(-0.5); float32x4_t zero = vdupq_n_f32(0); @@ -40,31 +47,13 @@ inline int32x4_t vrnd_away_zero(float32x4_t r) { return ret; } -inline int32x4_t vrnd_to_even(float32x4_t r) { -#if 0 - int32x4_t ret; - float value[4]; - vst1q_f32(value, r); - for (int i = 0; i < 4; ++i) { - float v = round(value[i]); - int32_t q = (int32_t)v; - if (abs(abs(v - value[i]) - 0.5) > 0) { - ret[i] = q; - } else { - if (abs(q) % 2 == 0) { - ret[i] = q; - } else { - ret[i] = q + ((q > 0) ? 
-1 : 1); - } - } - } - return ret; -#else +template <> +inline int32x4_t vround_f32(float32x4_t r) { float32x4_t point5 = vdupq_n_f32(0.5); int32x4_t one = vdupq_n_s32(1); int32x4_t zero = vdupq_n_s32(0); - int32x4_t rnd = vrnd_away_zero(r); + int32x4_t rnd = vround_f32(r); float32x4_t frnd = vcvtq_f32_s32(rnd); frnd = vsubq_f32(frnd, r); frnd = vabsq_f32(frnd); @@ -82,115 +71,39 @@ inline int32x4_t vrnd_to_even(float32x4_t r) { smask = vsubq_s32(smask, one); rnd = vaddq_s32(rnd, smask); return rnd; -#endif } - -namespace paddle_mobile { -namespace operators { - -static float find_abs_max(const Tensor *input) { - float max_abs = 0.f; - const float *x = input->data(); - size_t size = input->numel(); -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - size_t loop = size >> 4; - size_t remain = size & 0xF; - for (size_t i = 0; i < loop; ++i) { - float32x4_t max; - float32x4_t r0 = vld1q_f32(x); - float32x4_t r1 = vld1q_f32(x + 4); - float32x4_t r2 = vld1q_f32(x + 8); - float32x4_t r3 = vld1q_f32(x + 12); - r0 = vabsq_f32(r0); - r1 = vabsq_f32(r1); - r2 = vabsq_f32(r2); - r3 = vabsq_f32(r3); - max[0] = vmaxvq_f32(r0); - max[1] = vmaxvq_f32(r1); - max[2] = vmaxvq_f32(r2); - max[3] = vmaxvq_f32(r3); - max[0] = vmaxvq_f32(max); - if (max[0] > max_abs) { - max_abs = max[0]; - } - x += 16; - } - size = remain; #endif - for (size_t i = 0; i < size; ++i) { - float value = std::abs(x[i]); - if (value > max_abs) { - max_abs = value; - } - } - return max_abs; + +template +inline int8_t Round(const float &x) { + return static_cast(x); } -#ifdef __aarch64__ -static void quantize_round_to_even(const Tensor *input, const float scale, - Tensor *output) { - const float *x = input->data(); - int8_t *y = output->mutable_data(); - size_t size = input->numel(); -#if defined(__ARM_NEON__) || defined(__ARM_NEON) - size_t loop = size >> 4; - size_t remain = size & 0xF; +template <> +inline int8_t Round(const float &x) { + return std::round(x); +} - #pragma omp parallel for - for (size_t i = 0; i < loop; ++i) { - const float *local_x = x + (i << 4); - int8_t *local_y = y + (i << 4); - float32x4_t r0 = vld1q_f32(local_x); - float32x4_t r1 = vld1q_f32(local_x + 4); - float32x4_t r2 = vld1q_f32(local_x + 8); - float32x4_t r3 = vld1q_f32(local_x + 12); - r0 = vmulq_n_f32(r0, scale); - r1 = vmulq_n_f32(r1, scale); - r2 = vmulq_n_f32(r2, scale); - r3 = vmulq_n_f32(r3, scale); - int32x4_t q0 = vrnd_to_even(r0); - int32x4_t q1 = vrnd_to_even(r1); - int32x4_t q2 = vrnd_to_even(r2); - int32x4_t q3 = vrnd_to_even(r3); - int16x4_t d0 = vmovn_s32(q0); - int16x4_t d1 = vmovn_s32(q1); - int16x4_t d2 = vmovn_s32(q2); - int16x4_t d3 = vmovn_s32(q3); - int16x8_t q5 = vcombine_s16(d0, d1); - int16x8_t q6 = vcombine_s16(d2, d3); - int8x8_t d5 = vmovn_s16(q5); - int8x8_t d6 = vmovn_s16(q6); - vst1_s8(local_y, d5); - vst1_s8(local_y + 8, d6); - } - size = remain; - x += (loop << 4); - y += (loop << 4); -#endif - for (size_t i = 0; i < size; ++i) { - float value = x[i] * scale; - float v = round(value); - int32_t q = (int32_t)v; - if (abs(abs(q - value) - 0.5) > 0) { - y[i] = q; - } else { - if (abs(q) % 2 == 0) { - y[i] = q; - } else { - y[i] = q + ((q > 0) ? -1 : 1); - } +template <> +inline int8_t Round(const float &x) { + float v = std::round(x); + int32_t q = static_cast(v); + if (std::abs(std::abs(q - v) - 0.5) <= 0) { + if (std::abs(q) % 2 != 0) { + q = q + ((q > 0) ? 
-1 : 1); } } + return static_cast(q); } -static void quantize_round_to_zero(const Tensor *input, const float scale, - Tensor *output) { +template +static void Quantize(const Tensor *input, const float scale, Tensor *output) { const float *x = input->data(); int8_t *y = output->mutable_data(); - size_t size = input->numel(); + size_t remain = input->numel(); #if defined(__ARM_NEON__) || defined(__ARM_NEON) - size_t loop = size >> 4; - size_t remain = size & 0xF; + size_t loop = remain >> 4; + remain = remain & 0xF; #pragma omp parallel for for (size_t i = 0; i < loop; ++i) { @@ -204,10 +117,10 @@ static void quantize_round_to_zero(const Tensor *input, const float scale, r1 = vmulq_n_f32(r1, scale); r2 = vmulq_n_f32(r2, scale); r3 = vmulq_n_f32(r3, scale); - int32x4_t q0 = vrnd_towards_zero(r0); - int32x4_t q1 = vrnd_towards_zero(r1); - int32x4_t q2 = vrnd_towards_zero(r2); - int32x4_t q3 = vrnd_towards_zero(r3); + int32x4_t q0 = vround_f32(r0); + int32x4_t q1 = vround_f32(r1); + int32x4_t q2 = vround_f32(r2); + int32x4_t q3 = vround_f32(r3); int16x4_t d0 = vmovn_s32(q0); int16x4_t d1 = vmovn_s32(q1); int16x4_t d2 = vmovn_s32(q2); @@ -219,561 +132,44 @@ static void quantize_round_to_zero(const Tensor *input, const float scale, vst1_s8(local_y, d5); vst1_s8(local_y + 8, d6); } - size = remain; x += (loop << 4); y += (loop << 4); #endif - for (size_t i = 0; i < size; ++i) { - y[i] = static_cast(x[i] * scale); + for (size_t i = 0; i < remain; ++i) { + y[i] = Round(x[i] * scale); } } -static void quantize_round_to_nearest(const Tensor *input, const float scale, - Tensor *output) { +float find_abs_max(const Tensor *input) { + float max_abs = 0.f; const float *x = input->data(); - int8_t *y = output->mutable_data(); - size_t size = input->numel(); + size_t remain = input->numel(); #if defined(__ARM_NEON__) || defined(__ARM_NEON) - size_t loop = size >> 4; - size_t remain = size & 0xF; + size_t loop = remain >> 4; + remain = remain & 0xF; + float32x4_t __max = {0.f, 0.f, 0.f, 0.f}; - #pragma omp parallel for - for (size_t i = 0; i < loop; ++i) { - const float *local_x = x + (i << 4); - int8_t *local_y = y + (i << 4); - float32x4_t r0 = vld1q_f32(local_x); - float32x4_t r1 = vld1q_f32(local_x + 4); - float32x4_t r2 = vld1q_f32(local_x + 8); - float32x4_t r3 = vld1q_f32(local_x + 12); - r0 = vmulq_n_f32(r0, scale); - r1 = vmulq_n_f32(r1, scale); - r2 = vmulq_n_f32(r2, scale); - r3 = vmulq_n_f32(r3, scale); - int32x4_t q0 = vrnd_away_zero(r0); - int32x4_t q1 = vrnd_away_zero(r1); - int32x4_t q2 = vrnd_away_zero(r2); - int32x4_t q3 = vrnd_away_zero(r3); - int16x4_t d0 = vmovn_s32(q0); - int16x4_t d1 = vmovn_s32(q1); - int16x4_t d2 = vmovn_s32(q2); - int16x4_t d3 = vmovn_s32(q3); - int16x8_t q5 = vcombine_s16(d0, d1); - int16x8_t q6 = vcombine_s16(d2, d3); - int8x8_t d5 = vmovn_s16(q5); - int8x8_t d6 = vmovn_s16(q6); - vst1_s8(local_y, d5); - vst1_s8(local_y + 8, d6); + for (size_t i = 0; i < loop; ++i, x += 16) { + float32x4_t r0 = vld1q_f32(x); + float32x4_t r1 = vld1q_f32(x + 4); + float32x4_t r2 = vld1q_f32(x + 8); + float32x4_t r3 = vld1q_f32(x + 12); + r0 = vabsq_f32(r0); + r1 = vabsq_f32(r1); + r2 = vabsq_f32(r2); + r3 = vabsq_f32(r3); + r0 = vmaxq_f32(r0, r1); + r1 = vmaxq_f32(r2, r3); + r0 = vmaxq_f32(r0, r1); + __max = vmaxq_f32(r0, __max); } - size = remain; - x += (loop << 4); - y += (loop << 4); + max_abs = vmaxvq_f32(__max); #endif - for (size_t i = 0; i < size; ++i) { - y[i] = round(x[i] * scale); - } -} -#else // __aarch64__ - -static void quantize_round_to_even(const Tensor *input, 
const float scale, - const std::vector &paddings, - const int8_t padding_val, Tensor *output) {} - -static void quantize_round_to_nearest(const Tensor *input, const float scale, - const std::vector &paddings, - const int8_t padding_val, - Tensor *output) {} - -static void quantize_round_to_zero(const Tensor *input, const float scale, - const std::vector &paddings, - const int8_t padding_val, Tensor *output) { - int channels = input->dims()[1]; - int input_h = input->dims()[2]; - int input_w = input->dims()[3]; - int output_h = output->dims()[2]; - int output_w = output->dims()[3]; - int input_spatial_size = input_h * input_w; - int output_spatial_size = output_h * output_w; - const float *x = input->data(); - int8_t *y = output->mutable_data(); - // valid area start - int start = paddings[0] * output_w + paddings[1]; - - for (int batch = 0; batch < input->dims()[0]; ++batch) { - #pragma omp parallel for - for (int c = 0; c < channels - 3; c += 4) { - const float *input0 = x + (batch * channels + c) * input_spatial_size; - const float *input1 = input0 + input_spatial_size; - const float *input2 = input1 + input_spatial_size; - const float *input3 = input2 + input_spatial_size; - size_t offset = (batch * channels + c) * output_spatial_size; - for (int h = 0; h < 2; ++h) { - int8_t *y0 = - y + offset + h * ((input_h + paddings[0]) * output_w - paddings[1]); - int8_t *y1 = y0 + output_spatial_size; - int8_t *y2 = y1 + output_spatial_size; - int8_t *y3 = y2 + output_spatial_size; - int loop = start >> 4; - int remain = start & 0xF; - asm volatile( - "vdup.s8 q0, %[val] \n" - "cmp %[loop], #0 \n" - "ble start_remain_%= \n" - - "store_16w_%=: \n" - "vst1.32 {q0}, [%[y0]]! \n" - "vst1.32 {q0}, [%[y1]]! \n" - "vst1.32 {q0}, [%[y2]]! \n" - "vst1.32 {q0}, [%[y3]]! \n" - "subs %[loop], #1 \n" - "bne store_16w_%= \n" - - "start_remain_%=: \n" - "cmp %[remain], #8 \n" - "blt store_4w_%= \n" - "vst1.32 {d0}, [%[y0]]! \n" - "vst1.32 {d0}, [%[y1]]! \n" - "vst1.32 {d0}, [%[y2]]! \n" - "vst1.32 {d0}, [%[y3]]! \n" - "sub %[remain], #8 \n" - - "store_4w_%=: \n" - "cmp %[remain], #4 \n" - "blt store_2w_%= \n" - "vst1.32 {d0[0]}, [%[y0]]! \n" - "vst1.32 {d0[0]}, [%[y1]]! \n" - "vst1.32 {d0[0]}, [%[y2]]! \n" - "vst1.32 {d0[0]}, [%[y3]]! \n" - "sub %[remain], #4 \n" - - "store_2w_%=: \n" - "cmp %[remain], #4 \n" - "blt store_1w_%= \n" - "vst1.16 {d0[0]}, [%[y0]]! \n" - "vst1.16 {d0[0]}, [%[y1]]! \n" - "vst1.16 {d0[0]}, [%[y2]]! \n" - "vst1.16 {d0[0]}, [%[y3]]! \n" - "sub %[remain], #2 \n" - - "store_1w_%=: \n" - "cmp %[remain], #1 \n" - "blt end_%= \n" - "vst1.8 {d0[0]}, [%[y0]]! \n" - "vst1.8 {d0[0]}, [%[y1]]! \n" - "vst1.8 {d0[0]}, [%[y2]]! \n" - "vst1.8 {d0[0]}, [%[y3]]! 
\n" - "end_%=: \n" - : [y0] "+r"(y0), [y1] "+r"(y1), [y2] "+r"(y2), [y3] "+r"(y3), - [loop] "+r"(loop), [remain] "+r"(remain) - : [val] "r"(padding_val) - : "cc", "memory", "q0"); - } - // quantize valid area - int8_t *y0 = y + offset + start; - int8_t *y1 = y0 + output_spatial_size; - int8_t *y2 = y1 + output_spatial_size; - int8_t *y3 = y2 + output_spatial_size; - for (int h = 0; h < input_h; ++h) { - const float *x0 = input0 + h * input_w; - const float *x1 = input1 + h * input_w; - const float *x2 = input2 + h * input_w; - const float *x3 = input3 + h * input_w; - int loop = input_w >> 4; - int remain = input_w & 0xF; - int pad_loop = paddings[1] >> 1; // (paddings[1] << 1) >> 2 - int pad_remain = (paddings[1] << 1) & 0x3; - int remain_steps = remain; - asm volatile( - "vdup.f32 q0, %[scale] \n" - "cmp %[loop], #0 \n" - "ble quantize_remain_%= \n" - - "loop_quantize_%=: \n" - "vld1.32 {q1, q2}, [%[x0]]! \n" - "vld1.32 {q3, q4}, [%[x1]]! \n" - "vld1.32 {q5, q6}, [%[x2]]! \n" - "vld1.32 {q7, q8}, [%[x3]]! \n" - "vmul.f32 q1, q1, q0 \n" - "vmul.f32 q2, q2, q0 \n" - "vmul.f32 q3, q3, q0 \n" - "vmul.f32 q4, q4, q0 \n" - "vmul.f32 q5, q5, q0 \n" - "vmul.f32 q6, q6, q0 \n" - "vmul.f32 q7, q7, q0 \n" - "vmul.f32 q8, q8, q0 \n" - "vcvt.s32.f32 q1, q1 \n" - "vcvt.s32.f32 q2, q2 \n" - "vcvt.s32.f32 q3, q3 \n" - "vcvt.s32.f32 q4, q4 \n" - "vcvt.s32.f32 q5, q5 \n" - "vcvt.s32.f32 q6, q6 \n" - "vcvt.s32.f32 q7, q7 \n" - "vcvt.s32.f32 q8, q8 \n" - "vmovn.s32 d2, q1 \n" - "vmovn.s32 d3, q2 \n" - "vmovn.s32 d4, q3 \n" - "vmovn.s32 d5, q4 \n" - "vmovn.s32 d6, q5 \n" - "vmovn.s32 d7, q6 \n" - "vmovn.s32 d8, q7 \n" - "vmovn.s32 d9, q8 \n" - "vmovn.s16 d18, q1 \n" - "vmovn.s16 d20, q2 \n" - "vmovn.s16 d22, q3 \n" - "vmovn.s16 d24, q4 \n" - "vld1.32 {q1, q2}, [%[x0]]! \n" - "vld1.32 {q3, q4}, [%[x1]]! \n" - "vld1.32 {q5, q6}, [%[x2]]! \n" - "vld1.32 {q7, q8}, [%[x3]]! \n" - "vmul.f32 q1, q1, q0 \n" - "vmul.f32 q2, q2, q0 \n" - "vmul.f32 q3, q3, q0 \n" - "vmul.f32 q4, q4, q0 \n" - "vmul.f32 q5, q5, q0 \n" - "vmul.f32 q6, q6, q0 \n" - "vmul.f32 q7, q7, q0 \n" - "vmul.f32 q8, q8, q0 \n" - "vcvt.s32.f32 q1, q1 \n" - "vcvt.s32.f32 q2, q2 \n" - "vcvt.s32.f32 q3, q3 \n" - "vcvt.s32.f32 q4, q4 \n" - "vcvt.s32.f32 q5, q5 \n" - "vcvt.s32.f32 q6, q6 \n" - "vcvt.s32.f32 q7, q7 \n" - "vcvt.s32.f32 q8, q8 \n" - "vmovn.s32 d2, q1 \n" - "vmovn.s32 d3, q2 \n" - "vmovn.s32 d4, q3 \n" - "vmovn.s32 d5, q4 \n" - "vmovn.s32 d6, q5 \n" - "vmovn.s32 d7, q6 \n" - "vmovn.s32 d8, q7 \n" - "vmovn.s32 d9, q8 \n" - "vmovn.s16 d19, q1 \n" - "vmovn.s16 d21, q2 \n" - "vmovn.s16 d23, q3 \n" - "vmovn.s16 d25, q4 \n" - "vst1.32 {q9}, [%[y0]]! \n" - "vst1.32 {q10}, [%[y1]]! \n" - "vst1.32 {q11}, [%[y2]]! \n" - "vst1.32 {q12}, [%[y3]]! \n" - - "subs %[loop], #1 \n" - "bne loop_quantize_%= \n" - - "quantize_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble end_%= \n" - - "vld1.32 {q1, q2}, [%[x0]]! \n" - "vld1.32 {q3, q4}, [%[x1]]! \n" - "vld1.32 {q5, q6}, [%[x2]]! \n" - "vld1.32 {q7, q8}, [%[x3]]! 
\n" - "vmul.f32 q1, q1, q0 \n" - "vmul.f32 q2, q2, q0 \n" - "vmul.f32 q3, q3, q0 \n" - "vmul.f32 q4, q4, q0 \n" - "vmul.f32 q5, q5, q0 \n" - "vmul.f32 q6, q6, q0 \n" - "vmul.f32 q7, q7, q0 \n" - "vmul.f32 q8, q8, q0 \n" - "vcvt.s32.f32 q1, q1 \n" - "vcvt.s32.f32 q2, q2 \n" - "vcvt.s32.f32 q3, q3 \n" - "vcvt.s32.f32 q4, q4 \n" - "vcvt.s32.f32 q5, q5 \n" - "vcvt.s32.f32 q6, q6 \n" - "vcvt.s32.f32 q7, q7 \n" - "vcvt.s32.f32 q8, q8 \n" - "vmovn.s32 d2, q1 \n" - "vmovn.s32 d3, q2 \n" - "vmovn.s32 d4, q3 \n" - "vmovn.s32 d5, q4 \n" - "vmovn.s32 d6, q5 \n" - "vmovn.s32 d7, q6 \n" - "vmovn.s32 d8, q7 \n" - "vmovn.s32 d9, q8 \n" - "vmovn.s16 d18, q1 \n" - "vmovn.s16 d20, q2 \n" - "vmovn.s16 d22, q3 \n" - "vmovn.s16 d24, q4 \n" - "vld1.32 {q1, q2}, [%[x0]] \n" - "vld1.32 {q3, q4}, [%[x1]] \n" - "vld1.32 {q5, q6}, [%[x2]] \n" - "vld1.32 {q7, q8}, [%[x3]] \n" - "vmul.f32 q1, q1, q0 \n" - "vmul.f32 q2, q2, q0 \n" - "vmul.f32 q3, q3, q0 \n" - "vmul.f32 q4, q4, q0 \n" - "vmul.f32 q5, q5, q0 \n" - "vmul.f32 q6, q6, q0 \n" - "vmul.f32 q7, q7, q0 \n" - "vmul.f32 q8, q8, q0 \n" - "vcvt.s32.f32 q1, q1 \n" - "vcvt.s32.f32 q2, q2 \n" - "vcvt.s32.f32 q3, q3 \n" - "vcvt.s32.f32 q4, q4 \n" - "vcvt.s32.f32 q5, q5 \n" - "vcvt.s32.f32 q6, q6 \n" - "vcvt.s32.f32 q7, q7 \n" - "vcvt.s32.f32 q8, q8 \n" - "vmovn.s32 d2, q1 \n" - "vmovn.s32 d3, q2 \n" - "vmovn.s32 d4, q3 \n" - "vmovn.s32 d5, q4 \n" - "vmovn.s32 d6, q5 \n" - "vmovn.s32 d7, q6 \n" - "vmovn.s32 d8, q7 \n" - "vmovn.s32 d9, q8 \n" - "vmovn.s16 d19, q1 \n" - "vmovn.s16 d21, q2 \n" - "vmovn.s16 d23, q3 \n" - "vmovn.s16 d25, q4 \n" - - "cmp %[remain], #8 \n" - "blt store_4w_%= \n" - "vst1.32 {d18}, [%[y0]]! \n" - "vst1.32 {d20}, [%[y1]]! \n" - "vst1.32 {d22}, [%[y2]]! \n" - "vst1.32 {d24}, [%[y3]]! \n" - "vmov.32 d18, d19 \n" - "vmov.32 d20, d21 \n" - "vmov.32 d22, d23 \n" - "vmov.32 d24, d25 \n" - "sub %[remain], #8 \n" - - "store_4w_%=: \n" - "cmp %[remain], #4 \n" - "blt store_2w_%= \n" - "vst1.32 {d18[0]}, [%[y0]]! \n" - "vst1.32 {d20[0]}, [%[y1]]! \n" - "vst1.32 {d22[0]}, [%[y2]]! \n" - "vst1.32 {d24[0]}, [%[y3]]! \n" - "vext.32 d18, d18, d18, #1 \n" - "vext.32 d20, d20, d20, #1 \n" - "vext.32 d22, d22, d22, #1 \n" - "vext.32 d24, d24, d24, #1 \n" - "sub %[remain], #4 \n" - - "store_2w_%=: \n" - "cmp %[remain], #2 \n" - "blt store_1w_%= \n" - "vst1.16 {d18[0]}, [%[y0]]! \n" - "vst1.16 {d20[0]}, [%[y1]]! \n" - "vst1.16 {d22[0]}, [%[y2]]! \n" - "vst1.16 {d24[0]}, [%[y3]]! \n" - "vext.16 d18, d18, d18, #1 \n" - "vext.16 d20, d20, d20, #1 \n" - "vext.16 d22, d22, d22, #1 \n" - "vext.16 d24, d24, d24, #1 \n" - "sub %[remain], #2 \n" - - "store_1w_%=:" - "cmp %[remain], #1 \n" - "blt end_%= \n" - "vst1.8 {d18[0]}, [%[y0]]! \n" - "vst1.8 {d20[0]}, [%[y1]]! \n" - "vst1.8 {d22[0]}, [%[y2]]! \n" - "vst1.8 {d24[0]}, [%[y3]]! \n" - - "end_%=: \n" - : [x0] "+r"(x0), [x1] "+r"(x1), [x2] "+r"(x2), [x3] "+r"(x3), - [y0] "+r"(y0), [y1] "+r"(y1), [y2] "+r"(y2), [y3] "+r"(y3), - [loop] "+r"(loop), [remain] "+r"(remain) - : [scale] "r"(scale) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12"); - asm volatile( - "vdup.s8 d0, %[val] \n" - "cmp %[pad_loop], #0 \n" - "ble store_pad_2w_%= \n" - "loop_pad_4w_%=: \n" - "vst1.32 {d0[0]}, [%[y0]]! \n" - "vst1.32 {d0[0]}, [%[y1]]! \n" - "vst1.32 {d0[0]}, [%[y2]]! \n" - "vst1.32 {d0[0]}, [%[y3]]! \n" - "subs %[pad_loop], #1 \n" - "bne loop_pad_4w_%= \n" - - "store_pad_2w_%=: \n" - "cmp %[pad_remain], #2 \n" - "blt store_pad_1w_%= \n" - "vst1.16 {d0[0]}, [%[y0]]! 
\n" - "vst1.16 {d0[0]}, [%[y1]]! \n" - "vst1.16 {d0[0]}, [%[y2]]! \n" - "vst1.16 {d0[0]}, [%[y3]]! \n" - "sub %[pad_remain], #2 \n" - - "store_pad_1w_%=: \n" - "cmp %[pad_remain], #1 \n" - "blt end_%= \n" - "vst1.8 {d0[0]}, [%[y0]]! \n" - "vst1.8 {d0[0]}, [%[y1]]! \n" - "vst1.8 {d0[0]}, [%[y2]]! \n" - "vst1.8 {d0[0]}, [%[y3]]! \n" - "end_%=: \n" - : [y0] "+r"(y0), [y1] "+r"(y1), [y2] "+r"(y2), [y3] "+r"(y3), - [pad_loop] "+r"(pad_loop), [pad_remain] "+r"(pad_remain) - : [val] "r"(padding_val) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", - "q8", "q9", "q10", "q11", "q12"); - } - } - for (int c = (channels & 0xFFFC); c < channels; ++c) { - const float *input0 = x + (batch * channels + c) * input_spatial_size; - size_t offset = (batch * channels + c) * output_spatial_size; - for (int h = 0; h < 2; ++h) { - int8_t *y0 = - y + offset + h * ((input_h + paddings[0]) * output_w - paddings[1]); - int loop = start >> 4; - int remain = start & 0xF; - asm volatile( - "vdup.s8 q0, %[val] \n" - "cmp %[loop], #0 \n" - "ble start_remain_%= \n" - - "store_16w_%=: \n" - "vst1.32 {q0}, [%[y0]]! \n" - "subs %[loop], #1 \n" - "bne store_16w_%= \n" - - "start_remain_%=: \n" - "cmp %[remain], #8 \n" - "blt store_4w_%= \n" - "vst1.32 {d0}, [%[y0]]! \n" - "sub %[remain], #8 \n" - - "store_4w_%=: \n" - "cmp %[remain], #4 \n" - "blt store_2w_%= \n" - "vst1.32 {d0[0]}, [%[y0]]! \n" - "sub %[remain], #4 \n" - - "store_2w_%=: \n" - "cmp %[remain], #4 \n" - "blt store_1w_%= \n" - "vst1.16 {d0[0]}, [%[y0]]! \n" - "sub %[remain], #2 \n" - - "store_1w_%=: \n" - "cmp %[remain], #1 \n" - "blt end_%= \n" - "vst1.8 {d0[0]}, [%[y0]]! \n" - "end_%=: \n" - : [y0] "+r"(y0), [loop] "+r"(loop), [remain] "+r"(remain) - : [val] "r"(padding_val) - : "cc", "memory", "q0"); - } - // quantize valid area - int8_t *y0 = y + offset + start; - for (int h = 0; h < input_h; ++h) { - const float *x0 = input0 + h * input_w; - int loop = input_w >> 4; - int remain = input_w & 0xF; - int pad_loop = paddings[1] >> 1; // (paddings[1] << 1) >> 2 - int pad_remain = (paddings[1] << 1) & 0x3; - asm volatile( - "vdup.f32 q0, %[scale] \n" - "cmp %[loop], #0 \n" - "ble quantize_remain_%= \n" - - "loop_quantize_%=: \n" - "vld1.32 {q1, q2}, [%[x0]]! \n" - "vmul.f32 q1, q1, q0 \n" - "vmul.f32 q2, q2, q0 \n" - "vcvt.s32.f32 q1, q1 \n" - "vcvt.s32.f32 q2, q2 \n" - "vmovn.s32 d2, q1 \n" - "vmovn.s32 d3, q2 \n" - "vmovn.s16 d18, q1 \n" - "vld1.32 {q1, q2}, [%[x0]]! \n" - "vmul.f32 q1, q1, q0 \n" - "vmul.f32 q2, q2, q0 \n" - "vcvt.s32.f32 q1, q1 \n" - "vcvt.s32.f32 q2, q2 \n" - "vmovn.s32 d2, q1 \n" - "vmovn.s32 d3, q2 \n" - "vmovn.s16 d19, q1 \n" - "vst1.32 {q9}, [%[y0]]! \n" - - "subs %[loop], #1 \n" - "bne loop_quantize_%= \n" - - "quantize_remain_%=: \n" - "cmp %[remain], #0 \n" - "ble start_pad_%= \n" - - "vldm %[x0], {d2-d9} \n" - "vmul.f32 q1, q1, q0 \n" - "vmul.f32 q2, q2, q0 \n" - "vcvt.s32.f32 q1, q1 \n" - "vcvt.s32.f32 q2, q2 \n" - "vmovn.s32 d2, q1 \n" - "vmovn.s32 d3, q2 \n" - "vmovn.s16 d18, q1 \n" - "vmul.f32 q3, q3, q0 \n" - "vmul.f32 q4, q4, q0 \n" - "vcvt.s32.f32 q1, q3 \n" - "vcvt.s32.f32 q2, q4 \n" - "vmovn.s32 d2, q1 \n" - "vmovn.s32 d3, q2 \n" - "vmovn.s16 d19, q1 \n" - - "cmp %[remain], #8 \n" - "blt store_4w_%= \n" - "vst1.32 {d18}, [%[y0]]! \n" - "vmov.32 d18, d19 \n" - "sub %[remain], #8 \n" - - "store_4w_%=: \n" - "cmp %[remain], #4 \n" - "blt store_2w_%= \n" - "vst1.32 {d18[0]}, [%[y0]]! 
\n" - "vext.32 d18, d18, d18, #1 \n" - "sub %[remain], #4 \n" - - "store_2w_%=: \n" - "cmp %[remain], #2 \n" - "blt store_1w_%= \n" - "vst1.16 {d18[0]}, [%[y0]]! \n" - "vext.16 d18, d18, d18, #1 \n" - "sub %[remain], #2 \n" - - "store_1w_%=:" - "cmp %[remain], #1 \n" - "blt start_pad_%= \n" - "vst1.8 {d18[0]}, [%[y0]]! \n" - - "start_pad_%=: \n" - "vdup.s8 d0, %[val] \n" - "cmp %[pad_loop], #0 \n" - "ble pad_remain_%= \n" - "loop_pad_4w_%=: \n" - "vst1.32 {d0[0]}, [%[y0]]! \n" - "subs %[pad_loop], #1 \n" - "bne loop_pad_4w_%= \n" - - "pad_remain_%=: \n" - "cmp %[pad_remain], #2 \n" - "blt store_pad_1w_%= \n" - "vst1.16 {d0[0]}, [%[y0]]! \n" - "sub %[pad_remain], #2 \n" - - "store_pad_1w_%=: \n" - "cmp %[pad_remain], #1 \n" - "blt end_%= \n" - "vst1.8 {d0[0]}, [%[y0]]! \n" - "end_%=: \n" - : [x0] "+r"(x0), [y0] "+r"(y0), [loop] "+r"(loop), - [remain] "+r"(remain), [pad_loop] "+r"(pad_loop), - [pad_remain] "+r"(pad_remain) - : [scale] "r"(scale), [val] "r"(padding_val) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q9"); - } - } + for (size_t i = 0; i < remain; ++i) { + max_abs = std::max(max_abs, std::abs(x[i])); } + return max_abs; } -#endif // __aarch64__ -#endif // ARM_NEON template <> bool QuantizeKernel::Init(QuantizeParam *param) { @@ -795,19 +191,15 @@ void QuantizeKernel::Compute(const QuantizeParam ¶m) { // only support int8 currently float scale = 127 / max_abs; param.online_scale_->mutable_data()[0] = max_abs; - const auto &paddings = param.paddings_; - // std::vector paddings = {0, 0}; - // const auto padding_val = param.padding_val_; - int8_t padding_val = 0; switch (param.round_type_) { case ROUND_NEAREST_TO_EVEN: - quantize_round_to_even(input, scale, paddings, padding_val, output); + Quantize(input, scale, output); break; case ROUND_NEAREST_TOWARDS_ZERO: - quantize_round_to_zero(input, scale, paddings, padding_val, output); + Quantize(input, scale, output); break; case ROUND_NEAREST_AWAY_ZERO: - quantize_round_to_nearest(input, scale, paddings, padding_val, output); + Quantize(input, scale, output); break; default: LOG(kLOG_ERROR) << "round type is not supported."; diff --git a/src/operators/kernel/central-arm-func/conv_add_arm_func.h b/src/operators/kernel/central-arm-func/conv_add_arm_func.h index 3b5924ecbf886159d129212cc36c8630cb8cce2f..988f0b0f03b84c25a2e17e9d14054f99dcce4916 100644 --- a/src/operators/kernel/central-arm-func/conv_add_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_add_arm_func.h @@ -132,10 +132,10 @@ void ConvAddCompute(const FusionConvAddParam ¶m) { // param.Output(), false); if (param.Paddings()[0] == 0) { math::DepthwiseConv3x3s2p0(param.Input(), param.Filter(), param.Output(), - *param.Bias(), true); + param.Bias(), true); } else { math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), - param.Output(), *param.Bias(), true); + param.Output(), param.Bias(), true); } } else { ConvAddBasic(param); diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h index 95299b0799764639bfb36721f4707b1382533bb6..11667dfcc9cf2e25712a8f5c57d665cd41e9a9c6 100644 --- a/src/operators/kernel/central-arm-func/conv_arm_func.h +++ b/src/operators/kernel/central-arm-func/conv_arm_func.h @@ -163,31 +163,21 @@ template inline void DepthwiseConv3x3(const ConvParam ¶m) { const Tensor *input = param.Input(); const Tensor *filter = param.Filter(); + const std::vector &paddings = param.Paddings(); + const std::vector &strides = param.Strides(); + const int batch_size = input->dims()[0]; Tensor 
*output = param.Output(); output->mutable_data(); - const std::vector &paddings = param.Paddings(); - const std::vector &strides = param.Strides(); - const int batch_size = static_cast(input->dims()[0]); - Tensor input_pad; - math::PadFunctor pad; for (int i = 0; i < batch_size; i++) { Tensor in_batch = input->Slice(i, i + 1); Tensor out_batch = output->Slice(i, i + 1); - if (paddings[0] || paddings[1]) { - framework::DDim pad_shape = in_batch.dims(); - pad_shape[2] += 2 * paddings[0]; - pad_shape[3] += 2 * paddings[1]; - input_pad.mutable_data(pad_shape); - pad(in_batch, paddings[0], paddings[0], paddings[1], paddings[1], - &input_pad); - } else { - input_pad = in_batch; - } if (strides[0] == 1) { - math::DepthwiseConv3x3s1(input_pad, *filter, &out_batch); + math::DepthwiseConv3x3S1(in_batch, *filter, paddings, + &out_batch); } else if (strides[0] == 2) { - math::DepthwiseConv3x3s2(input_pad, *filter, &out_batch); + math::DepthwiseConv3x3S2(in_batch, *filter, paddings, + &out_batch); } else { // math::DepthwiseConv3x3(input_pad, *filter, // &out_batch); diff --git a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h b/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h deleted file mode 100644 index b48b03491bab9594f36cad0b21485ae72c8c3c31..0000000000000000000000000000000000000000 --- a/src/operators/kernel/central-arm-func/depthwise_conv_arm_func.h +++ /dev/null @@ -1,53 +0,0 @@ -/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. - -Licensed under the Apache License, Version 2.0 (the "License"); -you may not use this file except in compliance with the License. -You may obtain a copy of the License at - - http://www.apache.org/licenses/LICENSE-2.0 - -Unless required by applicable law or agreed to in writing, software -distributed under the License is distributed on an "AS IS" BASIS, -WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. -See the License for the specific language governing permissions and -limitations under the License. 
*/ - -#ifdef DEPTHWISECONV_OP - -#pragma once -#include -#include "operators/kernel/central-arm-func/conv_arm_func.h" -#include "operators/math/depthwise_conv3x3.h" -#include "operators/op_param.h" - -namespace paddle_mobile { -namespace operators { - -template -void DepthwiseConvCompute(const ConvParam ¶m) { - Tensor Bias; - Bias.mutable_data({param.Groups()}); - if (param.Groups() == param.Input()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 1) { - math::DepthwiseConv3x3s1p1(param.Input(), param.Filter(), param.Output(), - &Bias, false); - } else if (param.Groups() == param.Input()->dims()[1] && - param.Input()->dims()[1] == param.Output()->dims()[1] && - param.Filter()->dims()[2] == param.Filter()->dims()[3] && - param.Filter()->dims()[2] == 3 && param.Strides()[0] == 2) { - // math::DepthwiseConv3x3(param.Input(), param.Strides(), - // param.Paddings(), - // param.Filter(), &Bias, param.Output(), false); - math::DepthwiseConv3x3s2p1v2(param.Input(), param.Filter(), param.Output(), - Bias, false); - - } else { - GemmConv(param); - } -} - -} // namespace operators -} // namespace paddle_mobile - -#endif diff --git a/src/operators/kernel/dequant_add_bn_relu_kernel.h b/src/operators/kernel/dequant_add_bn_kernel.h similarity index 75% rename from src/operators/kernel/dequant_add_bn_relu_kernel.h rename to src/operators/kernel/dequant_add_bn_kernel.h index 7138e5c415caca6766913f9959bd41def0943d34..2fcdad6903e378121c265080f68c35c451714e30 100644 --- a/src/operators/kernel/dequant_add_bn_relu_kernel.h +++ b/src/operators/kernel/dequant_add_bn_kernel.h @@ -14,7 +14,7 @@ limitations under the License. */ #pragma once -#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP +#ifdef FUSION_DEQUANT_ADD_BN_OP #include "framework/operator.h" #include "operators/op_param.h" @@ -23,12 +23,12 @@ namespace paddle_mobile { namespace operators { template -class FusionDequantAddBNReluKernel +class FusionDequantAddBNKernel : public framework::OpKernelBase> { + FusionDequantAddBNParam> { public: - void Compute(const FusionDequantAddBNReluParam ¶m); - bool Init(FusionDequantAddBNReluParam *param); + void Compute(const FusionDequantAddBNParam ¶m); + bool Init(FusionDequantAddBNParam *param); }; } // namespace operators diff --git a/src/operators/kernel/dequant_bn_relu_kernel.h b/src/operators/kernel/dequant_bn_relu_kernel.h new file mode 100644 index 0000000000000000000000000000000000000000..edea449dd68db474b14b02304bbdf63768e1bfb0 --- /dev/null +++ b/src/operators/kernel/dequant_bn_relu_kernel.h @@ -0,0 +1,46 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma once
+
+#include "framework/operator.h"
+#include "operators/op_param.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+#ifdef FUSION_DEQUANT_BN_RELU_OP
+template <typename DeviceType, typename T>
+class FusionDequantBNReluKernel
+    : public framework::OpKernelBase<DeviceType,
+                                     FusionDequantBNReluParam<DeviceType>> {
+ public:
+  void Compute(const FusionDequantBNReluParam<DeviceType> &param);
+  bool Init(FusionDequantBNReluParam<DeviceType> *param);
+};
+#endif
+
+#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP
+template <typename DeviceType, typename T>
+class FusionDequantAddBNReluKernel
+    : public framework::OpKernelBase<DeviceType,
+                                     FusionDequantAddBNReluParam<DeviceType>> {
+ public:
+  void Compute(const FusionDequantAddBNReluParam<DeviceType> &param);
+  bool Init(FusionDequantAddBNReluParam<DeviceType> *param);
+};
+#endif
+
+} // namespace operators
+} // namespace paddle_mobile
diff --git a/src/operators/math/depthwise_conv3x3.cpp b/src/operators/math/depthwise_conv3x3.cpp
index 39b9b8d3f1c5c2bf09a3db5de5216dd1a08b491a..a4466a52fac228812e8df205a61bdb594775d327 100644
--- a/src/operators/math/depthwise_conv3x3.cpp
+++ b/src/operators/math/depthwise_conv3x3.cpp
@@ -1272,13 +1272,16 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,
 void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                             const framework::Tensor *filter,
-                            framework::Tensor *output, framework::Tensor bias,
+                            framework::Tensor *output, framework::Tensor *bias,
                             bool if_bias) {
 #if __ARM_NEON
   const float *input_data = input->data<float>();
   const float *filter_data = filter->data<float>();
   float *output_data = output->data<float>();
-  const float *bias_data = bias.data<float>();
+  const float *bias_data;
+  if (if_bias) {
+    bias_data = bias->data<float>();
+  }
   const int in_h = static_cast<int>(input->dims()[2]);
   const int in_w = static_cast<int>(input->dims()[3]);
@@ -1905,7 +1908,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
 void DepthwiseConv3x3s2p0(const framework::Tensor *input,
                           const framework::Tensor *filter,
-                          framework::Tensor *output, framework::Tensor bias,
+                          framework::Tensor *output, framework::Tensor *bias,
                           bool if_bias) {
 #if __ARM_NEON
@@ -1925,7 +1928,7 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
   for (int c = 0; c < input_channel; c++) {
     const float *filter_data = filter->data<float>() + c * 9;
     const float *input_data = input->data<float>() + c * inhxw;
-    const float *bias_data = bias.data<float>() + c;
+    const float *bias_data = bias->data<float>() + c;
     float *output_data = output->data<float>() + c * outhxw;
     float w00 = filter_data[0];
     float w01 = filter_data[1];
diff --git a/src/operators/math/depthwise_conv3x3.h b/src/operators/math/depthwise_conv3x3.h
index 72cadaf21553a428e1479d5548d2aa5f4fcdf90c..ca8f45fa5186fc1a2642a53f27526c6898bfb8e3 100644
--- a/src/operators/math/depthwise_conv3x3.h
+++ b/src/operators/math/depthwise_conv3x3.h
@@ -50,7 +50,7 @@ void DepthwiseConvAddBNRelu3x3s2p1(const framework::Tensor *input,
 void DepthwiseConv3x3s2p1v2(const framework::Tensor *input,
                             const framework::Tensor *filter,
-                            framework::Tensor *output, framework::Tensor bias,
+                            framework::Tensor *output, framework::Tensor *bias,
                             bool if_bias);
 void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
@@ -62,7 +62,7 @@ void DepthwiseConvAddBNRelu3x3s2p1v2(const framework::Tensor *input,
 void DepthwiseConv3x3s2p0(const framework::Tensor *input,
                           const framework::Tensor *filter,
-                          framework::Tensor *output, framework::Tensor bias,
+                          framework::Tensor *output, framework::Tensor *bias,
                           bool if_bias);
 // TODO(hjchen2) need to be implemented
@@ -70,16 +70,19 @@ void DepthwiseConv3x3s2p0(const framework::Tensor *input,
 // void DepthwiseConv3x3(const framework::Tensor *input,
 //                       const framework::Tensor *filter,
 //                       const std::vector<int> &strides,
+// const std::vector &paddings, // framework::Tensor *output); template -void DepthwiseConv3x3s1(const framework::Tensor &input, +void DepthwiseConv3x3S1(const framework::Tensor &input, const framework::Tensor &filter, + const std::vector &paddings, framework::Tensor *output); template -void DepthwiseConv3x3s2(const framework::Tensor &input, +void DepthwiseConv3x3S2(const framework::Tensor &input, const framework::Tensor &filter, + const std::vector &paddings, framework::Tensor *output); } // namespace math diff --git a/src/operators/math/depthwise_conv3x3_int8.cpp b/src/operators/math/depthwise_conv3x3_int8.cpp index ddd8f79f7ce350e048585917f96d82639d4ea951..9b4c6096ecdbd7adee27728ebaae47149392dad9 100644 --- a/src/operators/math/depthwise_conv3x3_int8.cpp +++ b/src/operators/math/depthwise_conv3x3_int8.cpp @@ -12,12 +12,300 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. */ +#if defined(__ARM_NEON__) && !defined(__aarch64__) + #include "operators/math/depthwise_conv3x3.h" +#ifdef __ARM_NEON__ +#include +#endif namespace paddle_mobile { namespace operators { namespace math { +template +inline void Depth3x3ValidColLoadInput(const int8_t *input, const int input_w, + const int valid_cols, int16x8_t *y0, + int16x8_t *y1, int16x8_t *y2) { + PADDLE_MOBILE_THROW_EXCEPTION("Stride %d is not supported.", Stride); +} + +template <> +inline void Depth3x3ValidColLoadInput<1>(const int8_t *input, const int input_w, + const int valid_cols, int16x8_t *y0, + int16x8_t *y1, int16x8_t *y2) { + int8_t fake_input[3][8]; + if (valid_cols == 1) { + for (int i = 0; i < 8; ++i, input += input_w) { + fake_input[0][i] = input[0]; + } + } else if (valid_cols == 2) { + for (int i = 0; i < 8; ++i, input += input_w) { + fake_input[0][i] = input[0]; + fake_input[1][i] = input[1]; + } + } else { + for (int i = 0; i < 8; ++i, input += input_w) { + fake_input[0][i] = input[0]; + fake_input[1][i] = input[1]; + fake_input[2][i] = input[2]; + } + } + int8x8_t input0 = vld1_s8(fake_input[0]); + int8x8_t input1 = vld1_s8(fake_input[1]); + int8x8_t input2 = vld1_s8(fake_input[2]); + y0[0] = vmovl_s8(input0); + y1[0] = vmovl_s8(input1); + y2[0] = vmovl_s8(input2); + y0[1] = vextq_s16(y0[0], y0[0], 1); + y0[2] = vextq_s16(y0[0], y0[0], 2); + y1[1] = vextq_s16(y1[0], y1[0], 1); + y1[2] = vextq_s16(y1[0], y1[0], 2); + y2[1] = vextq_s16(y2[0], y2[0], 1); + y2[2] = vextq_s16(y2[0], y2[0], 2); +} + +template <> +inline void Depth3x3ValidColLoadInput<2>(const int8_t *input, const int input_w, + const int valid_cols, int16x8_t *y0, + int16x8_t *y1, int16x8_t *y2) { + int8_t fake_input[3][13]; + if (valid_cols == 1) { + for (int i = 0; i < 13; ++i, input += input_w) { + fake_input[0][i] = input[0]; + } + } else if (valid_cols == 2) { + for (int i = 0; i < 13; ++i, input += input_w) { + fake_input[0][i] = input[0]; + fake_input[1][i] = input[1]; + } + } else { + for (int i = 0; i < 13; ++i, input += input_w) { + fake_input[0][i] = input[0]; + fake_input[1][i] = input[1]; + fake_input[2][i] = input[2]; + } + } + int8x8x2_t input0 = vld2_s8(fake_input[0]); + int8x8x2_t input1 = vld2_s8(fake_input[1]); + int8x8x2_t input2 = vld2_s8(fake_input[2]); + y0[0] = vmovl_s8(input0.val[0]); + y0[1] = vmovl_s8(input0.val[1]); + y0[2] = vextq_s16(y0[0], y0[0], 1); + y1[0] = vmovl_s8(input1.val[0]); + y1[1] = vmovl_s8(input1.val[1]); + y1[2] = vextq_s16(y1[0], y1[0], 1); + y2[0] = vmovl_s8(input2.val[0]); + y2[1] = 
vmovl_s8(input2.val[1]); + y2[2] = vextq_s16(y2[0], y2[0], 1); +} + +template +inline void DepthwiseConv3x3ValidCol(const int8_t *input, const int8_t *filter, + const int h_output, const int h_output_end, + const int w_output, const int input_h, + const int input_w, const int padding_h, + const int padding_w, const int output_w, + int32_t *output) { + const int w_in_start = -padding_w + w_output * Stride_w; + const int w_in_end = w_in_start + 3; + const int w_start = w_in_start > 0 ? w_in_start : 0; + const int w_end = w_in_end < input_w ? w_in_end : input_w; + int remain_start = h_output; + +#ifdef __ARM_NEON__ + int output_tiles = (h_output_end - h_output) / 6; + remain_start = h_output + output_tiles * 6; + int input_h_start = h_output * Stride_h - padding_h; + size_t input_offset = input_h_start * input_w + w_start; + size_t output_offset = h_output * output_w + w_output; + int16x8_t _input[3][3]; + int16x4_t _kernel[3]; + int32x4_t _sum0, _sum1; + const int8_t *filter_ptr = filter; + asm volatile( + "mov r0, #3 \n" + "vld1.s8 d10, [%[filter]], r0 \n" + "vld1.s8 d11, [%[filter]], r0 \n" + "vld1.s8 d12, [%[filter]] \n" + "vtrn.8 d10, d11 \n" + "vtrn.8 d12, d13 \n" + "vtrn.16 d10, d12 \n" + "vtrn.16 d11, d13 \n" + "vmovl.s8 q7, d10 \n" + "vmovl.s8 q8, d11 \n" + "vmovl.s8 q9, d12 \n" + "vmov.32 %[_kernel0], d14 \n" + "vmov.32 %[_kernel1], d16 \n" + "vmov.32 %[_kernel2], d18 \n" + : [_kernel0] "+w"(_kernel[0]), [_kernel1] "+w"(_kernel[1]), + [_kernel2] "+w"(_kernel[2]) + : [filter] "r"(filter_ptr) + : "memory", "q5", "q6", "q7", "q8", "q9", "r0"); + int valid_cols = w_end - w_start; + for (int h = 0; h < output_tiles * 6; h += 6) { + int32_t *output0 = output + output_offset; + int32_t *output1 = output0 + output_w; + int32_t *output2 = output1 + output_w; + int32_t *output3 = output2 + output_w; + int32_t *output4 = output3 + output_w; + int32_t *output5 = output4 + output_w; + Depth3x3ValidColLoadInput(input + input_offset, input_w, + valid_cols, _input[0], _input[1], + _input[2]); + _sum0 = veorq_s32(_sum0, _sum0); + _sum1 = veorq_s32(_sum1, _sum1); + for (int w_in = 0; w_in < valid_cols; ++w_in) { + int index = w_in + w_start - w_in_start; + _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_input[w_in][0]), + _kernel[index], 0); + _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_input[w_in][1]), + _kernel[index], 1); + _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(_input[w_in][2]), + _kernel[index], 2); + _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_input[w_in][0]), + _kernel[index], 0); + _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_input[w_in][1]), + _kernel[index], 1); + _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(_input[w_in][2]), + _kernel[index], 2); + } + vst1q_lane_s32(output0, _sum0, 0); + vst1q_lane_s32(output1, _sum0, 1); + vst1q_lane_s32(output2, _sum0, 2); + vst1q_lane_s32(output3, _sum0, 3); + vst1q_lane_s32(output4, _sum1, 0); + vst1q_lane_s32(output5, _sum1, 1); + input_offset += 6 * Stride_h * input_w; + output_offset += 6 * output_w; + } +#endif + for (int h = remain_start; h < h_output_end; ++h) { + int32_t value = 0; + const int h_in_start = -padding_h + h * Stride_h; + for (int i = 0; i < 3; ++i) { + for (int w_in = w_start; w_in < w_end; ++w_in) { + value += filter[i * 3 + (w_in - w_in_start)] * + input[(h_in_start + i) * input_w + w_in]; + } + } + output[h * output_w + w_output] = value; + } +} + +#define DEPTHWISE_CONV_NORMAL_BORDER(start, end) \ + for (int w = start; w < end; ++w) { \ + const int w_in_start = -padding_w + w * Stride_w; \ + const int w_in_end = w_in_start + 
3; \ + const int w_start = w_in_start > 0 ? w_in_start : 0; \ + const int w_end = w_in_end < input_w ? w_in_end : input_w; \ + int32_t value = 0; \ + for (int h_in = h_start; h_in < h_end; ++h_in) { \ + for (int w_in = w_start; w_in < w_end; ++w_in) { \ + value += filter[(h_in - h_in_start) * 3 + (w_in - w_in_start)] * \ + input[h_in * input_w + w_in]; \ + } \ + } \ + output_ptr[w] = value; \ + } + +template +inline void Depth3x3NormalRowLoadInput(const int8_t *input, + int16x8_t &y0, // NOLINT + int16x8_t &y1, // NOLINT + int16x8_t &y2) { // NOLINT + PADDLE_MOBILE_THROW_EXCEPTION("Stride %d is not supported.", Stride); +} + +template <> +inline void Depth3x3NormalRowLoadInput<1>(const int8_t *input, + int16x8_t &y0, // NOLINT + int16x8_t &y1, // NOLINT + int16x8_t &y2) { // NOLINT + int8x8_t x0 = vld1_s8(input); + y0 = vmovl_s8(x0); + y1 = vextq_s16(y0, y0, 1); + y2 = vextq_s16(y1, y1, 1); +} + +template <> +inline void Depth3x3NormalRowLoadInput<2>(const int8_t *input, + int16x8_t &y0, // NOLINT + int16x8_t &y1, // NOLINT + int16x8_t &y2) { // NOLINT + int8x8x2_t x0 = vld2_s8(input); + y0 = vmovl_s8(x0.val[0]); + y1 = vmovl_s8(x0.val[1]); + y2 = vextq_s16(y0, y0, 1); +} + +template +inline void DepthwiseConv3x3NormalRow(const int8_t *input, const int8_t *filter, + const int h_output, const int input_h, + const int input_w, const int padding_h, + const int padding_w, const int output_w, + int32_t *output) { + const int h_in_start = -padding_h + h_output * Stride_h; + const int h_in_end = h_in_start + 3; + const int h_start = h_in_start > 0 ? h_in_start : 0; + const int h_end = h_in_end < input_h ? h_in_end : input_h; + + int valid_w_start = (padding_w + Stride_w - 1) / Stride_w; + int valid_w_end = output_w - valid_w_start; + + int32_t *output_ptr = output + h_output * output_w; + // border left + DEPTHWISE_CONV_NORMAL_BORDER(0, valid_w_start) + // middle + int remain_start = valid_w_start; +#ifdef __ARM_NEON__ + int output_tiles = (valid_w_end - valid_w_start) / 6; + remain_start = valid_w_start + output_tiles * 6; + int32x4_t _sum0, _sum1; + int16x8_t y0, y1, y2; + int16x4_t _kernel[3]; + for (int h_in = h_start; h_in < h_end; ++h_in) { + int index = h_in - h_in_start; + int8x8_t w0 = vld1_s8(filter + index * 3); + int16x8_t w1 = vmovl_s8(w0); + _kernel[index] = vget_low_s16(w1); + } + for (int w = 0; w < output_tiles * 6; w += 6) { + _sum0 = veorq_s32(_sum0, _sum0); + _sum1 = veorq_s32(_sum1, _sum1); + int output_offset = valid_w_start + w; + int input_w_offset = output_offset * Stride_w - padding_w; + for (int h_in = h_start; h_in < h_end; ++h_in) { + int index = h_in - h_in_start; + Depth3x3NormalRowLoadInput( + input + h_in * input_w + input_w_offset, y0, y1, y2); + _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(y0), _kernel[index], 0); + _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(y1), _kernel[index], 1); + _sum0 = vmlal_lane_s16(_sum0, vget_low_s16(y2), _kernel[index], 2); + _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(y0), _kernel[index], 0); + _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(y1), _kernel[index], 1); + _sum1 = vmlal_lane_s16(_sum1, vget_high_s16(y2), _kernel[index], 2); + } + vst1q_s32(output_ptr + output_offset, _sum0); + vst1q_lane_s32(output_ptr + output_offset + 4, _sum1, 0); + vst1q_lane_s32(output_ptr + output_offset + 5, _sum1, 1); + } +#endif + for (int w = remain_start; w < valid_w_end; ++w) { + int32_t value = 0; + int input_start = -padding_w + w * Stride_w; + for (int h_in = h_start; h_in < h_end; ++h_in) { + for (int j = 0; j < 3; ++j) { + value += filter[(h_in 
- h_in_start) * 3 + j] * + input[h_in * input_w + j + input_start]; + } + } + output_ptr[w] = value; + } + // border right + DEPTHWISE_CONV_NORMAL_BORDER(valid_w_end, output_w) +} + // template<> // void DepthwiseConv3x3( // const framework::Tensor *input, const framework::Tensor *filter, @@ -27,43 +315,72 @@ namespace math { // } template <> -void DepthwiseConv3x3s1(const framework::Tensor &input, +void DepthwiseConv3x3S1(const framework::Tensor &input, const framework::Tensor &filter, + const std::vector &paddings, framework::Tensor *output) { const int8_t *input_data = input.data(); const int8_t *filter_data = filter.data(); int32_t *out_data = output->mutable_data(); - // make sure that batch size is 1 - int input_c = input.dims()[1]; int input_h = input.dims()[2]; int input_w = input.dims()[3]; - int output_c = output->dims()[1]; int output_h = output->dims()[2]; int output_w = output->dims()[3]; + int padding_h = paddings[0]; + int padding_w = paddings[1]; int image_size = input_h * input_w; int out_image_size = output_h * output_w; -#if __aarch64__ - // TODO(hjchen2) -#else + int valid_h_start = padding_h; + int valid_h_end = output_h - valid_h_start; + int valid_h = valid_h_end - valid_h_start; + int valid_w_start = padding_w; + int valid_w_end = output_w - valid_w_start; + int valid_w = valid_w_end - valid_w_start; + #pragma omp parallel for - for (int g = 0; g < input_c; ++g) { - const int8_t* input_ptr = input_data + g * image_size; - const int8_t* filter_ptr = filter_data + g * 9; - int32_t* output_ptr = out_data + g * out_image_size; - int loops = (input_w - 2) / 6; - int remain = input_w - 2 - loops * 6; - for (int h = 0; h < input_h - 5 /*(input_h - 2) - 3*/; h += 4) { - const int8_t* input_ptr0 = input_ptr + h * input_w; - const int8_t* input_ptr1 = input_ptr0 + input_w; - const int8_t* input_ptr2 = input_ptr1 + input_w; - const int8_t* input_ptr3 = input_ptr2 + input_w; - const int8_t* input_ptr4 = input_ptr3 + input_w; - const int8_t* input_ptr5 = input_ptr4 + input_w; - int32_t* output_ptr0 = output_ptr + h * output_w; - int32_t* output_ptr1 = output_ptr0 + output_w; - int32_t* output_ptr2 = output_ptr1 + output_w; - int32_t* output_ptr3 = output_ptr2 + output_w; - int loop = loops; + for (int g = 0; g < input.dims()[1]; ++g) { + const int8_t *input_ptr = input_data + g * image_size; + const int8_t *filter_ptr = filter_data + g * 9; + int32_t *output_ptr = out_data + g * out_image_size; + // top + for (int h = 0; h < valid_h_start; ++h) { + DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, + input_w, padding_h, padding_w, output_w, + output_ptr); + } + // left + for (int w = 0; w < valid_w_start; ++w) { + DepthwiseConv3x3ValidCol<1, 1>( + input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h, + input_w, padding_h, padding_w, output_w, output_ptr); + } + // right + for (int w = valid_w_end; w < output_w; ++w) { + DepthwiseConv3x3ValidCol<1, 1>( + input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h, + input_w, padding_h, padding_w, output_w, output_ptr); + } + // bottom + for (int h = valid_h_end; h < output_h; ++h) { + DepthwiseConv3x3NormalRow<1, 1>(input_ptr, filter_ptr, h, input_h, + input_w, padding_h, padding_w, output_w, + output_ptr); + } + // valid + int output_w_tiles = valid_w / 6; + int output_w_remain = valid_w - output_w_tiles * 6; + for (int h = valid_h_start; h < valid_h_end - 3; h += 4) { + const int8_t *input_ptr0 = input_ptr + (h - padding_h) * input_w; + const int8_t *input_ptr1 = input_ptr0 + input_w; + const int8_t 
*input_ptr2 = input_ptr1 + input_w; + const int8_t *input_ptr3 = input_ptr2 + input_w; + const int8_t *input_ptr4 = input_ptr3 + input_w; + const int8_t *input_ptr5 = input_ptr4 + input_w; + int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start; + int32_t *output_ptr1 = output_ptr0 + output_w; + int32_t *output_ptr2 = output_ptr1 + output_w; + int32_t *output_ptr3 = output_ptr2 + output_w; + int loop = output_w_tiles; asm volatile( "vld1.32 {q0}, [%[filter_ptr]] \n" "vmovl.s8 q14, d0 \n" @@ -377,27 +694,27 @@ void DepthwiseConv3x3s1(const framework::Tensor &input, "vst1.32 {d24[0]}, [%[output_ptr1]]! \n" "vst1.32 {d28[0]}, [%[output_ptr2]]! \n" "vst1.32 {d10[0]}, [%[output_ptr3]]! \n" - "end_%=: \n" + "end_%=: \n" : [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1), [output_ptr2] "+r"(output_ptr2), [output_ptr3] "+r"(output_ptr3), [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5), [loop] "+r"(loop) - : [remain] "r"(remain) + : [remain] "r"(output_w_remain) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0"); } // remain height - int start_h = (input_h - 2) & 0xFFFC; - for (int h = start_h; h < input_h - 3 /*(input_h - 2) - 1*/; h += 2) { - const int8_t* input_ptr0 = input_ptr + h * input_w; - const int8_t* input_ptr1 = input_ptr0 + input_w; - const int8_t* input_ptr2 = input_ptr1 + input_w; - const int8_t* input_ptr3 = input_ptr2 + input_w; - int32_t* output_ptr0 = output_ptr + h * output_w; - int32_t* output_ptr1 = output_ptr0 + output_w; - int loop = loops; + int start_h = valid_h_start + (valid_h & 0xFFFC); + for (int h = start_h; h < valid_h_end - 1; h += 2) { + const int8_t *input_ptr0 = input_ptr + (h - padding_h) * input_w; + const int8_t *input_ptr1 = input_ptr0 + input_w; + const int8_t *input_ptr2 = input_ptr1 + input_w; + const int8_t *input_ptr3 = input_ptr2 + input_w; + int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start; + int32_t *output_ptr1 = output_ptr0 + output_w; + int loop = output_w_tiles; asm volatile( "vld1.32 {q0}, [%[filter_ptr]] \n" "vmovl.s8 q14, d0 \n" @@ -415,9 +732,9 @@ void DepthwiseConv3x3s1(const framework::Tensor &input, : [filter_ptr] "r"(filter_ptr) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15"); asm volatile( - "mov r0, #6 \n" "cmp %[loop], #0 \n" "ble start_remain_%= \n" + "mov r0, #6 \n" // loop 6 widths "loop_2h6w_%=: \n" "vld1.32 {d9}, [%[input_ptr0]], r0 \n" @@ -589,23 +906,23 @@ void DepthwiseConv3x3s1(const framework::Tensor &input, "blt end_%= \n" "vst1.32 {d20[0]}, [%[output_ptr0]]! \n" "vst1.32 {d24[0]}, [%[output_ptr1]]! 
\n" - "end_%=: \n" + "end_%=: \n" : [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1), [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), [loop] "+r"(loop) - : [remain] "r"(remain) + : [remain] "r"(output_w_remain) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "r0"); } - start_h = (input_h - 2) & 0xFFFE; - if (start_h < input_h - 2) { - const int8_t* input_ptr0 = input_ptr + start_h * input_w; - const int8_t* input_ptr1 = input_ptr0 + input_w; - const int8_t* input_ptr2 = input_ptr1 + input_w; - int32_t* output_ptr0 = output_ptr + start_h * output_w; - int loop = loops; + start_h = valid_h_start + (valid_h & 0xFFFE); + if (start_h < valid_h_end) { + const int8_t *input_ptr0 = input_ptr + (start_h - padding_h) * input_w; + const int8_t *input_ptr1 = input_ptr0 + input_w; + const int8_t *input_ptr2 = input_ptr1 + input_w; + int32_t *output_ptr0 = output_ptr + start_h * output_w + valid_w_start; + int loop = output_w_tiles; asm volatile( "vld1.32 {q0}, [%[filter_ptr]] \n" "vmovl.s8 q14, d0 \n" @@ -623,9 +940,9 @@ void DepthwiseConv3x3s1(const framework::Tensor &input, : [filter_ptr] "r"(filter_ptr) : "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15"); asm volatile( - "mov r0, #6 \n" "cmp %[loop], #0 \n" "ble start_remain_%= \n" + "mov r0, #6 \n" // loop 6 widths "loop_1h6w_%=: \n" "vld1.32 {d9}, [%[input_ptr0]], r0 \n" @@ -736,56 +1053,91 @@ void DepthwiseConv3x3s1(const framework::Tensor &input, "cmp %[remain], #1 \n" "blt end_%= \n" "vst1.32 {d20[0]}, [%[output_ptr0]]! \n" - "end_%=: \n" + "end_%=: \n" : [output_ptr0] "+r"(output_ptr0), [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2), [loop] "+r"(loop) - : [remain] "r"(remain) + : [remain] "r"(output_w_remain) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "r0"); } } -#endif // __aarch64__ } template <> -void DepthwiseConv3x3s2(const framework::Tensor &input, +void DepthwiseConv3x3S2(const framework::Tensor &input, const framework::Tensor &filter, + const std::vector &paddings, framework::Tensor *output) { const int8_t *input_data = input.data(); const int8_t *filter_data = filter.data(); int32_t *out_data = output->mutable_data(); - // make sure that batch size is 1 - int input_c = input.dims()[1]; int input_h = input.dims()[2]; int input_w = input.dims()[3]; - int output_c = output->dims()[1]; int output_h = output->dims()[2]; int output_w = output->dims()[3]; + int padding_h = paddings[0]; + int padding_w = paddings[1]; int image_size = input_h * input_w; int out_image_size = output_h * output_w; -#if __aarch64__ - // TODO(hjchen2) -#else + int valid_h_start = (padding_h + 1) / 2; + int valid_h_end = output_h - valid_h_start; + int valid_h = valid_h_end - valid_h_start; + int valid_w_start = (padding_w + 1) / 2; + int valid_w_end = output_w - valid_w_start; + int valid_w = valid_w_end - valid_w_start; + + // DLOG << "valid_h_start: " << valid_h_start; + // DLOG << "valid_h_end: " << valid_h_end; + // DLOG << "valid_w_start: " << valid_w_start; + // DLOG << "valid_w_end: " << valid_w_end; + #pragma omp parallel for - for (int g = 0; g < input_c; ++g) { - const int8_t* input_ptr = input_data + g * image_size; - const int8_t* filter_ptr = filter_data + g * 9; - int32_t* output_ptr = out_data + g * out_image_size; - int loops = output_w / 6; - int remain = output_w - loops * 6; - for (int h = 0; h < 
input_h - 6 /*(input_h - 1) - 5*/; h += 6) { - const int8_t* input_ptr0 = input_ptr + h * input_w; - const int8_t* input_ptr1 = input_ptr0 + input_w; - const int8_t* input_ptr2 = input_ptr1 + input_w; - const int8_t* input_ptr3 = input_ptr2 + input_w; - const int8_t* input_ptr4 = input_ptr3 + input_w; - const int8_t* input_ptr5 = input_ptr4 + input_w; - const int8_t* input_ptr6 = input_ptr5 + input_w; - int32_t* output_ptr0 = output_ptr + (h >> 1) * output_w; - int32_t* output_ptr1 = output_ptr0 + output_w; - int32_t* output_ptr2 = output_ptr1 + output_w; - int loop = loops; + for (int g = 0; g < input.dims()[1]; ++g) { + const int8_t *input_ptr = input_data + g * image_size; + const int8_t *filter_ptr = filter_data + g * 9; + int32_t *output_ptr = out_data + g * out_image_size; + // top + for (int h = 0; h < valid_h_start; ++h) { + DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h, + input_w, padding_h, padding_w, output_w, + output_ptr); + } + // left + for (int w = 0; w < valid_w_start; ++w) { + DepthwiseConv3x3ValidCol<2, 2>( + input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h, + input_w, padding_h, padding_w, output_w, output_ptr); + } + // right + for (int w = valid_w_end; w < output_w; ++w) { + DepthwiseConv3x3ValidCol<2, 2>( + input_ptr, filter_ptr, valid_h_start, valid_h_end, w, input_h, + input_w, padding_h, padding_w, output_w, output_ptr); + } + // bottom + for (int h = valid_h_end; h < output_h; ++h) { + DepthwiseConv3x3NormalRow<2, 2>(input_ptr, filter_ptr, h, input_h, + input_w, padding_h, padding_w, output_w, + output_ptr); + } + // valid + int input_w_start = 2 * valid_w_start - padding_w; + int output_w_tiles = valid_w / 6; + int output_w_remain = valid_w - output_w_tiles * 6; + for (int h = valid_h_start; h < valid_h_end - 2; h += 3) { + size_t offset = (2 * h - padding_h) * input_w + input_w_start; + const int8_t *input_ptr0 = input_ptr + offset; + const int8_t *input_ptr1 = input_ptr0 + input_w; + const int8_t *input_ptr2 = input_ptr1 + input_w; + const int8_t *input_ptr3 = input_ptr2 + input_w; + const int8_t *input_ptr4 = input_ptr3 + input_w; + const int8_t *input_ptr5 = input_ptr4 + input_w; + const int8_t *input_ptr6 = input_ptr5 + input_w; + int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start; + int32_t *output_ptr1 = output_ptr0 + output_w; + int32_t *output_ptr2 = output_ptr1 + output_w; + int loop = output_w_tiles; asm volatile( "vld1.32 {q0}, [%[filter_ptr]] \n" "vmovl.s8 q14, d0 \n" @@ -803,9 +1155,9 @@ void DepthwiseConv3x3s2(const framework::Tensor &input, : [filter_ptr] "r"(filter_ptr) : "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15"); asm volatile( - "mov r0, #12 \n" "cmp %[loop], #0 \n" "ble start_remain_%= \n" + "mov r0, #12 \n" // loop 6 widths "loop_3h6w_%=: \n" "vld2.8 {d10, d11}, [%[input_ptr0]], r0 \n" @@ -1048,25 +1400,26 @@ void DepthwiseConv3x3s2(const framework::Tensor &input, "vst1.32 {d20[0]}, [%[output_ptr0]]! \n" "vst1.32 {d24[0]}, [%[output_ptr1]]! \n" "vst1.32 {d28[0]}, [%[output_ptr2]]! 
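The border bookkeeping introduced above for the stride-2 kernel is plain integer arithmetic: output row h reads input rows 2*h - padding_h through 2*h - padding_h + 2, so rows with h < (padding_h + 1) / 2 touch the top padding and are routed to DepthwiseConv3x3NormalRow, and likewise for the left, right, and bottom borders. A worked example; the valid-region formulas are the ones in the hunk, while the 224x224 / pad 1 numbers and the standard (in + 2*pad - 3) / 2 + 1 output-size formula are added here purely for illustration:

#include <cstdio>

int main() {
  int input_h = 224, input_w = 224;
  int padding_h = 1, padding_w = 1;
  int output_h = (input_h + 2 * padding_h - 3) / 2 + 1;  // 112
  int output_w = (input_w + 2 * padding_w - 3) / 2 + 1;  // 112
  int valid_h_start = (padding_h + 1) / 2;     // rows below this index touch the top padding
  int valid_h_end = output_h - valid_h_start;  // rows at or beyond this are border rows
  int valid_w_start = (padding_w + 1) / 2;
  int valid_w_end = output_w - valid_w_start;
  std::printf("output %dx%d, valid rows [%d, %d), valid cols [%d, %d)\n",
              output_h, output_w, valid_h_start, valid_h_end, valid_w_start, valid_w_end);
  return 0;  // output 112x112, valid rows [1, 111), valid cols [1, 111)
}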
\n" - "end_%=: \n" + "end_%=: \n" : [output_ptr0] "+r"(output_ptr0), [output_ptr1] "+r"(output_ptr1), [output_ptr2] "+r"(output_ptr2), [input_ptr6] "+r"(input_ptr6), [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2), [input_ptr3] "+r"(input_ptr3), [input_ptr4] "+r"(input_ptr4), [input_ptr5] "+r"(input_ptr5), [loop] "+r"(loop) - : [remain] "r"(remain) + : [remain] "r"(output_w_remain) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "q13", "q14", "q15", "r0"); } - int start_h = (output_h / 3) * 6; - for (int h = start_h; h < input_h - 2 /*(input_h - 1) - 1*/; h += 2) { - const int8_t* input_ptr0 = input_ptr + h * input_w; - const int8_t* input_ptr1 = input_ptr0 + input_w; - const int8_t* input_ptr2 = input_ptr1 + input_w; - int32_t* output_ptr0 = output_ptr + (h >> 1) * output_w; - int loop = loops; + int start_h = valid_h_start + valid_h / 3 * 3; + for (int h = start_h; h < valid_h_end; ++h) { + size_t offset = (2 * h - padding_h) * input_w + input_w_start; + const int8_t *input_ptr0 = input_ptr + offset; + const int8_t *input_ptr1 = input_ptr0 + input_w; + const int8_t *input_ptr2 = input_ptr1 + input_w; + int32_t *output_ptr0 = output_ptr + h * output_w + valid_w_start; + int loop = output_w_tiles; asm volatile( "vld1.32 {q0}, [%[filter_ptr]] \n" "vmovl.s8 q14, d0 \n" @@ -1084,9 +1437,9 @@ void DepthwiseConv3x3s2(const framework::Tensor &input, : [filter_ptr] "r"(filter_ptr) : "memory", "q0", "q1", "q2", "q3", "q4", "q14", "q15"); asm volatile( - "mov r0, #12 \n" "cmp %[loop], #0 \n" "ble start_remain_%= \n" + "mov r0, #12 \n" // loop 6 widths "loop_1h6w_%=: \n" "vld2.8 {d10, d11}, [%[input_ptr0]], r0 \n" @@ -1190,18 +1543,19 @@ void DepthwiseConv3x3s2(const framework::Tensor &input, "cmp %[remain], #1 \n" "blt end_%= \n" "vst1.32 {d22[0]}, [%[output_ptr0]]! 
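For readers not fluent in NEON, the valid-region body above vectorizes the following scalar computation: each int32 output element is the 3x3 dot product of int8 inputs and int8 weights at a window whose top-left corner sits at (2*h - padding_h, 2*w - padding_w). The helper below is a reference sketch only (depthwise3x3s2_at is not a function in the patch) and assumes the caller stays inside the valid region:

#include <cstdint>

// Reference-only scalar form of one valid stride-2 3x3 depthwise output element
// (int8 x int8 multiplies accumulated into int32, as in the assembly above).
static int32_t depthwise3x3s2_at(const int8_t *input, int input_w,
                                 const int8_t *filter,  // 9 weights, row-major
                                 int out_h, int out_w, int padding_h, int padding_w) {
  int in_h0 = 2 * out_h - padding_h;  // top row of the 3x3 window
  int in_w0 = 2 * out_w - padding_w;  // left column of the window
  int32_t acc = 0;
  for (int i = 0; i < 3; ++i) {
    for (int j = 0; j < 3; ++j) {
      acc += static_cast<int32_t>(input[(in_h0 + i) * input_w + in_w0 + j]) *
             static_cast<int32_t>(filter[i * 3 + j]);
    }
  }
  return acc;
}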
\n" - "end_%=: \n" + "end_%=: \n" : [output_ptr0] "+r"(output_ptr0), [input_ptr0] "+r"(input_ptr0), [input_ptr1] "+r"(input_ptr1), [input_ptr2] "+r"(input_ptr2), [loop] "+r"(loop) - : [remain] "r"(remain) + : [remain] "r"(output_w_remain) : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", "q9", "q10", "q11", "q12", "r0"); } } -#endif // __aarch64__ } } // namespace math } // namespace operators } // namespace paddle_mobile + +#endif diff --git a/src/operators/op_param.h b/src/operators/op_param.h index 289e29c382ad39006fb65b38be3bc9ebfc58fed6..381b66199892df9f24eca63470314e7652f5a72a 100644 --- a/src/operators/op_param.h +++ b/src/operators/op_param.h @@ -405,9 +405,9 @@ class ConvParam : public OpParam { const RType *Input() const { return input_; } - RType *&Filter() const { return filter_; } + RType *Filter() const { return filter_; } - RType *&Output() const { return output_; } + RType *Output() const { return output_; } const vector &Strides() const { return strides_; } @@ -419,6 +419,8 @@ class ConvParam : public OpParam { EXEC_INVALID = 0, EXEC_GEMM_FLOAT, EXEC_DEPTHWISE3x3S1P1_FLOAT, + EXEC_DEPTHWISE3x3S2P0_FLOAT, + EXEC_DEPTHWISE3x3S2P1_FLOAT, EXEC_DEPTHWISE3x3_FLOAT, EXEC_WINOGRAD3X3_FLOAT, EXEC_WINOGRAD5X5_FLOAT, @@ -439,8 +441,8 @@ class ConvParam : public OpParam { protected: RType *input_; - mutable RType *output_; - mutable RType *filter_; + RType *output_; + RType *filter_; vector strides_; vector paddings_; vector dilations_; @@ -2585,7 +2587,9 @@ class DequantizeParam : public OpParam { DequantizeParam(const VariableNameMap &inputs, const VariableNameMap &outputs, const AttributeMap &attrs, const Scope &scope) { input_ = InputXFrom(inputs, scope); - output_ = OutFrom(outputs, scope); + if (outputs.count("Out")) { + output_ = OutFrom(outputs, scope); + } activation_scale_ = OpParam::GetVarValue("Scale", inputs, scope); // dequantization is performed as x = x / static_scale / online_scale if (HasAttr("weight_scale", attrs)) { @@ -2605,20 +2609,19 @@ class DequantizeParam : public OpParam { }; #endif -#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP +#if defined(FUSION_DEQUANT_ADD_BN_OP) || \ + defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || \ + defined(FUSION_DEQUANT_BN_RELU_OP) || defined(FUSION_DEQUANT_BN_OP) template -class FusionDequantAddBNReluParam : public DequantizeParam { +class FusionDequantBNParam : public DequantizeParam { typedef typename DtypeTensorTrait::gtype GType; typedef typename DtypeTensorTrait::rtype RType; public: - FusionDequantAddBNReluParam(const VariableNameMap &inputs, - const VariableNameMap &outputs, - const AttributeMap &attrs, const Scope &scope) + FusionDequantBNParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) : DequantizeParam(inputs, outputs, attrs, scope) { - // element wise add params - axis_ = OpParam::GetAttr("axis", attrs); - bias_ = OpParam::InputYFrom(inputs, scope); // batch norm params bn_mean_ = OpParam::GetVarValue("BNMean", inputs, scope); bn_variance_ = OpParam::GetVarValue("BNVariance", inputs, scope); @@ -2626,21 +2629,83 @@ class FusionDequantAddBNReluParam : public DequantizeParam { bn_bias_ = OpParam::GetVarValue("BNBias", inputs, scope); epsilon_ = OpParam::GetAttr("epsilon", attrs); // output - output_ = OpParam::OutFrom(outputs, scope); + if (outputs.count("Y")) { + this->output_ = OpParam::OutputYFrom(outputs, scope); + } } public: - // elementwise add - int axis_; - RType *bias_; // batch norm RType *bn_mean_; RType *bn_variance_; RType 
*bn_scale_; RType *bn_bias_; float epsilon_; - // output - RType *output_; +}; +#endif + +#if defined(FUSION_DEQUANT_ADD_BN_RELU_OP) || defined(FUSION_DEQUANT_ADD_BN_OP) +template +class FusionDequantAddBNParam : public FusionDequantBNParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionDequantAddBNParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : FusionDequantBNParam(inputs, outputs, attrs, scope) { + // element wise add params + axis_ = OpParam::GetAttr("axis", attrs); + bias_ = OpParam::InputYFrom(inputs, scope); + // output + if (outputs.count("Y")) { + this->output_ = OpParam::OutputYFrom(outputs, scope); + } + } + + public: + // elementwise add + int axis_; + RType *bias_; +}; +#endif + +#ifdef FUSION_DEQUANT_BN_RELU_OP +template +class FusionDequantBNReluParam : public FusionDequantBNParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionDequantBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : FusionDequantBNParam(inputs, outputs, attrs, scope) { + // output + if (outputs.count("Out")) { + this->output_ = OpParam::OutFrom(outputs, scope); + } + } +}; +#endif + +#ifdef FUSION_DEQUANT_ADD_BN_RELU_OP +template +class FusionDequantAddBNReluParam : public FusionDequantAddBNParam { + typedef typename DtypeTensorTrait::gtype GType; + typedef typename DtypeTensorTrait::rtype RType; + + public: + FusionDequantAddBNReluParam(const VariableNameMap &inputs, + const VariableNameMap &outputs, + const AttributeMap &attrs, const Scope &scope) + : FusionDequantAddBNParam(inputs, outputs, attrs, scope) { + // output + if (outputs.count("Out")) { + this->output_ = OpParam::OutFrom(outputs, scope); + } + } }; #endif diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp index 4d05328179fa2acc771e08a6dfddea4f770d9780..1a5daafe2b784b98b102fa2eab04f71c67260d9c 100644 --- a/test/fpga/test_resnet50.cpp +++ b/test/fpga/test_resnet50.cpp @@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. See the License for the specific language governing permissions and limitations under the License. 
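The parameter classes above only gather variables; the arithmetic the fused ops stand for is the dequantization documented earlier in this patch (x = x / static_scale / online_scale), followed by an optional elementwise bias, batch norm, and an optional ReLU, per output channel. A sketch under the assumption that the stages compose in the order the op names list them; the function and argument names here are illustrative, not fields of the param classes:

#include <cmath>
#include <cstdint>

// Per-channel math behind fusion_dequant[_add]_bn[_relu]: dequantize the int32
// accumulator, add the elementwise bias (only the *_add_* variants), apply batch
// norm, then ReLU (only the *_relu variants).
static float fused_dequant_bn(int32_t q, float activation_scale, float weight_scale,
                              float bias, float bn_mean, float bn_variance,
                              float bn_scale, float bn_bias, float epsilon, bool relu) {
  float x = static_cast<float>(q) / activation_scale / weight_scale;  // dequantize
  float inv_std = 1.f / std::sqrt(bn_variance + epsilon);
  float y = (x + bias - bn_mean) * inv_std * bn_scale + bn_bias;      // add + batch norm
  return relu ? (y > 0.f ? y : 0.f) : y;
}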
*/ #include +#include +#include #include "../test_include.h" #ifdef PADDLE_MOBILE_FPGA_V1 @@ -87,26 +89,29 @@ int main() { paddle_mobile::PaddleMobile paddle_mobile; if (paddle_mobile.Load(std::string(g_resnet50), true)) { Tensor input_tensor; - SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(0), - static_cast(1)); + SetupTensor(&input_tensor, {1, 3, 224, 224}, static_cast(2), + static_cast(2)); readStream(g_image_src_float, input_tensor.mutable_data({1, 3, 224, 224})); paddle_mobile.FeedData(input_tensor); paddle_mobile.Predict_To(-1); - /*for(int i = 0; i < 73; i++) - { + for (int i = 0; i < 73; i++) { auto tensor_ptr = paddle_mobile.FetchResult(i); - std::string saveName = "resnet50_result_" + std::to_string (i); + std::string saveName = "resnet50_result_" + std::to_string(i); paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data(), - tensor_ptr->numel()); dump_stride(saveName, (*tensor_ptr), 20); - //dump(saveName, (*tensor_ptr)); - }*/ + tensor_ptr->numel() * sizeof(half)); + dump_stride(saveName, (*tensor_ptr), 20); + // dump(saveName, (*tensor_ptr)); + } - /*std::shared_ptr output_tensor = paddle_mobile.FetchResult(73); - (*output_tensor).dump("resnet50_result_73"); + std::shared_ptr output_tensor = paddle_mobile.FetchResult(73); + //(*output_tensor).dump("resnet50_result_73"); output_tensor = paddle_mobile.FetchResult(74); - (*output_tensor).dump("resnet50_result_74");*/ - std::shared_ptr output_tensor = paddle_mobile.FetchResult(74); + //(*output_tensor).dump("resnet50_result_74"); + // std::shared_ptr output_tensor = paddle_mobile.FetchResult(74); + + // output_tensor = paddle_mobile.FetchResult(74); + float max = 0; auto data_ptr = output_tensor->data(); int maximumIdx = 0; @@ -116,7 +121,7 @@ int main() { max = data_ptr[i]; } } - std::cout << "index : " << maximumIdx << ", value : " << max + std::cout << "index : " << std::dec << maximumIdx << ", value : " << max << std::endl; std::cout << "Computation done" << std::endl; return 0; diff --git a/test/operators/test_quantize_op.cpp b/test/operators/test_quantize_op.cpp index 9988661bcb898daa5e79b6d22d65d90cfa03c668..50c0e7bd05da7f7a5ee1fd6912be0eff2f6e2958 100644 --- a/test/operators/test_quantize_op.cpp +++ b/test/operators/test_quantize_op.cpp @@ -44,25 +44,19 @@ struct Round { template <> struct Round { int8_t operator()(float x) { - int8_t ret = 0; float v = std::round(x); - int32_t q = (int32_t)v; - if (abs(abs(q - x) - 0.5) > 0) { - ret = q; - } else { - if (abs(q) % 2 == 0) { - ret = q; - } else { - ret = q + ((q > 0) ? -1 : 1); + int32_t q = static_cast(v); + if (abs(abs(q - v) - 0.5) <= 0) { + if (abs(q) % 2 != 0) { + q = q + ((q > 0) ? 
-1 : 1); } } - return ret; + return static_cast(q); } }; template -static void quantize(const Tensor *input, const float scale, const int pad, - const int8_t pad_val, Tensor *output) { +static void quantize(const Tensor *input, const float scale, Tensor *output) { int batch_size = input->dims()[0]; int channels = input->dims()[1]; int input_h = input->dims()[2]; @@ -77,29 +71,9 @@ static void quantize(const Tensor *input, const float scale, const int pad, for (int nc = 0; nc < batch_size * channels; ++nc) { const float *xh = x + nc * input_spatial; int8_t *yh = y + nc * output_spatial; - // pad top - for (int h = 0; h < pad; ++h, yh += output_w) { - for (int w = 0; w < output_w; ++w) { - yh[w] = pad_val; - } - } for (int h = 0; h < input_h; ++h, yh += output_w, xh += input_w) { - // pad left - for (int w = 0; w < pad; ++w) { - yh[w] = pad_val; - } for (int w = 0; w < input_w; ++w) { - yh[w + pad] = Round()(xh[w] * scale); - } - // pad right - for (int w = 0; w < pad; ++w) { - yh[pad + input_w + w] = pad_val; - } - } - // pad bottom - for (int h = 0; h < pad; ++h, yh += output_w) { - for (int w = 0; w < output_w; ++w) { - yh[w] = pad_val; + yh[w] = Round()(xh[w] * scale); } } } @@ -120,19 +94,14 @@ static float find_abs_max(const Tensor *input) { int TestQuqntizeOp(int argc, char *argv[]) { if (argc < 5) { - std::cout - << "Usage: ./test-quantize-op batch_size channel height width [pad]" - << std::endl; + std::cout << "Usage: ./test-quantize-op batch_size channel height width" + << std::endl; return 1; } - int pad = 0; int batch_size = atoi(argv[1]); int channel = atoi(argv[2]); int height = atoi(argv[3]); int width = atoi(argv[4]); - if (argc == 6) { - pad = atoi(argv[5]); - } std::cout << "batch_size: " << batch_size << ", channel: " << channel << ", height: " << height << ", width: " << width << std::endl; framework::DDim dim = @@ -153,7 +122,6 @@ int TestQuqntizeOp(int argc, char *argv[]) { auto output_scale_var = scope.get()->Var("output_scale"); framework::AttributeMap attrs; - attrs["paddings"].Set>(std::vector({pad, pad})); auto *op = new operators::QuantizeOp("quantize", inputs, outputs, attrs, scope); op->InferShape(); @@ -172,9 +140,9 @@ int TestQuqntizeOp(int argc, char *argv[]) { framework::Tensor output_cmp; output_cmp.Resize(output->dims()); float scale = 127 / output_scale_cmp; - // quantize(input, scale, pad, 0, &output_cmp); - // quantize(input, scale, pad, 0, &output_cmp); - quantize(input, scale, pad, 0, &output_cmp); + // quantize(input, scale, &output_cmp); + // quantize(input, scale, &output_cmp); + quantize(input, scale, &output_cmp); int8_t *output_cmp_data = output_cmp.data(); for (int i = 0; i < output->numel(); ++i) { PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i], diff --git a/tools/op.cmake b/tools/op.cmake index ce9a9079c682b5e9e3dff1754d1d37e98f3a5f3b..52d745565cedc81a0eeac49dda56dab08ffa1dc0 100644 --- a/tools/op.cmake +++ b/tools/op.cmake @@ -250,7 +250,9 @@ if(NOT FOUND_MATCH) set(SUM_OP ON) set(QUANT_OP ON) set(DEQUANT_OP ON) - set(FUSION_DEQUANT_ADD_BN_RELU ON) + set(FUSION_DEQUANT_ADD_BN_OP ON) + set(FUSION_DEQUANT_BN_RELU_OP ON) + set(FUSION_DEQUANT_ADD_BN_RELU_OP ON) endif() # option(BATCHNORM_OP "" ON) @@ -455,10 +457,17 @@ endif() if (DEQUANT_OP) add_definitions(-DDEQUANT_OP) endif() -if (FUSION_DEQUANT_ADD_BN_RELU) +if (FUSION_DEQUANT_ADD_BN_OP) + add_definitions(-DFUSION_DEQUANT_ADD_BN_OP) +endif() +if (FUSION_DEQUANT_BN_RELU_OP) + add_definitions(-DFUSION_DEQUANT_BN_RELU_OP) +endif() +if (FUSION_DEQUANT_ADD_BN_RELU_OP) 
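The reworked Round<> specialization in test_quantize_op.cpp above appears intended to mirror round-half-to-even ("banker's rounding"), so the reference quantizer agrees with the op under test on exact .5 ties. A self-contained way to express that convention, shown here only for comparison (round_to_even_int8 is not part of the patch): std::nearbyint honours the current floating-point rounding mode, whose default, FE_TONEAREST, resolves ties to the even neighbour.

#include <cfenv>
#include <cmath>
#include <cstdint>
#include <cstdio>

static int8_t round_to_even_int8(float x) {
  return static_cast<int8_t>(std::nearbyint(x));  // FE_TONEAREST: ties go to even
}

int main() {
  std::fesetround(FE_TONEAREST);  // the default mode, set explicitly for clarity
  std::printf("%d %d %d %d\n", round_to_even_int8(2.5f), round_to_even_int8(3.5f),
              round_to_even_int8(-2.5f), round_to_even_int8(1.4f));
  // prints: 2 4 -2 1
  return 0;
}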
add_definitions(-DFUSION_DEQUANT_ADD_BN_RELU_OP) endif() + if (TANH_OP) add_definitions(-DTANH_OP) endif() @@ -471,3 +480,4 @@ endif() if (FUSION_DECONVADDRELU_OP) add_definitions(-DFUSION_DECONVADDRELU_OP) endif() +
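The new op.cmake switches reach the C++ sources as compile definitions: each option that is ON adds the matching -DFUSION_DEQUANT_*_OP flag, and the parameter and operator code is wrapped in the corresponding preprocessor guards, as the op_param.h hunks above already show. Reduced to a sketch (the comments below are placeholders, not real declarations):

#ifdef FUSION_DEQUANT_BN_RELU_OP
// code specific to the dequant + BN + ReLU fusion is compiled only with this flag
#endif

#if defined(FUSION_DEQUANT_ADD_BN_OP) || defined(FUSION_DEQUANT_ADD_BN_RELU_OP)
// code shared by the elementwise-add variants is compiled when either flag is set
#endif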