diff --git a/src/fpga/V1/api.cpp b/src/fpga/V1/api.cpp index 7c1f15f7c90e0b1ebc15a9ec8f3f6333ff173978..ef7d5c13dce50706d69b28830d8c23748266b216 100644 --- a/src/fpga/V1/api.cpp +++ b/src/fpga/V1/api.cpp @@ -196,19 +196,35 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; arg->conv_arg[i].filter_scale_address = filter->scale; - arg->conv_arg[i].filter_address = &( - (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT - arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + // arg->conv_arg[i].filter_address = &( + // (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // + // NOLINT + // arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_arg[i].filter_num = (uint32_t)( i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT : filter_num_per_div); + size_t filter_size = + element_num * arg->conv_arg[i].filter_num * sizeof(int8_t); + auto filter_head = + &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; + arg->conv_arg[i].filter_address = fpga_malloc(filter_size); + memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); + fpga_flush(arg->conv_arg[i].filter_address, filter_size); + + size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float); + auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; + arg->conv_arg[i].sb_address = fpga_malloc(bs_size); + memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); + fpga_flush(arg->conv_arg[i].sb_address, bs_size); + if (n > 1) { arg->conv_arg[i].output.scale_address = (float *)fpga_malloc(2 * sizeof(float)); // NOLINT arg->conv_arg[i].output.address = - fpga_malloc(input->dims()[2] * - align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num, + fpga_malloc(out->dims()[2] * + align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num, IMAGE_ALIGNMENT) * sizeof(half)); } else { @@ -221,6 +237,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; } + filter->reset_data_ptr(nullptr); + fpga_free(bs_ptr); } } // namespace fpga diff --git a/src/fpga/V1/filter.cpp b/src/fpga/V1/filter.cpp index 157ac90a60262cadacb648173cbc5ba6c01e674e..d67c9fdc18ac8f715a5389625b6d76b71281e349 100644 --- a/src/fpga/V1/filter.cpp +++ b/src/fpga/V1/filter.cpp @@ -137,24 +137,23 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num, int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); int num_per_div_after_alignment = align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); - if (num_per_div_after_alignment != num_per_div_before_alignment) { - char *tmp = *data_in; - int div_num = - (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_element = div_num * num_per_div_after_alignment * align_chw; - char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - memset(data_tmp, 0, num_element * sizeof(char)); + char *tmp = *data_in; + int div_num = + (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; + int num_element = div_num * num_per_div_after_alignment * align_chw; + char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT - for (i = 0; i < div_num; i++) { - memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, - *data_in + num_per_div_before_alignment * 
align_chw * i, - num_per_div_before_alignment * align_chw); - } + memset(data_tmp, 0, num_element * sizeof(char)); - *data_in = data_tmp; - fpga_free(tmp); + for (i = 0; i < div_num; i++) { + memcpy(data_tmp + num_per_div_after_alignment * align_chw * i, + *data_in + num_per_div_before_alignment * align_chw * i, + num_per_div_before_alignment * align_chw); } + + *data_in = data_tmp; + fpga_free(tmp); } void reorder(char **data_in, int num_after_alignment, int chw) { @@ -223,7 +222,10 @@ void format_filter(float **data_in, int num, int channel, int height, int width, char **quantize_data = (char **)data_in; // NOLINT convert_to_hwc(quantize_data, num, channel, height, width); align_element(quantize_data, num, chw); - align_num(quantize_data, num_per_div_before_alignment, num, chw); + if (num_after_alignment != num) { + align_num(quantize_data, num_per_div_before_alignment, num, chw); + } + reorder(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw); fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * @@ -254,15 +256,18 @@ void format_fc_filter(float **data_in, int num, int channel, int height, align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); int div_num = (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; - int num_after_alignment = num_per_div_after_alignment * div_num; + int residual = num % num_per_div_before_alignment; + int num_after_alignment = num_per_div_after_alignment * + ((residual == 0) ? div_num : (div_num - 1)) + + align_to_x(residual, FILTER_NUM_ALIGNMENT); quantize(data_in, data_size, max); - char **quantize_data = (char **)data_in; // NOLINT - convert_fc_filter(quantize_data, num, chw); align_element(quantize_data, num, chw); - align_num(quantize_data, num_per_div_before_alignment, num, chw); + if (num_after_alignment != num) { + align_num(quantize_data, num_per_div_before_alignment, num, chw); + } reorder(quantize_data, num_after_alignment, chw); interleave(quantize_data, num_after_alignment, chw); fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) * diff --git a/src/fpga/V1/pe.cpp b/src/fpga/V1/pe.cpp index 1f0e5768a7017a4c7f928fea86b8f4ef3cdbae3d..d62f015e66c9bfb7c1ee07c349a307563a4581f2 100644 --- a/src/fpga/V1/pe.cpp +++ b/src/fpga/V1/pe.cpp @@ -13,16 +13,172 @@ See the License for the specific language governing permissions and limitations under the License. 
*/ #include "fpga/common/pe.h" +#include +#include +#include +#include "common/types.h" #include "fpga/V1/filter.h" #include "fpga/V1/image.h" #include "fpga/common/config.h" #include "fpga/common/driver.h" +using namespace std; +using namespace paddle_mobile::fpga::driver; // NOLINT namespace paddle_mobile { namespace fpga { +#define IMAGE_ALIGN 16 +#define FILTER_ALIGN 16 +#define FILTER_NUM_ALIGN 32 +#define USE_RELU 1 +#define USE_BIAS 2 + +// bypass cmd +#define CMD_FP16_TO_FP16 0 +#define CMD_FP16_TO_FP32 1 +#define CMD_FP32_TO_FP16 2 +#define CMD_FP32_TO_FP32 3 + +// bypass macro +#define SIZE_FP16 2 +#define SIZE_FP32 4 + +#define PE_IRQ_TIMEOUT 1000000 + +/* Interrupt bit-set offset*/ +#define INTERRUPT_RSVD 0x0001 +#define INTERRUPT_BYPASS 0x0002 +#define INTERRUPT_CONV 0x0004 +#define INTERRUPT_POOLING 0x0008 +#define INTERRUPT_EW 0x0010 +//#define INTERRUPT_RESIZE 0x0020 + +/* Register offset */ +#define REG_INTERRUPT 0x000 +#define REG_VERSION 0x008 +#define REG_TEMPERATURE 0x010 +#define REG_FPGA_RESET 0x018 +#define REG_TEST_REGISTER 0x048 +#define REG_HARDWARE_STATUS 0x050 + +#define REG_TIMER_COUNTER 0x070 + +#define REG_SCALE_PARAMETER 0x080 + +#define REG_FLASH_CMD 0x200 +#define REG_FLASH_DATA 0x208 +#define REG_FLASH_CONFIG 0x210 +#define REG_FLASH_STATUS 0x218 +#define REG_SN 0x220 + +//#define REG_READ_SCALE +//#define REG_WRITE_SCALE + +/*bypass*/ +#define REG_CONVERT_CMD 0x400 +#define REG_CONVERT_SRC_ADDR 0x408 +#define REG_CONVERT_DST_ADDR 0x410 +#define REG_CONVERT_LENGTH 0x418 + +/*resize*/ +#define REG_RESIZE_CMD 0x600 +#define REG_RESIZE_CHANNEL_NUMBER 0x608 +#define REG_RESIZE_INPUT_IMAGE_PIXEL 0x610 +#define REG_RESIZE_OUTPUT_IMAGE_PIXEL 0x618 +#define REG_RESIZE_INPUT_BASE_ADDR 0x620 +#define REG_RESIZE_WEIGHT_BASE_ADDR 0x628 +#define REG_RESIZE_SRC_POS_BASE_ADDR 0x630 +#define REG_RESIZE_OUTPUT_BASE_ADDR 0x638 + +/*pooling*/ +#define REG_POOLING_CMD 0x800 +#define REG_POOLING_IMAGE_BASE_ADDR 0x808 +#define REG_POOLING_RESULT_BASE_ADDR 0x810 +#define REG_POOLING_IMAGE_PIXEL 0x818 +#define REG_POOLING_WINDOW_SIZE 0x820 +#define REG_POOLING_RESULT_PIXEL 0x828 +#define REG_POOLING_PAD_PIXEL 0x830 +#define REG_POOLING_STEP_PIXEL 0x838 +#define REG_POOLING_CHANNEL_NUMBER 0x840 +#define REG_POOLING_IMAGE_AMOUNT_PER_ROW 0x848 +#define REG_POOLING_IMAGE_ONE_PAD_PER_ROW 0x850 +#define REG_POOLING_IMAGE_TWO_PAD_PER_ROW 0x858 +#define REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT 0x860 +#define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868 +#define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878 +#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880 +#define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888 +#define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898 +#define REG_POOLING_MODE_RECIPROCAL 0x890 + +/*conv*/ +#define REG_CONV_CMD 0xC00 +#define REG_CONV_IMAGE_BASE_ADDR 0xC08 +#define REG_CONV_FILTER_BASE_ADDR 0xC10 +#define REG_CONV_SB_BASE_ADDR 0xC18 +#define REG_CONV_RESULT_BASE_ADDR 0xC20 +#define REG_CONV_IMAGE_PIXEL 0xC28 +#define REG_CONV_FILTER_PIXEL 0xC30 +#define REG_CONV_RESULT_PIXEL 0xC38 +#define REG_CONV_PAD_PIXEL 0xC40 +#define REG_CONV_STEP_PIXEL 0xC48 +#define REG_CONV_GROUP_NUMBER 0xC50 +#define REG_CONV_FILTER_NUMBER 0xC58 +#define REG_CONV_CHANNEL_NUMBER 0xC60 +#define REG_CONV_FILTER_PER_GROUP 0xC68 +#define REG_CONV_CHANNEL_PER_GROUP 0xC70 +#define REG_CONV_IMAGE_AMOUNT_PER_ROW 0xC78 +#define REG_CONV_IMAGE_ONE_PAD_PER_ROW 0xC80 +#define REG_CONV_IMAGE_TWO_PAD_PER_ROW 0xC88 +#define REG_CONV_FILTER_AMOUNT_ALL 0xC90 +#define 
REG_CONV_RESULT_AMOUNT_PER_ROW 0xC98
+#define REG_CONV_RESULT_LAST_VALID 0xCA0
+
+#define REG_CONV_BLOCK_AMOUNT_PER_ROW 0xCA8
+#define REG_CONV_FILTER_PAD_WIDTH_MUL_CH 0xCB0
+#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN_F 0xCB8
+#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN 0xCC0
+#define REG_CONV_IMAGE_BLOCK_NUM 0xCC8
+#define REG_CONV_IMAGE_BLOCK_LEN 0xCD0
+#define REG_CONV_IMAGE_BLOCK_LEN_LAST 0xCD8
+#define REG_CONV_IMAGE_WIN_CNT 0xCE0
+#define REG_CONV_IMAGE_WIN_CNT_LAST 0xCE8
+#define REG_CONV_RES_ROW_DATA_ALIGN4_PAD 0xCF8
+#define REG_CONV_PROG_FULL_CNT 0xD08
+#define REG_CONV_POST_PROG_FULL_CNT 0xD10
+#define REG_CONV_FPGA_BIAS_SCALE_LEN 0xD20
+
+#define REG_CONV_IMAGE_SCALE 0xD28
+#define REG_CONV_FILTER_SCALE 0xD30
+
+/*ew*/
+#define REG_EW_CMD 0x0F00
+#define REG_EW_IMAGE0_BASE_ADDR 0x0F08
+#define REG_EW_IMAGE1_BASE_ADDR 0x0F10
+#define REG_EW_RESULT_BASE_ADDR 0x0F18
+#define REG_EW_DATA_LEN 0x0F20
+#define REG_EW_COEFFICIENT 0x0F28
+#define REG_EW_IMAGE_PIXEL 0x0F30
+#define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38
+
 int ComputeFpgaConv(const struct SplitConvArgs &args) {
-  ComputeBasicConv(args.conv_arg[0]);
+// ComputeBasicConv(args.conv_arg[0]);
+#ifdef FPGA_PRINT_MODE
+  DLOG << "=============ComputeFPGAConv===========";
+  DLOG << " filter_num:" << args.filter_num
+       << " group_num:" << args.group_num
+       << " split_num:" << args.split_num;
+#endif
+
+  int split_num = args.split_num;
+  for (int i = 0; i < split_num; i++) {
+    ComputeBasicConv(args.conv_arg[i]);
+  }
+
+  if (split_num > 1) {
+    ComputeFPGAConcat(args.concat_arg);
+  }
+  return 0;
 }
 
 int ComputeBasicConv(const struct ConvArgs &args) {
@@ -47,9 +203,237 @@ int ComputeBasicConv(const struct ConvArgs &args) {
   DLOG << " out_address:" << args.output.address
        << " out_scale_address:" << args.output.scale_address;
 #endif
+  cout << " relu_enabled:" << args.relu_enabled
+       << " sb_address:" << args.sb_address
+       << " filter_address:" << args.filter_address
+       << " filter_num:" << args.filter_num
+       << " group_num:" << args.group_num;
+  cout << " image_address:" << args.image.address
+       << " image_scale_address:" << args.image.scale_address
+       << " image_channels:" << args.image.channels
+       << " image_height:" << args.image.height
+       << " image_width:" << args.image.width
+       << " pad_height:" << args.image.pad_height
+       << " pad_width:" << args.image.pad_width;
+  cout << " kernel_height:" << args.kernel.height
+       << " kernel_width:" << args.kernel.width
+       << " stride_h:" << args.kernel.stride_h
+       << " stride_w:" << args.kernel.stride_w;
+  cout << " out_address:" << args.output.address
+       << " out_scale_address:" << args.output.scale_address;
 
-#ifndef PADDLE_MOBILE_ZU5
-  return 0;
+#ifdef PADDLE_MOBILE_ZU5
+  DLOG << "Conv";
+  // return 0;
+  uint64_t timer_cnt;
+  uint64_t output_scale;
+  uint64_t image_scale;
+  uint64_t filter_scale;
+  uint64_t image_address_phy = 0;
+  uint64_t sb_address_phy = 0;
+  uint64_t filter_address_phy = 0;
+  uint64_t output_address_phy = 0;
+  int ret = 0;
+
+  fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float));
+  fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float));
+
+  cout << "image_scale :" << hex << (image_scale) << endl;
+  cout << "filter_scale :" << hex << (filter_scale) << endl;
+
+  uint64_t filterlen = (uint64_t)args.kernel.width *
+                       (uint64_t)args.kernel.height *
+                       (uint64_t)args.image.channels;
+  filterlen = align_to_x(filterlen, FILTER_ALIGN);
+  filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGN);
+  uint64_t fpga_bias_scale_len =
+      align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
+
+  uint64_t output_height =
+      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
+          args.kernel.stride_h +
+      1;
+  uint64_t output_width =
+      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
+          args.kernel.stride_w +
+      1;
+  uint64_t output_size =
+      output_height * output_width * (uint64_t)args.filter_num;
+
+  uint64_t filter_per_group = (uint64_t)(args.filter_num / args.group_num);
+  uint64_t channel_per_group = (uint64_t)(args.image.channels / args.group_num);
+
+  uint64_t image_row_count = ((uint64_t)args.image.width) *
+                             ((uint64_t)args.image.channels);  // without align
+  uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGN);
+  uint64_t image_one_pad_per_row =
+      align_to_x(image_row_count, IMAGE_ALIGN) +
+      ((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
+  uint64_t filter_amount_all =
+      align_to_x(((uint64_t)args.kernel.height) *
+                     ((uint64_t)args.kernel.width) * channel_per_group,
+                 FILTER_ALIGN);
+
+  uint64_t output_amount_per_row =
+      align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGN);
+
+  // find the opt partition strategy
+  uint64_t res_win;
+  uint64_t res_fit = 0;
+  for (res_win = 1; res_win <= output_width; res_win = res_win + 1) {
+    if ((align_to_x(
+             (args.image.channels *
+              (args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
+             IMAGE_ALIGN) /
+             16 +
+         1) *
+            args.kernel.height >
+        2048) {
+      break;
+    }
+  }
+
+  if (res_win != output_width) {
+    res_win -= 1;
+  }
+
+  if (((res_win % 2) != 0) && (res_win != 1)) {
+    res_win = res_win - 1;
+  }
+  res_fit = res_win;
+
+  uint64_t block_num = (output_width + res_fit - 1) / res_fit;
+  uint64_t block_len = res_fit;
+  uint64_t block_last = output_width - res_fit * (block_num - 1);
+
+  uint64_t res_amount_per_row = output_width * args.filter_num;
+  uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
+
+  uint64_t image_block_amount_per_row =
+      args.kernel.stride_w * (res_fit)*args.image.channels;
+  uint64_t filter_pad_width_mul_channel =
+      args.image.pad_width * args.image.channels;
+  uint64_t image_amount_per_row_multi_win_first =
+      image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
+  uint64_t image_amount_per_row_multi_win =
+      image_amount_per_row * (4 * args.kernel.stride_h);
+
+  uint64_t image_block_num = block_num;
+  uint64_t image_block_len =
+      align_to_x((args.image.channels *
+                  (args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
+                 IMAGE_ALIGN) /
+          16 +
+      1;
+  uint64_t image_block_len_last =
+      align_to_x(
+          (args.image.channels *
+           (args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
+          IMAGE_ALIGN) /
+          16 +
+      1;
+  uint64_t image_win_cnt = block_len;
+  uint64_t image_win_cnt_last = block_last;
+  uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8;
+  uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
+  if (prog_full_cnt == 1023) {
+    prog_full_cnt--;
+  }
+  uint64_t post_prog_full_cnt =
+      (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
+          ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
+          : 0;
+
+  image_address_phy = vaddr_to_paddr(args.image.address);
+  sb_address_phy = vaddr_to_paddr(args.sb_address);
+  filter_address_phy = vaddr_to_paddr(args.filter_address);
+  output_address_phy = vaddr_to_paddr(args.output.address);
+
+  /* The SDK flushes the cache here to keep data consistent */
+  uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
+
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
+    ret = -EIO;
+    DLOG << "Conv Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+    return ret;
+  }
+
+  /*restart scale*/
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
+
+  reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
+  reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
+  reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR);
+  reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR);
+
+  reg_writeq(
+      ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
+      REG_CONV_IMAGE_PIXEL);
+  reg_writeq(
+      ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
+      REG_CONV_FILTER_PIXEL);
+  reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL);
+  reg_writeq(((uint64_t)args.image.pad_height) |
+                 (((uint64_t)args.image.pad_width) << 32),
+             REG_CONV_PAD_PIXEL);
+  reg_writeq(((uint64_t)args.kernel.stride_h) |
+                 (((uint64_t)args.kernel.stride_w) << 32),
+             REG_CONV_STEP_PIXEL);
+
+  reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
+  reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
+  reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
+
+  reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP);
+  reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
+
+  reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
+  reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
+  reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
+
+  reg_writeq(image_block_amount_per_row, 0xca8);
+  reg_writeq(filter_pad_width_mul_channel, 0xcb0);
+  reg_writeq(image_amount_per_row_multi_win_first, 0xcb8);
+  reg_writeq(image_amount_per_row_multi_win, 0xcc0);
+  reg_writeq(image_block_num, 0xcc8);
+  reg_writeq(image_block_len, 0xcd0);
+  reg_writeq(image_block_len_last, 0xcd8);
+  reg_writeq(image_win_cnt, 0xce0);
+  reg_writeq(image_win_cnt_last, 0xce8);
+  reg_writeq(res_row_data_align4_pad, 0xcf8);
+  reg_writeq(prog_full_cnt, 0xd08);
+  reg_writeq(post_prog_full_cnt, 0xd10);
+  reg_writeq(fpga_bias_scale_len / 4, 0xd20);
+
+  /*write scale*/
+  reg_writeq(image_scale, REG_CONV_IMAGE_SCALE);
+  reg_writeq(filter_scale, REG_CONV_FILTER_SCALE);
+
+  reg_writeq(cmd, REG_CONV_CMD);
+
+  DLOG << "before reg poll";
+  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
+    g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
+    ret = -EIO;
+    DLOG << "Conv Wait Irq Timeout!";
+  }
+  DLOG << "after reg poll";
+  usleep(40);
+
+  /* The SDK invalidates the cache here to keep data consistent */
+
+  output_scale = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = (output_scale << 32) | (output_scale >> 32);
+  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+  cout << "output_scale :" << hex << (output_scale) << endl;
+
+  //*(args.output.scale_address) = output_scale;
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+
+  return ret;
 #endif
 
   return 0;
@@ -74,8 +458,135 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
   DLOG << " out_address:" << args.output.address
        << " out_scale_address:" << args.output.scale_address;
 #endif
-#ifndef PADDLE_MOBILE_ZU5
-  return 0;
+#ifdef PADDLE_MOBILE_ZU5
+  DLOG << "Pooling";
+  // return 0;
+  uint64_t output_scale = 0;
+  uint64_t timer_cnt = 0;
+  int ret = 0;
+  uint64_t cmd = 0;
+
+  uint64_t image_physical_address = 0;
+  uint64_t output_physical_address = 0;
+
+  image_physical_address = vaddr_to_paddr(args.image.address);
+  output_physical_address = vaddr_to_paddr(args.output.address);
+
+  uint32_t output_height = (uint32_t)(
+      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
+          args.kernel.stride_h +
+      1);
+  uint32_t output_width = (uint32_t)(
+      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
+          args.kernel.stride_w +
+      1);
+
+  uint64_t image_amount_per_row = align_to_x(
+      (uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGN);
+  uint64_t image_one_pad_per_row =
+      align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
+                 FILTER_ALIGN) +
+      (uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
+  uint64_t image_two_pad_per_row = align_to_x(
+      ((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) *
+          (uint64_t)args.image.channels,
+      IMAGE_ALIGN);
+  uint64_t image_row_mul_pooling_height =
+      image_amount_per_row * (uint64_t)args.kernel.height;
+  uint64_t image_row_mul_pad_height =
+      image_amount_per_row * (uint64_t)args.image.pad_height;
+  uint64_t image_row_mul_step_height =
+      image_amount_per_row * (uint64_t)args.kernel.stride_h;
+  uint64_t result_amount_align_32 = align_to_x(
+      (uint64_t)output_width * (uint64_t)args.image.channels, FILTER_ALIGN);
+  uint64_t result_amount_align_64 = align_to_x(
+      (uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGN);
+  uint64_t image_calcu_height =
+      (uint64_t)args.kernel.height +
+      ((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h;
+
+  uint64_t image_pad_left = args.image.channels * args.image.pad_width;
+  uint64_t image_skip_window = args.image.channels * args.kernel.stride_w;
+
+  uint64_t image_padleft_skipwindow =
+      (image_skip_window << 32) | image_pad_left;
+
+  uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 |
+                             (((uint64_t)args.kernel_reciprocal));
+
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
+    ret = -EIO;
+    DLOG << "Pooling Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+    return ret;
+  }
+
+  /*restart scale*/
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
+
+  reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
+  reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
+
+  reg_writeq(
+      ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
+      REG_POOLING_IMAGE_PIXEL);
+  reg_writeq(
+      ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
+      REG_POOLING_WINDOW_SIZE);
+
+  reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32),
+             REG_POOLING_RESULT_PIXEL);
+
+  reg_writeq(((uint64_t)args.image.pad_height) |
+                 (((uint64_t)args.image.pad_width) << 32),
+             REG_POOLING_PAD_PIXEL);
+  reg_writeq(((uint64_t)args.kernel.stride_h) |
+                 (((uint64_t)args.kernel.stride_w) << 32),
+             REG_POOLING_STEP_PIXEL);
+
+  reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER);
+
+  reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW);
+  reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW);
+  reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW);
+
+  reg_writeq(image_row_mul_pooling_height,
+             REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT);
+  reg_writeq(image_row_mul_pad_height, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT);
+  reg_writeq(image_row_mul_step_height, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT);
+
+  reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32);
+  reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64);
+
+  reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT);
+
+  reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW);
+  reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL);
+
+  /* The SDK flushes the cache here to keep data consistent */
+
+  reg_writeq(cmd, REG_POOLING_CMD);
+
+  DLOG << "before reg poll";
+  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
+    g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
+    ret = -EIO;
+    DLOG << "Pooling Wait Irq Timeout!";
+  }
+  DLOG << "after reg poll";
+  usleep(40);
+
+  /* The SDK invalidates the cache here to keep data consistent */
+
+  // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = (output_scale << 32) | (output_scale >> 32);
+  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+  //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+
+  return ret;
 #endif
 
   return 0;
 }
@@ -103,8 +614,73 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
   DLOG << " out_address:" << args.output.address
        << " out_scale_address:" << args.output.scale_address;
 #endif
-#ifndef PADDLE_MOBILE_ZU5
-  return 0;
+#ifdef PADDLE_MOBILE_ZU5
+  DLOG << "EWAdd";
+  // return 0;
+  int ret = 0;
+  uint64_t output_scale = 0;
+  uint64_t timer_cnt = 0;
+  uint64_t image0_address_phy = 0;
+  uint64_t image1_address_phy = 0;
+  uint64_t output_address_phy = 0;
+
+  uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
+  uint64_t datalen = (uint64_t)args.image0.width *
+                     (uint64_t)args.image0.height *
+                     (uint64_t)args.image0.channels;
+  uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
+
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
+    ret = -EIO;
+    DLOG << "EW Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+    return ret;
+  }
+
+  image0_address_phy = vaddr_to_paddr(args.image0.address);
+  image1_address_phy = vaddr_to_paddr(args.image1.address);
+  output_address_phy = vaddr_to_paddr(args.output.address);
+
+  uint64_t image_amount_per_row =
+      align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
+                 IMAGE_ALIGN);
+  uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
+                               ((uint64_t)args.image0.width << 16) |
+                               (uint64_t)args.image0.height;
+
+  /* The SDK flushes the cache here to keep data consistent */
+
+  /*restart scale*/
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
+
+  reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
+  reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR);
+  reg_writeq(datalen, REG_EW_DATA_LEN);
+  reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL);
+  reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW);
+
+  reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR);
+  reg_writeq(coefficient, REG_EW_COEFFICIENT);
+
+  reg_writeq(cmd, REG_EW_CMD);
+
+  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
+    g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
+    ret = -EIO;
+    DLOG << "EW Wait Irq Timeout!";
+  }
+  usleep(40);
+
+  /* The SDK invalidates the cache here to keep data consistent */
+  output_scale = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = (output_scale << 32) | (output_scale >> 32);
+  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+
+  //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
+  //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+  return ret;
 #endif
 
   return 0;
 }
@@ -126,8 +702,117 @@ int PerformBypass(const struct BypassArgs &args) {
   DLOG << " out_address:" << args.output.address
        << " out_scale_address:" << args.output.scale_address;
 #endif
-#ifndef PADDLE_MOBILE_ZU5
-  return 0;
+#ifdef PADDLE_MOBILE_ZU5
+  DLOG << "Bypass";
+  // return 0;
+  struct fpga_pe *pe;
+  uint64_t output_scale = 0;
+  uint64_t timer_cnt = 0;
+  uint64_t cmd = 0;
+  uint64_t datalen = 0;
+  uint64_t input_address_phy = 0;
+  uint64_t output_address_phy = 0;
+  uint8_t data_cell_in = 0;
+  uint8_t data_cell_out = 0;
+
+  int ret = 0;
+
+  datalen = (uint64_t)args.image.width * (uint64_t)args.image.height *
+            (uint64_t)args.image.channels;
+  datalen = align_to_x(datalen, 16);
+
+  input_address_phy = vaddr_to_paddr(args.image.address);
+  output_address_phy = vaddr_to_paddr(args.output.address);
+  DLOG << "input_phy:" << input_address_phy;
+  DLOG << "output_phy:" << output_address_phy;
+
+  switch (args.input_data_type) {
+    case DATA_TYPE_FP16: {
+      switch (args.output_data_type) {
+        case DATA_TYPE_FP16:
+          data_cell_in = SIZE_FP16;
+          data_cell_out = SIZE_FP16;
+          cmd = CMD_FP16_TO_FP16;
+          break;
+
+        case DATA_TYPE_FP32:
+          data_cell_in = SIZE_FP16;
+          data_cell_out = SIZE_FP32;
+          cmd = CMD_FP16_TO_FP32;
+          break;
+
+        default:
+          break;
+      }
+    } break;
+
+    case DATA_TYPE_FP32: {
+      switch (args.output_data_type) {
+        case DATA_TYPE_FP16:
+          data_cell_in = SIZE_FP32;
+          data_cell_out = SIZE_FP16;
+          cmd = CMD_FP32_TO_FP16;
+          break;
+
+        case DATA_TYPE_FP32:
+          data_cell_in = SIZE_FP32;
+          data_cell_out = SIZE_FP32;
+          cmd = CMD_FP32_TO_FP32;
+          break;
+
+        default:
+          break;
+      }
+    } break;
+
+    default:
+      break;
+  }
+  if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 &&
+      cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32) {
+    return -EFAULT;
+  }
+  if ((data_cell_in != SIZE_FP16 && data_cell_in != SIZE_FP32) ||
+      (data_cell_out != SIZE_FP16 && data_cell_out != SIZE_FP32)) {
+    return -EFAULT;
+  }
+
+  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
+  if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status) {
+    ret = -EIO;
+    DLOG << "Bypass Status Error!";
+    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+    return ret;
+  }
+
+  /*restart scale*/
+  reg_writeq(output_scale, REG_SCALE_PARAMETER);
+
+  reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR);
+  reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR);
+  reg_writeq(datalen, REG_CONVERT_LENGTH);
+
+  /* The SDK flushes the cache here to keep data consistent */
+  reg_writeq(cmd, REG_CONVERT_CMD);
+
+  DLOG << "before reg poll";
+  if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) {
+    g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR;
+    ret = -EIO;
+    DLOG << "BYPASS Wait Irq Timeout!";
+  }
+  DLOG << "after reg poll";
+  usleep(40);
+
+  /* The SDK invalidates the cache here to keep data consistent */
+  output_scale = reg_readq(REG_SCALE_PARAMETER);
+  output_scale = (output_scale << 32) | (output_scale >> 32);
+  fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
+
+  //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
+  //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
+  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
+  return ret;
 #endif
 
   return 0;
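Each of the PE entry points above (conv, pooling, EW add, bypass) drives the hardware through the same sequence: take the PE mutex, program the engine's registers, write the command register to start the engine, poll REG_INTERRUPT, then swap and copy the packed output scale back. A condensed sketch of that shared sequence follows; the helper run_pe and its parameter list are illustrative, not part of this patch:

// Condensed sketch of the sequence shared by ComputeBasicConv,
// ComputeFpgaPool, ComputeFpgaEWAdd and PerformBypass above; relies on
// fpga/common/driver.h (reg_writeq, reg_readq, fpga_regpoll, g_fpgainfo).
static int run_pe(int pe_idx, uint32_t cmd_reg, uint64_t cmd,
                  uint64_t irq_mask, float *scale_out) {
  int ret = 0;
  pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
  if (ERROR == g_fpgainfo.pe_data->pes[pe_idx]->status) {
    pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
    return -EIO;
  }
  reg_writeq(0, REG_SCALE_PARAMETER);  // restart scale
  // ... program the per-engine registers here (addresses, shapes, strides) ...
  reg_writeq(cmd, cmd_reg);            // kick the engine
  if (0 != fpga_regpoll(REG_INTERRUPT, irq_mask, PE_IRQ_TIMEOUT)) {
    g_fpgainfo.pe_data->pes[pe_idx]->status = ERROR;
    ret = -EIO;
  }
  usleep(40);
  uint64_t scale = reg_readq(REG_SCALE_PARAMETER);
  scale = (scale << 32) | (scale >> 32);  // swap the two packed fp32 words
  fpga_copy(scale_out, &scale, 2 * sizeof(float));
  pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
  return ret;
}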
@@ -138,11 +823,14 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
   DLOG << "=============ComputeFpgaConcat===========";
   DLOG << " Image_num: " << args.image_num
        << " out_address:" << args.image_out
-       << " out_scale_address:" << args.scale_out;
+       << " out_scale_address:" << args.scale_out
+       << " out_channel:" << args.out_channel;
   DLOG << " image_height:" << args.height << " image_width:" << args.width;
 
   for (int i = 0; i < args.image_num; i++) {
     DLOG << " " << i << "th: ";
-    DLOG << " channel_num:" << args.channel_num[i]
+    DLOG << " channel_num:"
+         << args.channel_num[i]
+         //  << " aligned_channel_num:" << args.aligned_channel_num[i]
          << " image_address:" << args.images_in[i]
          << " image_scale_address:" << args.scales_in[i];
   }
@@ -154,6 +842,82 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
   return 0;
 }
 
+void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel,
+                         int sub_height, int sub_width, int omit_size) {
+  int origin_h = sub_height * sub_conv_n;
+  int origin_w = sub_width * sub_conv_n;
+  int align_origin_w = align_to_x(origin_w * channel, 16);
+  int deconv_h = origin_h - 2 * omit_size;
+  int deconv_w = origin_w - 2 * omit_size;
+  int deconv_row_len = deconv_w * channel;
+  int align_deconv_row_len = align_to_x(deconv_row_len, 16);
+  half *ptr_tmp = *data_in;
+  half *ptr_deconv =
+      (half *)fpga_malloc(num * align_deconv_row_len * deconv_h * sizeof(half));
+  memset(ptr_deconv, 0, num * align_deconv_row_len * deconv_h * sizeof(half));
+  int deconv_idx = 0;
+  for (int nn = 0; nn < num; ++nn) {
+    for (int hh = 0; hh < origin_h; ++hh) {
+      int hx = (hh % sub_conv_n);
+      half *sub_t = ptr_tmp + hx * sub_height * align_origin_w;  // sub(hx,:);
+
+      int hi = (hh / sub_conv_n);
+
+      if ((hh < omit_size) || (hh >= (origin_h - omit_size))) continue;
+
+      // for (int ww = 0; ww < origin_w; ++ww){
+      //   if((ww < omit_size) )  // || (ww >= (origin_w-omit_size))
+      //     continue;
+
+      int sidx = (nn * origin_h * align_origin_w + hi * align_origin_w +
+                  omit_size * channel);
+
+      fpga_copy(ptr_deconv + deconv_idx, sub_t + sidx,
+                sizeof(half) * deconv_row_len);
+      deconv_idx += align_deconv_row_len;
+      //}
+    }
+  }
+
+  *data_in = ptr_deconv;
+  fpga_free(ptr_tmp);
+}
+
+int ComputeFpgaDeconv(const struct DeconvArgs &args) {
+#ifdef FPGA_TEST_MODE
+  DLOG << "=============ComputeFPGADeConv===========";
+  DLOG << " filter_num:" << args.filter_num
+       << " group_num:" << args.group_num
+       << " sub_conv_num:" << args.sub_conv_num;
+#endif
+
+  int sub_conv_num = args.sub_conv_num;
+
+  for (int i = 0; i < sub_conv_num; i++) {
+    //#if CPU_SIMULATE
+    //#else
+    ComputeBasicConv(args.conv_args[i]);
+    //#endif
+  }
+
+  if (sub_conv_num > 1) {
+    float max_scale = -1.0;
+    for (int i = 0; i < sub_conv_num; i++) {
+      float ptr_scale = (args.conv_args[i].output.scale_address)[0];
+      if (ptr_scale > max_scale) {
+        max_scale = ptr_scale;
+        args.output.scale_address[0] = ptr_scale;
+        args.output.scale_address[1] =
+            (args.conv_args[i].output.scale_address)[1];
+      }
+    }
+    deconv_post_process((half **)(&(args.output.address)), args.sub_conv_num, 1,
+                        args.filter_num, (args.sub_output_height),
+                        (args.sub_output_width), args.omit_size);
+  }
+  return 0;
+}
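The interleaving that deconv_post_process implements maps row hh of the full-resolution output to row hh / sub_conv_n of sub-convolution hh % sub_conv_n, then drops omit_size rows (and columns) of border. A minimal, index-only sketch of that mapping in plain C++, with made-up sizes and no FPGA types:

#include <cstdio>

// Print which sub-convolution output each full-resolution row is taken
// from, mirroring the hh % n / hh / n arithmetic in deconv_post_process.
int main() {
  const int sub_conv_n = 2, sub_height = 3, omit_size = 1;
  const int origin_h = sub_height * sub_conv_n;  // 6 rows before trimming
  for (int hh = 0; hh < origin_h; ++hh) {
    if (hh < omit_size || hh >= origin_h - omit_size) continue;  // trimmed
    printf("out row %d <- sub-conv %d, row %d\n", hh - omit_size,
           hh % sub_conv_n, hh / sub_conv_n);
  }
  return 0;
}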
+
 int ComputeFPGASplit(const struct SplitArgs &args) {
 #ifdef FPGA_PRINT_MODE
   DLOG << "=============ComputeFpgaSplit===========";
@@ -173,6 +937,5 @@ int ComputeFPGASplit(const struct SplitArgs &args) {
                      args.height, args.width);
   return 0;
 }
-
 }  // namespace fpga
 }  // namespace paddle_mobile
diff --git a/src/fpga/V2/api.cpp b/src/fpga/V2/api.cpp
index 5bfd34104600668ce63a9c7d684d4482d5d804fb..d58e780c279e03b90b4ebe3731c6693615107ec4 100644
--- a/src/fpga/V2/api.cpp
+++ b/src/fpga/V2/api.cpp
@@ -132,11 +132,11 @@ void format_concat_output(framework::Tensor *out, int height, int width,
 }
 
 int format_conv_data(framework::Tensor *filter_tensor,
-                     framework::Tensor *ofm_tensor, float *bs_ptr, int group) {
+                     framework::Tensor *ofm_tensor, float **bs_ptr, int group) {
   float max_value = fpga::filter_find_max(filter_tensor);
   fpga::format_filter(filter_tensor, max_value, group);
   int aligned_num = get_aligned_filter_num(filter_tensor);
-  fpga::format_bias_scale_array(&bs_ptr,
+  fpga::format_bias_scale_array(bs_ptr,
                                 (int)filter_tensor->dims()[0],  // NOLINT
                                 aligned_num);
   int aligned_channel = fpga::get_conv_output_channel(filter_tensor);
diff --git a/src/fpga/V2/api.h b/src/fpga/V2/api.h
index 1386810164d72ef849162b76a8b83fcf32082907..59c1b006183e4355ebe9316766773215b6edf12f 100644
--- a/src/fpga/V2/api.h
+++ b/src/fpga/V2/api.h
@@ -39,7 +39,7 @@ void format_bias_scale_array(float** bias_scale_array, int filter_num,
 void format_concat_output(framework::Tensor* out, int height, int width,
                           uint32_t out_channel);
 int format_conv_data(framework::Tensor* filter_tensor,
-                     framework::Tensor* ofm_tensor, float* bs_ptr, int group);
+                     framework::Tensor* ofm_tensor, float** bs_ptr, int group);
 int format_fc_data(framework::Tensor* filter_tensor,
                    framework::Tensor* ofm_tensor, float* bs_ptr);
 void fill_split_arg(struct SplitConvArgs* arg, framework::Tensor* input,
diff --git a/src/fpga/common/driver.cpp b/src/fpga/common/driver.cpp
index 8c59ac14fb11282b29a837152194d873bd65d87d..2f592fe45d951230427595f2f8ff5b4a148c0276 100644
--- a/src/fpga/common/driver.cpp
+++ b/src/fpga/common/driver.cpp
@@ -137,11 +137,13 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
 
   for (i = 0; i < timeout; i++) {
     if (val == reg_readq(reg)) {
+      std::cout << "fpga_regpoll: " << i << " val: " << val << " reg: " << reg
+                << std::endl;
       break;
     }
   }
 
-  if (i <= timeout) {
+  if (i < timeout) {
     return 0;
   } else {
     return -1;
@@ -153,6 +155,12 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
   uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
   unsigned int nr = (unsigned int)_nr;
   int ret = 0;
+  DLOG << size;
+  DLOG << _nr;
+  DLOG << nr;
+
+  uint64_t a_size = FPGA_PAGE_SIZE * nr;
+  DLOG << a_size;
 
   pthread_mutex_lock(&memory->mutex);
 
@@ -166,6 +174,7 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
 
     *addr = address_ofset;
   } else {
+    DLOG << "memory request failed!";
     ret = -ENOMEM;
   }
 
@@ -282,7 +291,7 @@ uint64_t vaddr_to_paddr(void *address) {
   if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
     paddr = iter->second;
   } else {
-    DLOG << "Invalid pointer";
+    DLOG << "Invalid pointer: " << address;
   }
 
   return paddr;
@@ -348,6 +357,11 @@ void fpga_free_driver(void *ptr) {
     fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
                               g_fpgainfo.memory_info->nr[pos]);
     pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
+
+    auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
+    if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
+      g_fpgainfo.fpga_vaddr2paddr_map.erase(iter);
+    }
   } else {
     DLOG << "Invalid pointer";
   }
diff --git a/src/fpga/common/driver.h b/src/fpga/common/driver.h
index 2dad07ec5206a7ca64449aa38ebe0603d72b71e3..c204370be7ecd3aca229b2c130ec7861116a3ef7 100644
--- a/src/fpga/common/driver.h
+++ b/src/fpga/common/driver.h
@@ -17,6 +17,7 @@ limitations under the License.
*/ #include #include #include +#include #include #include @@ -44,7 +45,7 @@ const int PE_IDX_POOLING = 1; const int PE_IDX_EW = 2; const int PE_IDX_BYPASS = 3; -enum pe_status { IDLE = 0, BUSY = 1 }; +enum pe_status { IDLE = 0, BUSY = 1, ERROR = 2 }; struct MemoryCacheArgs { void *offset; @@ -58,7 +59,7 @@ struct MemoryCacheArgs { struct fpga_pe { char type_name[MAX_TYPE_NAME_LENTH + 1]; struct pe_data_s *outer; - pe_status status; // 0=idle 1=busy -1=fail + pe_status status; uint64_t interrupt_cnt; }; @@ -106,6 +107,8 @@ inline uint64_t reg_readq(uint32_t offset) { uint64_t value = *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT offset); // NOLINT + // DLOG << "read end"; + usleep(10); return value; } @@ -114,6 +117,8 @@ inline void reg_writeq(uint64_t value, uint32_t offset) { // DLOG << "offset : " << offset << ", value : " << value; *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT offset) = value; + // DLOG << "write end"; + usleep(10); } int open_device_driver(); diff --git a/src/fpga/common/fpga_common.h b/src/fpga/common/fpga_common.h index b3f619f2f24aba47d99f7f427c4b67af8c0d430d..fdda65afda595d281a6f4db6f8132213f8f8d9e5 100644 --- a/src/fpga/common/fpga_common.h +++ b/src/fpga/common/fpga_common.h @@ -74,12 +74,21 @@ struct ConcatArgs { void* image_out; float* scale_out; uint32_t* channel_num; - // uint32_t* aligned_channel_num; - // uint32_t out_channel; + uint32_t* aligned_channel_num; + uint32_t out_channel; uint32_t height; uint32_t width; }; +struct SplitConvArgs { + uint32_t split_num; + uint32_t group_num; + uint32_t filter_num; + struct ImageOutputArgs output; + struct ConvArgs* conv_arg; + struct ConcatArgs concat_arg; +}; + struct SplitArgs { uint32_t image_num; int16_t* image_in; @@ -91,15 +100,6 @@ struct SplitArgs { uint32_t width; }; -struct SplitConvArgs { - uint32_t split_num; - uint32_t group_num; - uint32_t filter_num; - struct ImageOutputArgs output; - struct ConvArgs* conv_arg; - struct ConcatArgs concat_arg; -}; - struct PoolingArgs { int16_t mode; // mode: 0:max, 1:avg int16_t kernel_reciprocal; @@ -127,7 +127,14 @@ struct BypassArgs { }; struct DeconvArgs { - struct ConvArgs conv_arg; + uint32_t sub_conv_num; + uint32_t group_num; + uint32_t filter_num; + uint32_t omit_size; + uint32_t sub_output_width; + uint32_t sub_output_height; + struct ImageOutputArgs output; + struct ConvArgs* conv_args; }; static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } diff --git a/src/framework/cl/cl_image.h b/src/framework/cl/cl_image.h index f94eba187f2c5610d7a20098e95015244b420ce2..1a906ba4a4f43e1e1b57bbb3652fdc19fa052a78 100644 --- a/src/framework/cl/cl_image.h +++ b/src/framework/cl/cl_image.h @@ -68,6 +68,13 @@ class CLImage { InitCLImage(context, command_queue, folder_converter); } + void InitNormalCLImage(cl_context context, cl_command_queue command_queue) { + PADDLE_MOBILE_ENFORCE(tensor_data_ != nullptr, + " need call SetTensorData first"); + CLImageConverterNormal *normal_converter = new CLImageConverterNormal(); + InitCLImage(context, command_queue, normal_converter); + } + void InitCLImage(cl_context context, cl_command_queue command_queue, CLImageConverterBase *converter) { if (image_converter_ != nullptr) { diff --git a/src/operators/feed_op.cpp b/src/operators/feed_op.cpp index ac707d22696dd0a62902137607fb64c141341d77..4e496fb51d16c47d801eabada7c36dbdefdd2140 100644 --- a/src/operators/feed_op.cpp +++ b/src/operators/feed_op.cpp @@ -22,7 +22,6 @@ void FeedOp::InferShape() const { auto 
out_dims = this->param_.Out()->dims();
   out_dims[0] = this->param_.BatchSize();
   auto input_dims = this->param_.InputX()->dims();
-  DLOG << input_dims.size();
   if (input_dims.size() == 4) {
     this->param_.Out()->Resize(input_dims);
   } else {
diff --git a/src/operators/fusion_fc_op.cpp b/src/operators/fusion_fc_op.cpp
index 928a4d8541db11886986ffbb695cdf54b5f12c51..f2e98b2b4ceae283ddbe04af06e8926f1b8bb47f 100644
--- a/src/operators/fusion_fc_op.cpp
+++ b/src/operators/fusion_fc_op.cpp
@@ -60,6 +60,9 @@ REGISTER_FUSION_MATCHER(fusion_fc, ops::FusionFcMatcher);
 #ifdef PADDLE_MOBILE_CPU
 REGISTER_OPERATOR_CPU(fusion_fc, ops::FusionFcOp);
 #endif
+#ifdef PADDLE_MOBILE_CL
+REGISTER_OPERATOR_CL(fusion_fc, ops::FusionFcOp);
+#endif
 #ifdef PADDLE_MOBILE_MALI_GPU
 REGISTER_OPERATOR_MALI_GPU(fusion_fc, ops::FusionFcOp);
 #endif
diff --git a/src/operators/kernel/central-arm-func/conv_arm_func.h b/src/operators/kernel/central-arm-func/conv_arm_func.h
index 8a88e0e10bfaf546871afa32e5dc09b0b52df846..7f40157c30ad19472045eb53bd7a99e577429db5 100644
--- a/src/operators/kernel/central-arm-func/conv_arm_func.h
+++ b/src/operators/kernel/central-arm-func/conv_arm_func.h
@@ -107,15 +107,9 @@ inline void GemmConv(const ConvParam<CPU> &param) {
       Tensor out_slice = out_batch.Slice(g * out_step, (g + 1) * out_step);
       Tensor filter_slice = filter.Slice(g * out_step, (g + 1) * out_step);
 
-      if (param.Input()->type() == typeid(int8_t)) {
-        math::matmul_int8(filter_slice, false, col_matrix, false,
+      math::matmul(filter_slice, false, col_matrix, false,
                    static_cast<float>(1), &out_slice,
                    static_cast<float>(0));
-      } else {
-        math::matmul(filter_slice, false, col_matrix, false,
-                     static_cast<float>(1), &out_slice,
-                     static_cast<float>(0));
-      }
     }
   }
 }
diff --git a/src/operators/kernel/central-arm-func/mul_arm_func.h b/src/operators/kernel/central-arm-func/mul_arm_func.h
index 62e8ae03d9119cafc3c5716042569a90f077325c..07e634e3be9648520357871d91d6677aec6b5c0e 100644
--- a/src/operators/kernel/central-arm-func/mul_arm_func.h
+++ b/src/operators/kernel/central-arm-func/mul_arm_func.h
@@ -73,8 +73,8 @@ void MulCompute(const MulParam<CPU> &param) {
   }
   if (param.InputX()->type() == typeid(int8_t)) {
     out->mutable_data<int32_t>();
-    math::matmul_int8(x_matrix, false, y_matrix, false, static_cast<float>(1),
-                      out, static_cast<float>(0));
+    math::matmul(x_matrix, false, y_matrix, false,
+                 static_cast<float>(1), out, static_cast<float>(0));
   } else {
     out->mutable_data<float>();
diff --git a/src/operators/kernel/cl/cl_kernel/concat_kernel.cl b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl
index b07ee4d819b25ef77729ed868c54b19a3d8699ae..20cf7b4c48db4191a2bc95b0d952fbaf0ea1dc18 100644
--- a/src/operators/kernel/cl/cl_kernel/concat_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/concat_kernel.cl
@@ -13,7 +13,27 @@ See the License for the specific language governing permissions and
 limitations under the License.
*/ #pragma OPENCL EXTENSION cl_khr_fp16 : enable -/* + +__kernel void concatByC0(__read_only image2d_t input_image, + __write_only image2d_t output_image, + __private const int out_W) { + + const int in_c = get_global_id(0); + const int in_w = get_global_id(1); + const int in_nh = get_global_id(2); + + int2 input_pos ; + input_pos.x = in_c * out_W + in_w; + input_pos.y = in_nh; + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + half4 input; + input = read_imageh(input_image, sampler,input_pos); + + write_imageh(output_image, input_pos, input); + +} __kernel void concatByC(__read_only image2d_t input_image1, __read_only image2d_t input_image2, @@ -24,13 +44,13 @@ __kernel void concatByC(__read_only image2d_t input_image1, __private const int out_C_Start, __private const int in_W, __private const int in_H, - __private const int int_C1, - __private const int int_C2) { + __private const int in_C1, + __private const int in_C2) { const int in_c = get_global_id(0); const int in_w = get_global_id(1); const int in_nh = get_global_id(2); - int out_c1 = (out_C_Start)/4 + in_c; + int out_c1 = (out_C_Start + 3)/4 -1 + in_c; int out_c2 = out_c1 + 1; @@ -45,7 +65,7 @@ __kernel void concatByC(__read_only image2d_t input_image1, int2 input_pos1; if(in_c==0){ - input_pos1.x = ((in_C1-1)/4) * in_W + in_w; + input_pos1.x = ((in_C1 + 3)/4-1) * in_W + in_w; }else{ input_pos1.x = (in_c - 1) * in_W + in_w; } @@ -103,26 +123,6 @@ __kernel void concatByC(__read_only image2d_t input_image1, write_imageh(output_image, output_pos2, output2); } -__kernel void concatByW0(__read_only image2d_t input_image, - __write_only image2d_t output_image, - __private const int out_W) { - - const int in_c = get_global_id(0); - const int in_w = get_global_id(1); - const int in_nh = get_global_id(2); - - int2 input_pos = in_c * out_W + in_w; - - const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | - CLK_ADDRESS_CLAMP | - CLK_FILTER_NEAREST; - half4 input; - input = read_imageh(input_image, sampler,input_pos); - - write_imageh(output_image, input_pos, input); - -} -*/ __kernel void concatByH(__read_only image2d_t input_image, __write_only image2d_t output_image, diff --git a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl index 2247df59fb77a67a87a00bd26de014f94e86a378..1085e97c10d27aa99583a86a2e2d70ae11d2d68d 100644 --- a/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl +++ b/src/operators/kernel/cl/cl_kernel/conv_kernel.inc.cl @@ -692,6 +692,238 @@ __kernel void conv_1x1_4(__private const int global_size_dim0, */ +__kernel void conv_7x7(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, + +#ifdef BIASE + __read_only image2d_t bias, +#endif + +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= 
global_size_dim2) { + return; + } + const filter_n0 = 4 * out_c + 0; + const filter_n1 = 4 * out_c + 1; + const filter_n2 = 4 * out_c + 2; + const filter_n3 = 4 * out_c + 3; + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + half4 input; + half4 filter[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + for(int j = 0; j < 7; j++){ + for(int k = 0; k < 7; k++){ + input = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + (j - 3) * dilation, pos_in.y + (k - 3) * dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + (j - 3) * dilation < 0 || in_pos_in_one_block.y + (k - 3) * dilation < 0 || in_pos_in_one_block.x + (j - 3) * dilation >= input_width || in_pos_in_one_block.y + (k - 3) * dilation >= input_height) << 15)); + int filter_h = k; + int filter_w = j; + int filter_c = i; + + filter_pos0.x = filter_c * 7 + filter_w; + filter_pos0.y = filter_n0 * 7 + filter_h; + + filter_pos1.x = filter_c * 7 + filter_w; + filter_pos1.y = filter_n1 * 7 + filter_h; + + filter_pos2.x = filter_c * 7 + filter_w; + filter_pos2.y = filter_n2 * 7 + filter_h; + + filter_pos3.x = filter_c * 7 + filter_w; + filter_pos3.y = filter_n3 * 7 + filter_h; + + filter[0] = read_imageh(filter_image, sampler, filter_pos0); + filter[1] = read_imageh(filter_image, sampler, filter_pos1); + filter[2] = read_imageh(filter_image, sampler, filter_pos2); + filter[3] = read_imageh(filter_image, sampler, filter_pos3); + + output.x += dot(input, filter[0]); + output.y += dot(input, filter[1]); + output.z += dot(input, filter[2]); + output.w += dot(input, filter[3]); + } + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); +} + +__kernel void conv_5x5(__private const int global_size_dim0, + __private const int global_size_dim1, + __private const int global_size_dim2, + __read_only image2d_t input_image, + __read_only image2d_t filter_image, + +#ifdef BIASE + __read_only image2d_t bias, +#endif + +#ifdef BATCH_NORM + __read_only image2d_t new_scale, + __read_only image2d_t new_biase, +#endif + + __write_only image2d_t output_image, + __private const int stride, + __private const int offset, + __private const int input_c, + __private const int dilation, + __private const int input_width,/* of one block */ + __private const int input_height,/* of one block */ + __private const int output_width, + __private const int output_height) { + + const int out_c = get_global_id(0); + const int out_w = get_global_id(1); + const int out_nh = get_global_id(2); + + if (out_c >= global_size_dim0 || + out_w >= global_size_dim1 || + out_nh >= global_size_dim2) { + return; + } + const filter_n0 = 4 * out_c + 0; + const 
filter_n1 = 4 * out_c + 1; + const filter_n2 = 4 * out_c + 2; + const filter_n3 = 4 * out_c + 3; + + int2 stride_xy; + stride_xy.x = stride; + stride_xy.y = stride; + + int2 ouput_pos_in_one_block; + ouput_pos_in_one_block.x = out_w; + ouput_pos_in_one_block.y = out_nh; + + + const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE | + CLK_ADDRESS_CLAMP | + CLK_FILTER_NEAREST; + + int2 in_pos_in_one_block; + in_pos_in_one_block.x = ouput_pos_in_one_block.x * stride + offset; + in_pos_in_one_block.y = ouput_pos_in_one_block.y * stride + offset; + +#ifdef BIASE + half4 output = read_imageh(bias, sampler, (int2)(out_c, 0)); +#else + half4 output = 0.0f; +#endif + + half4 input; + half4 filter[4]; + int2 filter_pos0; + int2 filter_pos1; + int2 filter_pos2; + int2 filter_pos3; + for (int i = 0; i < input_c; ++i) { + int2 pos_in = (int2)(i * input_width + in_pos_in_one_block.x, in_pos_in_one_block.y); + for(int j = 0; j < 5; j++){ + for(int k = 0; k < 5; k++){ + input = select(read_imageh(input_image, sampler, + (int2)(pos_in.x + (j - 2) * dilation, pos_in.y + (k - 2) * dilation)), + (half4)(0.0f), + (ushort4)((in_pos_in_one_block.x + (j - 2) * dilation < 0 || in_pos_in_one_block.y + (k - 2) * dilation < 0 || in_pos_in_one_block.x + (j - 2) * dilation >= input_width || in_pos_in_one_block.y + (k - 2) * dilation >= input_height) << 15)); + int filter_h = k; + int filter_w = j; + int filter_c = i; + + filter_pos0.x = filter_c * 5 + filter_w; + filter_pos0.y = filter_n0 * 5 + filter_h; + + filter_pos1.x = filter_c * 5 + filter_w; + filter_pos1.y = filter_n1 * 5 + filter_h; + + filter_pos2.x = filter_c * 5 + filter_w; + filter_pos2.y = filter_n2 * 5 + filter_h; + + filter_pos3.x = filter_c * 5 + filter_w; + filter_pos3.y = filter_n3 * 5 + filter_h; + + filter[0] = read_imageh(filter_image, sampler, filter_pos0); + filter[1] = read_imageh(filter_image, sampler, filter_pos1); + filter[2] = read_imageh(filter_image, sampler, filter_pos2); + filter[3] = read_imageh(filter_image, sampler, filter_pos3); + + output.x += dot(input, filter[0]); + output.y += dot(input, filter[1]); + output.z += dot(input, filter[2]); + output.w += dot(input, filter[3]); + } + } + } + +#ifdef BATCH_NORM + output = output * read_imageh(new_scale, sampler, (int2)(out_c, 0)) + read_imageh(new_biase, sampler, (int2)(out_c, 0)); +#endif + +#ifdef RELU + output = activation(output); +#endif + + write_imageh(output_image, (int2)(out_c * global_size_dim1 + out_w, out_nh), output); +} + diff --git a/src/operators/kernel/cl/cl_kernel/lrn_kernel.cl b/src/operators/kernel/cl/cl_kernel/lrn_kernel.cl new file mode 100644 index 0000000000000000000000000000000000000000..080928b23586b0aa3e639a0cc9b5577355863639 --- /dev/null +++ b/src/operators/kernel/cl/cl_kernel/lrn_kernel.cl @@ -0,0 +1,136 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
*/
+
+#pragma OPENCL EXTENSION cl_khr_fp16 : enable
+
+__kernel void lrn(__read_only image2d_t input_image,
+                  __write_only image2d_t output_image,
+                  __private const int out_C,
+                  __private const int out_W,
+                  __private const int n,
+                  __private const float k,
+                  __private const float alpha,
+                  __private const float beta){
+
+  const int out_c = get_global_id(0);
+  const int out_w = get_global_id(1);
+  const int out_nh = get_global_id(2);
+
+  const int out_c0 = out_c * 4;
+  const int out_c1 = out_c * 4 + 1;
+  const int out_c2 = out_c * 4 + 2;
+  const int out_c3 = out_c * 4 + 3;
+  const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
+                            CLK_ADDRESS_CLAMP |
+                            CLK_FILTER_NEAREST;
+
+  const int start = -(n - 1) / 2;
+  const int end = start + n;
+  float sqr_sum0 = 0.0f;
+  float sqr_sum1 = 0.0f;
+  float sqr_sum2 = 0.0f;
+  float sqr_sum3 = 0.0f;
+  int input_c0, input_c1, input_c2, input_c3;
+  int2 input_pos0, input_pos1, input_pos2, input_pos3;
+  float4 input0, input1, input2, input3;
+  for (int i = start; i < end; i++) {
+    if (out_c0 + i >= 0 && out_c0 + i < out_C) {
+      input_c0 = out_c0 + i;
+      input_pos0.x = (input_c0 / 4) * out_W + out_w;
+      input_pos0.y = out_nh;
+      input0 = read_imagef(input_image, sampler, input_pos0);
+      if (input_c0 % 4 == 0) {
+        sqr_sum0 += input0.x * input0.x;
+      } else if (input_c0 % 4 == 1) {
+        sqr_sum0 += input0.y * input0.y;
+      } else if (input_c0 % 4 == 2) {
+        sqr_sum0 += input0.z * input0.z;
+      } else {
+        sqr_sum0 += input0.w * input0.w;
+      }
+    }
+    if (out_c1 + i >= 0 && out_c1 + i < out_C) {
+      input_c1 = out_c1 + i;
+      input_pos1.x = (input_c1 / 4) * out_W + out_w;
+      input_pos1.y = out_nh;
+      input1 = read_imagef(input_image, sampler, input_pos1);
+      if (input_c1 % 4 == 0) {
+        sqr_sum1 += input1.x * input1.x;
+      } else if (input_c1 % 4 == 1) {
+        sqr_sum1 += input1.y * input1.y;
+      } else if (input_c1 % 4 == 2) {
+        sqr_sum1 += input1.z * input1.z;
+      } else {
+        sqr_sum1 += input1.w * input1.w;
+      }
+    }
+    if (out_c2 + i >= 0 && out_c2 + i < out_C) {
+      input_c2 = out_c2 + i;
+      input_pos2.x = (input_c2 / 4) * out_W + out_w;
+      input_pos2.y = out_nh;
+      input2 = read_imagef(input_image, sampler, input_pos2);
+      if (input_c2 % 4 == 0) {
+        sqr_sum2 += input2.x * input2.x;
+      } else if (input_c2 % 4 == 1) {
+        sqr_sum2 += input2.y * input2.y;
+      } else if (input_c2 % 4 == 2) {
+        sqr_sum2 += input2.z * input2.z;
+      } else {
+        sqr_sum2 += input2.w * input2.w;
+      }
+    }
+    if (out_c3 + i >= 0 && out_c3 + i < out_C) {
+      input_c3 = out_c3 + i;
+      input_pos3.x = (input_c3 / 4) * out_W + out_w;
+      input_pos3.y = out_nh;
+      input3 = read_imagef(input_image, sampler, input_pos3);
+      if (input_c3 % 4 == 0) {
+        sqr_sum3 += input3.x * input3.x;
+      } else if (input_c3 % 4 == 1) {
+        sqr_sum3 += input3.y * input3.y;
+      } else if (input_c3 % 4 == 2) {
+        sqr_sum3 += input3.z * input3.z;
+      } else {
+        sqr_sum3 += input3.w * input3.w;
+      }
+    }
+  }
+  int2 output_pos;
+  output_pos.x = out_c * out_W + out_w;
+  output_pos.y = out_nh;
+  float4 input = read_imagef(input_image, sampler, output_pos);
+  float4 output = input;
+  output.x = input.x / (pow(k + alpha * (sqr_sum0), beta));
+  if (out_C - 4 * out_c >= 2) {
+    output.y = input.y / (pow(k + alpha * (sqr_sum1), beta));
+  }
+  if (out_C - 4 * out_c >= 3) {
+    output.z = input.z / (pow(k + alpha * (sqr_sum2), beta));
+  }
+  if (out_C - 4 * out_c >= 4) {
+    output.w = input.w / (pow(k + alpha * (sqr_sum3), beta));
+  }
+  half4 tmp = convert_half4(output);
+  write_imageh(output_image, output_pos, tmp);
+
+}
\ No newline at end of file
diff --git a/src/operators/kernel/cl/cl_kernel/pool_kernel.cl b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
index fc660941f8863a0056c4618f0207ae69533d3242..a6a4da690fa921d281786fcddebf7362d3c52119 100644
--- a/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
+++ b/src/operators/kernel/cl/cl_kernel/pool_kernel.cl
@@ -31,11 +31,13 @@ __kernel void pool_max(
   const sampler_t sampler = CLK_NORMALIZED_COORDS_TRUE |
                             CLK_ADDRESS_CLAMP |
                             CLK_FILTER_NEAREST;
-  int start_h = max(out_h * stride_h - pad_top, 0);
+  int start_h = out_h * stride_h - pad_top;
   int end_h = min(start_h + ksize_h, in_height);
+  start_h = max(start_h, 0);
 
-  int start_w = max(out_w * stride_w - pad_left, 0);
+  int start_w = out_w * stride_w - pad_left;
   int end_w = min(start_w + ksize_w, in_width);
+  start_w = max(start_w, 0);
 
   const int pos_in_x = out_c * in_width;
   const int pos_in_y = out_n * in_height;
diff --git a/src/operators/kernel/cl/concat_kernel.cpp b/src/operators/kernel/cl/concat_kernel.cpp
index 3deb31e7aa0c408cc2b87c523d324001f75ade88..c8ff448b3be79c1acfac7e8cd4e32ea4e3c2b3f5 100644
--- a/src/operators/kernel/cl/concat_kernel.cpp
+++ b/src/operators/kernel/cl/concat_kernel.cpp
@@ -23,12 +23,15 @@ template <>
 bool ConcatKernel<GPU_CL, float>::Init(ConcatParam<GPU_CL> *param) {
   if (param->Out()->dims().size() < 4) {
     this->cl_helper_.AddKernel("concatByH", "concat_kernel.cl");
+  } else if (param->Out()->dims().size() == 4) {
+    this->cl_helper_.AddKernel("concatByC0", "concat_kernel.cl");
+    this->cl_helper_.AddKernel("concatByC", "concat_kernel.cl");
   }
   return true;
 }
 
 template <>
 void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
   if (param.Out()->dims().size() < 4) {
     auto kernel = this->cl_helper_.KernelAt(0);
     auto inputs = param.Inputs();
@@ -62,6 +65,76 @@ void ConcatKernel<GPU_CL, float>::Compute(const ConcatParam<GPU_CL> &param) {
       out_H_Start += inputs[i]->dims()[0];
     }
-  }
+  } else {
+    auto kernel0 = this->cl_helper_.KernelAt(0);
+    auto kernel1 = this->cl_helper_.KernelAt(1);
+    auto inputs = param.Inputs();
+    auto *output_image = param.Out()->GetCLImage();
+
+    int out_C_Start = 0;
+    auto input_image = inputs[0]->GetCLImage();
+    auto default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[0]);
+    int out_W = param.Out()->dims()[3];
+    cl_int status;
+    status = clSetKernelArg(kernel0, 0, sizeof(cl_mem), &input_image);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel0, 1, sizeof(cl_mem), &output_image);
+    CL_CHECK_ERRORS(status);
+    status = clSetKernelArg(kernel0, 2, sizeof(int), &out_W);
+    CL_CHECK_ERRORS(status);
+    status = clEnqueueNDRangeKernel(
+        this->cl_helper_.CLCommandQueue(), kernel0, default_work_size.size(),
+        NULL, default_work_size.data(), NULL, 0, NULL, NULL);
+    CL_CHECK_ERRORS(status);
+    out_C_Start += inputs[0]->dims()[1];
+    for (int i = 1; i < inputs.size(); i++) {
+      auto input_image1 = inputs[i - 1]->GetCLImage();
+      auto input_image2 = inputs[i]->GetCLImage();
+      default_work_size = this->cl_helper_.DefaultWorkSize(*inputs[i]);
+      int out_C = param.Out()->dims()[1];
+      int out_H = param.Out()->dims()[2];
+      int in_W = inputs[i]->dims()[3];
+      int in_H = inputs[i]->dims()[2];
+      int in_C1 = inputs[i - 1]->dims()[1];
+      int in_C2 = inputs[i]->dims()[1];
+      DLOG << "concat input " << i;
+      DLOG << "out_C=" << out_C;
+      DLOG << "out_H=" << out_H;
+      DLOG << "in_W=" << in_W;
+      DLOG << "in_H=" << in_H;
+      DLOG << "in_C1=" << in_C1;
+      DLOG << "in_C2=" << in_C2;
+      DLOG << "out_C_Start = " << out_C_Start;
+      status = clSetKernelArg(kernel1, 0, sizeof(cl_mem), &input_image1);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel1, 1, sizeof(cl_mem), &input_image2);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel1, 2, sizeof(cl_mem), &output_image);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel1, 3, sizeof(int), &out_C);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel1, 4, sizeof(int), &out_H);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel1, 5, sizeof(int), &out_W);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel1, 6, sizeof(int), &out_C_Start);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel1, 7, sizeof(int), &in_W);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel1, 8, sizeof(int), &in_H);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel1, 9, sizeof(int), &in_C1);
+      CL_CHECK_ERRORS(status);
+      status = clSetKernelArg(kernel1, 10, sizeof(int), &in_C2);
+      CL_CHECK_ERRORS(status);
+
+      status = clEnqueueNDRangeKernel(
+          this->cl_helper_.CLCommandQueue(), kernel1, default_work_size.size(),
+          NULL, default_work_size.data(), NULL, 0, NULL, NULL);
+      CL_CHECK_ERRORS(status);
+
+      out_C_Start += inputs[i]->dims()[1];
+    }
+  }
 }
diff --git a/src/operators/kernel/cl/conv_add_kernel.cpp b/src/operators/kernel/cl/conv_add_kernel.cpp
index 3292cc7ccd2febc4d1e5b8f5e4991f8348b25196..9485644dea3fbbfb983ca104e6dbc04832e2afe6 100644
--- a/src/operators/kernel/cl/conv_add_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_kernel.cpp
@@ -51,8 +51,18 @@ bool ConvAddKernel<GPU_CL, float>::Init(FusionConvAddParam<GPU_CL> *param) {
 
     this->cl_helper_.AddKernel("conv_3x3", "conv_add_kernel.cl");
+  } else if (param->Filter()->dims()[2] == 7 &&
+             param->Filter()->dims()[3] == 7) {
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_7x7", "conv_add_kernel.cl");
+  } else if (param->Filter()->dims()[2] == 5 &&
+             param->Filter()->dims()[3] == 5) {
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_5x5", "conv_add_kernel.cl");
   } else {
     PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
   }
 
   return true;
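For reference, this is a minimal CPU sketch of the cross-channel normalization that the `lrn` OpenCL kernel above computes. It is not part of the patch; `lrn_reference`, the NCHW layout, and the flat `std::vector` storage are illustrative assumptions, while the window parameters mirror the kernel arguments (`n`, `k`, `alpha`, `beta`):

```cpp
#include <cmath>
#include <cstddef>
#include <vector>

// out[b][c][h][w] = in[b][c][h][w] /
//                   (k + alpha * sum_{j in window(c)} in[b][j][h][w]^2)^beta,
// where window(c) spans n channels around c, clamped to [0, C).
void lrn_reference(const std::vector<float> &in, std::vector<float> &out,
                   int N, int C, int H, int W,
                   int n, float k, float alpha, float beta) {
  const int start = -(n - 1) / 2;  // same window origin as the kernel's `start`
  for (int b = 0; b < N; ++b) {
    for (int c = 0; c < C; ++c) {
      for (int h = 0; h < H; ++h) {
        for (int w = 0; w < W; ++w) {
          float sqr_sum = 0.0f;
          for (int i = start; i < start + n; ++i) {
            const int cc = c + i;
            if (cc >= 0 && cc < C) {  // skip window taps outside the tensor
              const float v = in[((b * C + cc) * H + h) * W + w];
              sqr_sum += v * v;
            }
          }
          const std::size_t idx = ((b * C + c) * H + h) * W + w;
          out[idx] = in[idx] / std::pow(k + alpha * sqr_sum, beta);
        }
      }
    }
  }
}
```

The GPU kernel computes the same quantity for a float4-packed pixel, which is why it keeps four running sums (`sqr_sum0`..`sqr_sum3`) and guards `output.y/z/w` with `out_C - 4 * out_c` checks for a partially filled last channel group.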
diff --git a/src/operators/kernel/cl/conv_add_relu_kernel.cpp b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
index 814cff634cb0c4c2d5dd6e6706b558bb1cd64f22..88de4ae2e308f2b55020c314d18551ebe8ae1ea7 100644
--- a/src/operators/kernel/cl/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/cl/conv_add_relu_kernel.cpp
@@ -52,6 +52,16 @@ bool ConvAddReluKernel<GPU_CL, float>::Init(
 
     this->cl_helper_.AddKernel("conv_3x3", "conv_add_relu_kernel.cl");
+  } else if (param->Filter()->dims()[2] == 7 &&
+             param->Filter()->dims()[3] == 7) {
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_7x7", "conv_add_relu_kernel.cl");
+  } else if (param->Filter()->dims()[2] == 5 &&
+             param->Filter()->dims()[3] == 5) {
+    param->Filter()->InitCLImage(cl_helper_.CLContext(),
+                                 cl_helper_.CLCommandQueue());
+    this->cl_helper_.AddKernel("conv_5x5", "conv_add_relu_kernel.cl");
   } else {
     PADDLE_MOBILE_THROW_EXCEPTION(" not support ");
   }
diff --git a/src/operators/kernel/cl/fusion_fc_kernel.cpp b/src/operators/kernel/cl/fusion_fc_kernel.cpp
new file mode 100644
index 0000000000000000000000000000000000000000..7d85becea601878de577b59a5c671b3ea04f9370
--- /dev/null
+++ b/src/operators/kernel/cl/fusion_fc_kernel.cpp
@@ -0,0 +1,123 @@
+/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved.
+
+Licensed under the Apache License, Version 2.0 (the "License");
+you may not use this file except in compliance with the License.
+You may obtain a copy of the License at
+
+    http://www.apache.org/licenses/LICENSE-2.0
+
+Unless required by applicable law or agreed to in writing, software
+distributed under the License is distributed on an "AS IS" BASIS,
+WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+See the License for the specific language governing permissions and
+limitations under the License. */
+
+#ifdef FUSION_FC_OP
+
+#include "operators/kernel/fusion_fc_kernel.h"
+#include "operators/math/math_function.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool FusionFcKernel<GPU_CL, float>::Init(FusionFcParam<GPU_CL> *param) {
+  param->InputY()->InitNormalCLImage(cl_helper_.CLContext(),
+                                     this->cl_helper_.CLCommandQueue());
+  param->InputZ()->InitNormalCLImage(cl_helper_.CLContext(),
+                                     this->cl_helper_.CLCommandQueue());
+  this->cl_helper_.AddKernel("fetch", "fetch_kernel.cl");
+  this->cl_helper_.AddKernel("feed", "feed_kernel.cl");
+  return true;
+}
+
+template <typename P>
+void FusionFcCompute(const FusionFcParam<GPU_CL> &param, cl_context context,
+                     cl_command_queue commandQueue, cl_kernel kernel0,
+                     cl_kernel kernel1) {
+  auto *input_x_image = param.InputX();
+  auto *input_y_image = param.InputY();
+  auto *input_z_image = param.InputZ();
+
+  int axis = param.Axis();
+  auto *out_image = param.Out();
+
+  Tensor *input_x = new Tensor();
+  input_x->Resize(input_x_image->dims());
+  input_x->mutable_data<float>();
+  framework::CLImageToTensor(input_x_image, input_x, context, commandQueue,
+                             kernel0);
+
+  Tensor *input_y = new Tensor();
+  input_y->Resize(input_y_image->dims());
+  input_y->mutable_data<float>();
+  framework::CLImageToTensor(input_y_image, input_y, context, commandQueue,
+                             kernel0);
+
+  Tensor *input_z = new Tensor();
+  input_z->Resize(input_z_image->dims());
+  input_z->mutable_data<float>();
+  framework::CLImageToTensor(input_z_image, input_z, context, commandQueue,
+                             kernel0);
+  auto *input_z_data = input_z->data<float>();
+
+  DLOG << *input_x;
+  DLOG << *input_y;
+  DLOG << *input_z;
+
+  Tensor *out = new Tensor();
+  out->Resize(out_image->dims());
+  out->mutable_data<float>();
+  auto *out_data = out->mutable_data<float>();
+
+  const Tensor x_matrix =
+      input_x->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_x, param.XNumColDims())
+          : *input_x;
+  const Tensor y_matrix =
+      input_y->dims().size() > 2
+          ? framework::ReshapeToMatrix(*input_y, param.YNumColDims())
+          : *input_y;
+  auto out_dim = out->dims();
+  if (out_dim.size() != 2) {
+    out->Resize({x_matrix.dims()[0], y_matrix.dims()[1]});
+  }
+  PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2.");
+  PADDLE_MOBILE_ENFORCE(input_z->dims().size() == 1, "input_z size must be 1");
+  PADDLE_MOBILE_ENFORCE(out_dim[1] == input_z->dims()[0],
+                        " out_dim[1] must equal the bias length.");
+  axis = (axis == -1 ? out_dim.size() - input_z->dims().size() : axis);
+  PADDLE_MOBILE_ENFORCE(axis == 1, " to fit broadcast, axis = 1.
"); + + int64_t classes = input_z->numel(); + for (int i = 0; i < out_dim[0]; i++) { + memory::Copy(out_data + i * classes, input_z_data, sizeof(float) * classes); + } + + // for (int i = 0; i < out->numel(); i++) { + // DLOG << out_data[i]; + // } + // bias_data的维度和out的维度一致 + math::matmul(x_matrix, false, y_matrix, false, static_cast(1), + out, static_cast(1), false); + + out_image->InitEmptyImage(context, commandQueue, out->dims()); + framework::TensorToCLImage(out, out_image, context, commandQueue, kernel1); + + DLOG << *out; + + delete (input_x); + delete (input_y); + delete (input_z); + delete (out); + PADDLE_MOBILE_ENFORCE(out_dim.size() == 2, " out_dim.size must be 2."); + // if (out_dim.size() != 2) { + // out->Resize(out_dim); + // } +} +template <> +void FusionFcKernel::Compute( + const FusionFcParam ¶m) { + auto kernel0 = this->cl_helper_.KernelAt(0); + auto kernel1 = this->cl_helper_.KernelAt(1); + FusionFcCompute(param, this->cl_helper_.CLContext(), + this->cl_helper_.CLCommandQueue(), kernel0, kernel1); +} + +} // namespace operators +} // namespace paddle_mobile + +#endif diff --git a/src/operators/kernel/cl/lrn_kernel.cpp b/src/operators/kernel/cl/lrn_kernel.cpp new file mode 100644 index 0000000000000000000000000000000000000000..e7e949e5ab5e8a8c8e17d76ee839767173251edc --- /dev/null +++ b/src/operators/kernel/cl/lrn_kernel.cpp @@ -0,0 +1,79 @@ +/* Copyright (c) 2018 PaddlePaddle Authors. All Rights Reserved. + +Licensed under the Apache License, Version 2.0 (the "License"); +you may not use this file except in compliance with the License. +You may obtain a copy of the License at + + http://www.apache.org/licenses/LICENSE-2.0 + +Unless required by applicable law or agreed to in writing, software +distributed under the License is distributed on an "AS IS" BASIS, +WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +See the License for the specific language governing permissions and +limitations under the License. 
+
+#ifdef LRN_OP
+
+#include "operators/kernel/lrn_kernel.h"
+
+namespace paddle_mobile {
+namespace operators {
+
+template <>
+bool LrnKernel<GPU_CL, float>::Init(LrnParam<GPU_CL> *param) {
+  this->cl_helper_.AddKernel("lrn", "lrn_kernel.cl");
+  return true;
+}
+
+template <>
+void LrnKernel<GPU_CL, float>::Compute(const LrnParam<GPU_CL> &param) {
+  auto kernel = this->cl_helper_.KernelAt(0);
+  auto default_work_size = this->cl_helper_.DefaultWorkSize(*param.Out());
+
+  auto input_image = param.InputX()->GetCLImage();
+  auto x_dims = param.InputX()->dims();
+  auto output_image = param.Out()->GetCLImage();
+
+  const int N = x_dims[0];
+  const int C = x_dims[1];
+  const int H = x_dims[2];
+  const int W = x_dims[3];
+
+  const int n = param.N();
+  const float alpha = param.Alpha();
+  const float beta = param.Beta();
+  const float k = param.K();
+  DLOG << "n=" << n;
+  DLOG << "alpha=" << alpha;
+  DLOG << "beta=" << beta;
+  DLOG << "k=" << k;
+  DLOG << default_work_size;
+  DLOG << C;
+  DLOG << W;
+  cl_int status;
+  status = clSetKernelArg(kernel, 0, sizeof(cl_mem), &input_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 1, sizeof(cl_mem), &output_image);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 2, sizeof(int), &C);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 3, sizeof(int), &W);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 4, sizeof(int), &n);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 5, sizeof(float), &k);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 6, sizeof(float), &alpha);
+  CL_CHECK_ERRORS(status);
+  status = clSetKernelArg(kernel, 7, sizeof(float), &beta);
+  CL_CHECK_ERRORS(status);
+
+  status = clEnqueueNDRangeKernel(
+      this->cl_helper_.CLCommandQueue(), kernel, default_work_size.size(), NULL,
+      default_work_size.data(), NULL, 0, NULL, NULL);
+  CL_CHECK_ERRORS(status);
+}
+
+}  // namespace operators
+}  // namespace paddle_mobile
+
+#endif
diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
index 7c03daf7797dbc09ba85a4f4e32e983571d192df..82cb872055aed84d28c798e413b86478de6ca0a6 100644
--- a/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_kernel.cpp
@@ -58,7 +58,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
 
-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
 
   fpga::SplitConvArgs conv_arg = {0};
   fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
diff --git a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
index 8737554e6f8c343491656ca7659e1850d84ea246..266ebe012e0db3ef3b2ac21f81f4436d143ece59 100644
--- a/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_bn_relu_kernel.cpp
@@ -56,7 +56,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
 
-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
 
   fpga::SplitConvArgs conv_arg = {0};
   fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
diff --git a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
index 22841e705c255433bebeab479a2e2b8d3a3b7187..e9c5032779b4e6b63f82355cd2a5634c1fae88de 100644
---
a/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_kernel.cpp
@@ -38,7 +38,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
     bs_ptr[i] = bias_ptr[i];
   }
 
-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
 
   fpga::SplitConvArgs conv_arg = {0};
   fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
diff --git a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
index a3c4443645e421ee0dce10f53914600fb7af75bf..1002a358434046b05fee41b60281cc594a093808 100644
--- a/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_add_relu_kernel.cpp
@@ -38,7 +38,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
     bs_ptr[i] = bias_ptr[i];
   }
 
-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
 
   fpga::SplitConvArgs conv_arg = {0};
   fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
diff --git a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
index 070fce98b9e5f0c7055943447602dba8ae78c7c4..cb32c0fe040b9c55de660269fbfc3598ea9722bf 100644
--- a/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_kernel.cpp
@@ -50,7 +50,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
 
-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
 
   fpga::SplitConvArgs conv_arg = {0};
   fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
diff --git a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
index 95ac74cbf87fe20ef419e748f8a8a04df20c98e3..918b65bd347811f9a2cc6b1182c54d9f39a9082e 100644
--- a/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
+++ b/src/operators/kernel/fpga/V2/conv_bn_relu_kernel.cpp
@@ -15,6 +15,7 @@ limitations under the License. */
 #ifdef FUSION_CONVBNRELU_OP
 
 #include "operators/kernel/conv_bn_relu_kernel.h"
+#include "fpga/V2/filter.h"
 
 namespace paddle_mobile {
 namespace operators {
@@ -50,7 +51,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
   param->SetNewScale(new_scale);
   param->SetNewBias(new_bias);
 
-  fpga::format_conv_data(filter, out, bs_ptr, param->Groups());
+  fpga::format_conv_data(filter, out, &bs_ptr, param->Groups());
 
   fpga::SplitConvArgs conv_arg = {0};
   fpga::fill_split_arg(&conv_arg, input, out, filter, relu_enabled,
diff --git a/src/operators/lrn_op.cpp b/src/operators/lrn_op.cpp
index faa9ccb6132e70e01e5c076554455d9424c68086..b63d2f2fbe594fc35cd580ea772562a263c97bd5 100644
--- a/src/operators/lrn_op.cpp
+++ b/src/operators/lrn_op.cpp
@@ -14,7 +14,7 @@ limitations under the License.
*/ #ifdef LRN_OP -#include "lrn_op.h" +#include "operators/lrn_op.h" namespace paddle_mobile { namespace operators { @@ -32,6 +32,9 @@ namespace ops = paddle_mobile::operators; #ifdef PADDLE_MOBILE_CPU REGISTER_OPERATOR_CPU(lrn, ops::LrnOp); #endif +#ifdef PADDLE_MOBILE_CL +REGISTER_OPERATOR_CL(lrn, ops::LrnOp); +#endif #ifdef PADDLE_MOBILE_MALI_GPU REGISTER_OPERATOR_MALI_GPU(lrn, ops::LrnOp); #endif diff --git a/src/operators/math/gemm.h b/src/operators/math/gemm.h index e409fe07dc55bcf68748f0f25b3b63480d25cd56..8498992fcecbcb2c9a773fba874e108c013a04fc 100644 --- a/src/operators/math/gemm.h +++ b/src/operators/math/gemm.h @@ -23,12 +23,10 @@ limitations under the License. */ #if __aarch64__ #define MR_INT8 4 -#define NR_INT8 2 #define MR 6 #define NR 16 #else #define MR_INT8 4 -#define NR_INT8 2 #define MR 6 #define NR 8 #endif @@ -195,58 +193,52 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, // 8 bits int small block inner product void AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, int32_t ldc); - void AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, - int32_t ldc); void AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, int32_t ldc); // 8 bits int inner product - void InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a, - const int8_t *b, float beta, int32_t *c, int32_t *C, - int32_t ldc, bool relu); - void InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, const int8_t *a, - const int8_t *b, float beta, int32_t *c, int8_t *C, - int32_t ldc, bool relu, int32_t *bias); + void InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha, + const int8_t *a, const int8_t *b, int8_t beta, + int32_t *c, int32_t *C, int32_t ldc, bool relu, + int8_t *bias); // 8 bits int pack function void PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, int32_t lda, int8_t *buffer); - void PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, - int32_t lda, int8_t *buffer); void PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, int32_t lda, int8_t *buffer); - void PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, - int32_t ldb, int8_t *buffer); void PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, int32_t ldb, int8_t *buffer); void PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, int32_t lda, int8_t *buffer); void PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, int32_t ldb, int8_t *buffer); - void PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t m_tail, - const int8_t *A, int32_t lda, int8_t *buffer); - void PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail, - const int8_t *B, int32_t ldb, int8_t *buffer); // 8 bits int matrix product - void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, - int32_t lda, const int8_t *B, int32_t ldb, float beta, int32_t *C, - int32_t ldc, bool relu, int32_t *bias); - void Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, - int32_t lda, const int8_t *B, int32_t ldb, float beta, int8_t *C, - int32_t ldc, bool relu, int32_t *bias); - void Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, - int32_t lda, const int8_t *B, int32_t ldb, float beta, - int32_t *C, int32_t ldc, bool relu, int32_t *bias); + void Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, + int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, int32_t *C, + int32_t ldc, bool 
relu, int8_t *bias); + void Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, + int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, + int32_t *C, int32_t ldc, bool relu, int8_t *bias); // 8 bits int write back + // C = alpha * A * B + beta * C + void WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc); // C = A * B void WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc); - // C = A * B + bias, scale * relu(C) - void WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, - int32_t ldc, int32_t *bias, float scale); - // C = A * B + bias, scale * C - void WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, - int32_t ldc, int32_t *bias, float scale); + // C = A * B + C + void WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc); + // C = A * B + bias + void WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc, int8_t *bias); + // C = A * B + C, relu(C) + void WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc); + // C = A * B + bias, relu(C) + void WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc, int8_t *bias); private: int MC = 0; @@ -262,7 +254,7 @@ void PackMatrixB(int k, int n, int n_tail, const float *B, int ldb, // 8 bits int int8_t *packedA_int8; int8_t *packedB_int8; - int32_t *packedC_int32; + int32_t *packedC_int8; int8_t *zero_int8; }; diff --git a/src/operators/math/gemm_int8.cpp b/src/operators/math/gemm_int8.cpp index 555672720f2be51631ea10808ce6891b08df0721..b16db7fe6acf0c3c7fb2902c9fb3f6e3dc81a65f 100644 --- a/src/operators/math/gemm_int8.cpp +++ b/src/operators/math/gemm_int8.cpp @@ -18,8 +18,6 @@ limitations under the License. */ #include "operators/math/gemm.h" #if __ARM_NEON #include -#include - #endif #ifdef _OPENMP #include @@ -64,7 +62,7 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, "pld [%[b_ptr], #128] \n\t" "vld1.s8 {d0-d3}, [%[a_ptr]]! \n\t" // load A 8 cols "vld1.s8 {d8-d11}, [%[b_ptr]]! 
\n\t" // load B first 4 rows - "vmovl.s8 q2, d0 \n\t" // process B first + "vmovl.s8 q2, d0 \n\t" // process B first 4 // rows "vmovl.s8 q3, d8 \n\t" "vmlal.s16 q8, d6, d4[0]\n\t" @@ -243,132 +241,6 @@ void Gemm::AddDot4x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, #endif // __ARM_NEON } -void Gemm::AddDot4x2(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, - int32_t ldc) { -#if __ARM_NEON -#if __aarch64__ -// TODO(wzzju) -#else -#define PADDLE_LABEL_LOOP "1" -#define PADDLE_LABEL_AFTER_LOOP "2" - asm volatile( - "lsl %[ldc], %[ldc], #2 \n\t" // sizeof(int32) == 4 - "vldr d0, [%[b], #0] \n\t" - "vmov.s32 q8, #0 \n\t" - "vldr d4, [%[a], #0] \n\t" - "vmov.s32 q9, q8 \n\t" - "vldr d2, [%[b], #16] \n\t" - "vmov.s32 q10, q8 \n\t" - "vldr d6, [%[a], #16] \n\t" - "vmov.s32 q11, q8 \n\t" - "vldr d1, [%[b], #8]\n\t" - "vmov.s32 q12, q8 \n\t" - "vldr d5, [%[a], #8]\n" - "vmov.s32 q13, q8 \n\t" - "vldr d3, [%[b], #24]\n\t" - "vmov.s32 q14, q8 \n\t" - "vldr d7, [%[a], #24]\n" - "vmov.s32 q15, q8 \n\t" - - PADDLE_LABEL_LOOP - ": \n\t" - "vmull.s8 q4, d0, d4 \n\t" // first half - "add %[b], %[b], #32 \n\t" - "vmull.s8 q5, d2, d4 \n\t" - "vldr d4, [%[a], #32] \n\t" - "vmull.s8 q6, d0, d6 \n\t" - "vmull.s8 q7, d2, d6 \n\t" - "vldr d6, [%[a], #48] \n\t" - - "vmlal.s8 q4, d1, d5 \n\t" // second half - "vmlal.s8 q5, d3, d5 \n\t" - "vldr d5, [%[a], #40] \n\t" - "vmlal.s8 q6, d1, d7 \n\t" - "vmlal.s8 q7, d3, d7 \n\t" - "vldr d7, [%[a], #56] \n\t" - - "vpadal.s16 q8, q4 \n\t" // pairwise-add - "add %[a], %[a], #64 \n\t" - "vpadal.s16 q9, q5 \n\t" - "subs %[k], %[k], #16 \n\t" - "vpadal.s16 q10, q6 \n\t" - "vpadal.s16 q11, q7 \n\t" - - "beq " PADDLE_LABEL_AFTER_LOOP - "f \n\t" - - "vmull.s8 q4, d0, d4 \n\t" // first half - "vmull.s8 q5, d2, d4 \n\t" - "vldr d4, [%[a], #0] \n\t" - "vmull.s8 q6, d0, d6 \n\t" - "vldr d0, [%[b], #0] \n\t" - "vmull.s8 q7, d2, d6 \n\t" - "vldr d2, [%[b], #16] \n\t" - - "vmlal.s8 q4, d1, d5 \n\t" // second half - "vldr d6, [%[a], #16] \n\t" - "vmlal.s8 q5, d3, d5 \n\t" - "vldr d5, [%[a], #8] \n\t" - "vmlal.s8 q6, d1, d7 \n\t" - "vldr d1, [%[b], #8] \n\t" - "vmlal.s8 q7, d3, d7 \n\t" - "vldr d3, [%[b], #24] \n\t" - - "vpadal.s16 q12, q4 \n\t" // pairwise-add - "vldr d7, [%[a], #24] \n\t" - "vpadal.s16 q13, q5 \n\t" - "vpadal.s16 q14, q6 \n\t" - "vpadal.s16 q15, q7 \n\t" - - "b " PADDLE_LABEL_LOOP "b \n\t" - - PADDLE_LABEL_AFTER_LOOP - ": \n\t" - "vmull.s8 q4, d0, d4 \n\t" // first half - "vmull.s8 q5, d2, d4 \n\t" - "vmull.s8 q6, d0, d6 \n\t" - "vmull.s8 q7, d2, d6 \n\t" - - "vmlal.s8 q4, d1, d5 \n\t" // second half - "vmlal.s8 q5, d3, d5 \n\t" - "vmlal.s8 q6, d1, d7 \n\t" - "vmlal.s8 q7, d3, d7 \n\t" - - "vpadal.s16 q12, q4 \n\t" // pairwise-add - "vpadal.s16 q13, q5 \n\t" - "vpadal.s16 q14, q6 \n\t" - "vpadal.s16 q15, q7 \n\t" - - "vpadd.s32 d0, d16, d17 \n\t" // reduce to int32 - "vpadd.s32 d1, d18, d19 \n\t" - "vpadd.s32 d2, d20, d21 \n\t" - "vpadd.s32 d3, d22, d23 \n\t" - "vpadd.s32 d4, d24, d25 \n\t" - "vpadd.s32 d5, d26, d27 \n\t" - "vpadd.s32 d6, d28, d29 \n\t" - "vpadd.s32 d7, d30, d31 \n\t" - - "vpadd.s32 d8, d0, d1 \n\t" // reduce to int32 again - "vpadd.s32 d9, d2, d3 \n\t" - "vpadd.s32 d10, d4, d5 \n\t" - "vpadd.s32 d11, d6, d7 \n\t" - - "vst1.32 {d8}, [%[c]], %[ldc] \n\t" - "vst1.32 {d9}, [%[c]], %[ldc] \n\t" - "vst1.32 {d10}, [%[c]], %[ldc] \n\t" - "vst1.32 {d11}, [%[c]] \n\t" - - : [k] "+r"(k), [a] "+r"(a), [b] "+r"(b), [c] "+r"(c) - : [ldc] "r"(ldc) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q5", "q6", "q7", "q8", - "q9", "q10", "q11", 
"q12", "q13", "q14", "q15"); -#undef PADDLE_LABEL_AFTER_LOOP -#undef PADDLE_LABEL_LOOP - -#endif // __aarch64__ -#endif // __ARM_NEON -} - // 8 bits int small block inner product void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, int32_t ldc) { @@ -667,213 +539,51 @@ void Gemm::AddDot6x8(int32_t k, const int8_t *a, const int8_t *b, int32_t *c, } // 8 bits int inner product -void Gemm::InnerKernel(int32_t mc, int32_t nc, float alpha, const int8_t *a, - const int8_t *b, float beta, int32_t *c, int32_t *C, - int32_t ldc, bool relu) { +void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, int8_t alpha, + const int8_t *a, const int8_t *b, int8_t beta, + int32_t *c, int32_t *C, int32_t ldc, bool relu, + int8_t *bias) { #pragma omp parallel for - for (int32_t j = 0; j < nc; j += NR_INT8) { + for (int32_t j = 0; j < nc; j += NR) { for (int32_t i = 0; i < mc; i += MR_INT8) { #if __aarch64__ // TODO(wzzju) #else // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); + AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); #endif // __aarch64__ } } - if (!relu) { - WriteBasic(mc, nc, c, C, ldc); + if (alpha != 1) { + WriteWithAlphaBeta(mc, nc, c, C, ldc); return; } -} - -void Gemm::InnerKernelWithBias(int32_t mc, int32_t nc, float alpha, - const int8_t *a, const int8_t *b, float beta, - int32_t *c, int8_t *C, int32_t ldc, bool relu, - int32_t *bias) { -#pragma omp parallel for - for (int32_t j = 0; j < nc; j += NR_INT8) { - for (int32_t i = 0; i < mc; i += MR_INT8) { -#if __aarch64__ - // TODO(wzzju) -#else - // AddDot6x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - // AddDot4x8(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); - AddDot4x2(KC, a + i * KC, b + j * KC, c + i * NC + j, NC); -#endif // __aarch64__ - } - } - if (relu) { - WriteWithAddReluScale(mc, nc, c, C, ldc, bias, alpha); + if (beta == 0) { + WriteBasic(mc, nc, c, C, ldc); return; - } else { - WriteWithAddScale(mc, nc, c, C, ldc, bias, alpha); } -} - -// 8 bits int PackMatrixA_4r -void Gemm::PackMatrixA_4r_16(int32_t m, int32_t k, int32_t m_tail, - const int8_t *A, int32_t lda, int8_t *buffer) { - const int32_t i_length = m - m_tail; - const int32_t k_count = k >> 4; - const int32_t k_tail = k & 15; - - for (int32_t i = 0; i < i_length; i += 4) { - const int8_t *a0 = A + i * lda; - const int8_t *a1 = A + (i + 1) * lda; - const int8_t *a2 = A + (i + 2) * lda; - const int8_t *a3 = A + (i + 3) * lda; - int8_t *local_buffer = buffer + i * KC; - for (int32_t j = 0; j < k_count; ++j) { -#if __ARM_NEON -#if __aarch64__ - // TODO(wzzju) -#else - asm volatile( - "vld1.s8 {d0, d1}, [%[a0]]! \n\t" - "vld1.s8 {d2, d3}, [%[a1]]! \n\t" - "vld1.s8 {d4, d5}, [%[a2]]! \n\t" - "vld1.s8 {d6, d7}, [%[a3]]! \n\t" - "vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t" - "vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t" - "vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t" - "vst1.s8 {d6, d7}, [%[local_buffer]]! 
\n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "q0", "q1", "q2", "q3"); -#endif // __aarch64__ -#else - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a0++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a1++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a2++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a3++; - } -#endif // __ARM_NEON - } - if (k_tail != 0) { - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a0++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a1++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a2++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a3++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } + if (beta == 1 && !relu) { + if (bias == nullptr) { + WriteWithAdd(mc, nc, c, C, ldc); + } else { + WriteWithAddV1(mc, nc, c, C, ldc, bias); } + return; } - - if (m_tail != 0) { - const int8_t *a0 = &A(i_length, 0); - const int8_t *a1 = a0 + lda; - const int8_t *a2 = a0 + 2 * lda; - const int8_t *a3 = a0 + 3 * lda; - int8_t *local_buffer = buffer + i_length * KC; - switch (m_tail) { - case 1: - a1 = zero_int8; - case 2: - a2 = zero_int8; - case 3: - a3 = zero_int8; - break; - default: - break; - } - for (int32_t j = 0; j < k_count; ++j) { -#if __ARM_NEON -#if __aarch64__ - // TODO(wzzju) -#else - asm volatile( - "vld1.s8 {d0, d1}, [%[a0]]! \n\t" - "vld1.s8 {d2, d3}, [%[a1]]! \n\t" - "vld1.s8 {d4, d5}, [%[a2]]! \n\t" - "vld1.s8 {d6, d7}, [%[a3]]! \n\t" - "vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t" - "vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t" - "vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t" - "vst1.s8 {d6, d7}, [%[local_buffer]]! 
\n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "q0", "q1", "q2", "q3"); -#endif // __aarch64__ -#else - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a0++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a1++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a2++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a3++; - } -#endif // __ARM_NEON - } - if (k_tail != 0) { - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a0++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a1++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a2++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a3++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } + if (beta == 1 && relu) { + if (bias == nullptr) { + WriteWithAddRelu(mc, nc, c, C, ldc); + } else { + WriteWithAddReluV1(mc, nc, c, C, ldc, bias); } + return; } } - // 8 bits int PackMatrixA_4r void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, int32_t lda, int8_t *buffer) { const int8_t *a0, *a1, *a2, *a3; - for (int32_t i = 0; i < m - m_tail; i += 4) { + for (int32_t i = 0; i < m - m_tail; i += MR_INT8) { a0 = A + i * lda; a1 = A + (i + 1) * lda; a2 = A + (i + 2) * lda; @@ -915,7 +625,7 @@ void Gemm::PackMatrixA_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, int32_t lda, int8_t *buffer) { const int32_t i_length = m - m_tail; - for (int32_t i = 0; i < i_length; i += 6) { + for (int32_t i = 0; i < i_length; i += MR_INT8) { const int8_t *a0 = A + i * lda; const int8_t *a1 = A + (i + 1) * lda; const int8_t *a2 = A + (i + 2) * lda; @@ -966,79 +676,11 @@ void Gemm::PackMatrixA_6r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, } } -// 8 bits int PackMatrixB -void Gemm::PackMatrixB_2c_16(int32_t k, int32_t n, int32_t n_tail, - const int8_t *B, int32_t ldb, int8_t *buffer) { - const int32_t j_length = n - n_tail; - const int32_t k_count = k >> 4; - const int32_t k_tail = k & 15; - for (int32_t j = 0; j < j_length; j += 2) { - int8_t *local_buffer = buffer + j * KC; - for (int32_t i = 0; i < k_count; ++i) { - const int8_t *b0 = &B((i << 4), j); - const int8_t *b1 = &B((i << 4), j + 1); - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b1; - b1 += ldb; - } - } - if (k_tail != 0) { - const int8_t *b0 = &B((k_count << 4), j); - const int8_t *b1 = &B((k_count << 4), j + 1); - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b1; - b1 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } - if (n_tail != 0) { - int8_t *local_buffer = buffer + j_length * KC; - for (int32_t i = 0; i < k_count; ++i) { - const int8_t *b0 = &B((i << 4), j_length); - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = 0; - } - } - if (k_tail != 0) { - const int8_t *b0 = &B((k_count << 4), j_length); - 
for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } -} - // 8 bits int PackMatrixB void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, int32_t ldb, int8_t *buffer) { const int32_t j_length = n - n_tail; - for (int32_t j = 0; j < j_length; j += 8) { + for (int32_t j = 0; j < j_length; j += NR) { int8_t *local_buffer = buffer + j * k; for (int32_t i = 0; i < k; ++i) { const int8_t *b0 = &B(i, j); @@ -1073,7 +715,7 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, for (int32_t j = j_length; j < n; ++j) { *local_buffer++ = *b0++; } - for (int32_t j = n; j < j_length + 8; ++j) { + for (int32_t j = n; j < j_length + NR; ++j) { *local_buffer++ = 0; } } @@ -1081,20 +723,19 @@ void Gemm::PackMatrixB_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, } // 8 bits int matrix product (m*k x k*n) -void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, - int32_t lda, const int8_t *B, int32_t ldb, float beta, - int32_t *C, int32_t ldc, bool relu, int32_t *bias) { +void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, + int32_t lda, const int8_t *B, int32_t ldb, int8_t beta, + int32_t *C, int32_t ldc, bool relu, int8_t *bias) { // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) // L2 cache is 0.5~4 Mib (Contex-A72 cluster) int32_t L1 = 32 * 1024; int32_t L2 = 512 * 1024; - const int32_t k_complete = (k + 15) - ((k + 15) & 15); - KC = k_complete; + KC = k; MC = L1 / (KC * sizeof(int8_t)); NC = L2 / (KC * sizeof(int8_t)); - // make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8 + // make sure MC is multiple of MR_INT8, and NC is multiple of NR if (MC == 0) { MC = MR_INT8; } else { @@ -1104,106 +745,52 @@ void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, } // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; if (NC == 0) { - NC = NR_INT8; + NC = NR; } else { int32_t nblock_num = (n + NC - 1) / NC; NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8; + NC = (NC + NR - 1) / NR * NR; } // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; packedA_int8 = static_cast( paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC)); packedB_int8 = static_cast( paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC)); - packedC_int32 = static_cast( + packedC_int8 = static_cast( paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC)); zero_int8 = - static_cast(paddle_mobile::memory::Alloc(sizeof(int8_t) * k)); + static_cast(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC)); - memset(static_cast(zero_int8), 0, sizeof(int8_t) * k); + memset(static_cast(zero_int8), 0, sizeof(int8_t) * KC); int32_t mc, nc; for (int32_t j = 0; j < n; j += NC) { nc = s_min(n - j, NC); - PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8); + PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, packedB_int8); for (int32_t i = 0; i < m; i += MC) { mc = s_min(m - i, MC); - PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8); + // PackMatrixA_6r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8); + PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, packedA_int8); if (bias == nullptr) { - InnerKernel(mc, nc, alpha, packedA_int8, packedB_int8, beta, - packedC_int32, &C(i, j), ldc, relu); - } - } - } - - 
paddle_mobile::memory::Free(packedA_int8); - paddle_mobile::memory::Free(packedB_int8); - paddle_mobile::memory::Free(packedC_int32); - paddle_mobile::memory::Free(zero_int8); -} - -// 8 bits int matrix product (m*k x k*n) -void Gemm::Sgemm(int32_t m, int32_t n, int32_t k, float alpha, const int8_t *A, - int32_t lda, const int8_t *B, int32_t ldb, float beta, - int8_t *C, int32_t ldc, bool relu, int32_t *bias) { - // L1 data cache is 32 kib (Per Contex-A57, Contex-A72, Contex-A73) - // L2 cache is 0.5~4 Mib (Contex-A72 cluster) - int32_t L1 = 32 * 1024; - int32_t L2 = 512 * 1024; - - const int32_t k_complete = (k + 15) - ((k + 15) & 15); - KC = k_complete; - MC = L1 / (KC * sizeof(int8_t)); - NC = L2 / (KC * sizeof(int8_t)); - - // make sure MC is multiple of MR_INT8, and NC is multiple of NR_INT8 - if (MC == 0) { - MC = MR_INT8; - } else { - int32_t mblock_num = (m + MC - 1) / MC; - MC = (m + mblock_num - 1) / mblock_num; - MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8; - } - // DLOG << "mblock_num = " << mblock_num << ", MC = " << MC << "\n"; - if (NC == 0) { - NC = NR_INT8; - } else { - int32_t nblock_num = (n + NC - 1) / NC; - NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8; - } - // DLOG << "nblock_num = " << nblock_num << ", NC = " << NC << "\n"; - packedA_int8 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC)); - packedB_int8 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC)); - packedC_int32 = static_cast( - paddle_mobile::memory::Alloc(sizeof(int32_t) * MC * NC)); - zero_int8 = - static_cast(paddle_mobile::memory::Alloc(sizeof(int8_t) * k)); - - memset(static_cast(zero_int8), 0, sizeof(int8_t) * k); - int32_t mc, nc; - for (int32_t j = 0; j < n; j += NC) { - nc = s_min(n - j, NC); - PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, packedB_int8); - for (int32_t i = 0; i < m; i += MC) { - mc = s_min(m - i, MC); - PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, packedA_int8); - if (bias != nullptr) { InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta, - packedC_int32, &C(i, j), ldc, relu, bias + i); + packedC_int8, &C(i, j), ldc, relu, nullptr); + } else { + InnerKernelWithBias(mc, nc, alpha, packedA_int8, packedB_int8, beta, + packedC_int8, &C(i, j), ldc, relu, bias + i); } } } paddle_mobile::memory::Free(packedA_int8); paddle_mobile::memory::Free(packedB_int8); - paddle_mobile::memory::Free(packedC_int32); + paddle_mobile::memory::Free(packedC_int8); paddle_mobile::memory::Free(zero_int8); } // 8 bits int write back -// C = A * B +// C = alpha * A * B + beta * C +void Gemm::WriteWithAlphaBeta(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc) {} +// C = A * B, 8位 int32_t void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t ldc) { #if __ARM_NEON @@ -1215,7 +802,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, int32_t step = sizeof(int32_t) * ldc; int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 4)); int32_t volatile m = mc; - int32_t volatile n = nc1; + int32_t *volatile c_ptr, *volatile C_ptr; int32_t *C0, *c0; c_ptr = c; @@ -1249,7 +836,7 @@ void Gemm::WriteBasic(int32_t mc, int32_t nc, int32_t *c, int32_t *C, "end_mc_%=: \n\t" : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), + : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(nc1), [step] "r"(step), [step1] "r"(step1) : "memory", "r5", "r6", "q0", "q1", "q2", "q3"); } @@ -1267,254 +854,20 @@ void Gemm::WriteBasic(int32_t mc, 
int32_t nc, int32_t *c, int32_t *C, #endif // __ARM_NEON } -// C = A * B + bias, scale * C -void Gemm::WriteWithAddScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, - int32_t ldc, int32_t *bias, float scale) { -#if __ARM_NEON -#if __aarch64__ -// TODO(wzzju) -#else - int32_t zero = 0; - int8_t narrow = -128; - int32_t nc1 = nc >> 3; - int32_t _nc1 = nc & 7; - int32_t step = sizeof(int8_t) * ldc; - int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3)); - int32_t volatile m = mc; - int32_t volatile n = nc1; - int32_t *volatile c_ptr, *volatile bias_ptr; - int8_t *volatile C_ptr; - c_ptr = c; - C_ptr = C; - bias_ptr = bias; - if (nc1 > 0) { - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "vdup.32 q15, %[scale] \n\t" - "vdup.32 q14, %[zero] \n\t" - "vdup.8 d24, %[narrow] \n\t" - "loop_mc_%=: \n\t" - "vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t" - "vdup.32 q13, d26[0] \n\t" - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vqadd.s32 q1, q1, q13 \n\t" - "vcvt.f32.s32 q2, q0 \n\t" - "vcvt.f32.s32 q3, q1 \n\t" - "vmul.f32 q2, q2, q15 \n\t" - "vmul.f32 q3, q3, q15 \n\t" - "vcvt.s32.f32 q4, q2 \n\t" - "vcvt.s32.f32 q5, q3 \n\t" - "vqmovn.s32 d12, q4 \n\t" - "vqmovn.s32 d13, q5 \n\t" - "vqmovn.s16 d14, q6 \n\t" - "vceq.s8 d15, d14, d24 \n\t" - "vsub.s8 d14, d14, d15 \n\t" - "vst1.8 {d14}, [r6]! \n\t" - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[C_ptr], %[C_ptr], %[step] \n\t" - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" +// C = A * B + C +void Gemm::WriteWithAdd(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc) {} - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), - [step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr), - [scale] "r"(scale), [zero] "r"(zero), [narrow] "r"(narrow) - : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q12", "q13", "q14", "q15"); - } +// C = A * B + bias +void Gemm::WriteWithAddV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc, int8_t *bias) {} +// C = A * B + C, relu(C) +void Gemm::WriteWithAddRelu(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc) {} - int32_t nc_left; - int32_t *c0; - int8_t *C0; - int32_t bias_v; - if (_nc1 != 0) { - for (int32_t i = 0; i < mc; i++) { - C0 = C_ptr + nc1 * 8 + i * ldc; - c0 = c_ptr + nc1 * 8 + i * NC; - bias_v = *(bias_ptr + i); - nc_left = _nc1; - asm volatile( - "vdup.32 q15, %[scale] \n\t" - "vdup.32 q14, %[zero] \n\t" - "vdup.8 d24, %[narrow] \n\t" - "vdup.32 q13, %[bias_v] \n\t" - "cmp %[_nc1], #4 \n\t" - "blt less_four_%= \n\t" - "vld1.32 {q0}, [%[c0]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vcvt.f32.s32 q1, q0 \n\t" - "vmul.f32 q1, q1, q15 \n\t" - "vcvt.s32.f32 q2, q1 \n\t" - "vqmovn.s32 d6, q2 \n\t" - "vqmovn.s16 d8, q3 \n\t" - "vceq.s8 d9, d8, d24 \n\t" - "vsub.s8 d8, d8, d9 \n\t" - "vst1.8 {d8[0]}, [%[C0]]! \n\t" - "vst1.8 {d8[1]}, [%[C0]]! \n\t" - "vst1.8 {d8[2]}, [%[C0]]! \n\t" - "vst1.8 {d8[3]}, [%[C0]]! \n\t" - "subs %[_nc1], %[_nc1], #4 \n\t" - "beq process_over_%= \n\t" - "less_four_%=: \n\t" - "vld1.32 {q0}, [%[c0]]! 
\n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vcvt.f32.s32 q1, q0 \n\t" - "vmul.f32 q1, q1, q15 \n\t" - "vcvt.s32.f32 q2, q1 \n\t" - "vqmovn.s32 d6, q2 \n\t" - "vqmovn.s16 d8, q3 \n\t" - "vceq.s8 d9, d8, d24 \n\t" - "vsub.s8 d8, d8, d9 \n\t" - "loop_save_%=: \n\t" - "vst1.8 {d8[0]}, [%[C0]]! \n\t" - "vext.8 d8, d8, d8, #1 \n\t" - "subs %[_nc1], %[_nc1], #1 \n\t" - "bgt loop_save_%= \n\t" - "process_over_%=: \n\t" - : - : [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), - [bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero), - [narrow] "r"(narrow) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q12", "q13", "q14", - "q15"); - } - } -#endif // __aarch64__ -#endif // __ARM_NEON -} - -// C = A * B + bias, scale * relu(C) -void Gemm::WriteWithAddReluScale(int32_t mc, int32_t nc, int32_t *c, int8_t *C, - int32_t ldc, int32_t *bias, float scale) { -#if __ARM_NEON -#if __aarch64__ -// TODO(wzzju) -#else - int32_t zero = 0; - int32_t nc1 = nc >> 3; - int32_t _nc1 = nc & 7; - int32_t step = sizeof(int8_t) * ldc; - int32_t step1 = sizeof(int32_t) * (NC - (nc1 << 3)); - int32_t volatile m = mc; - int32_t volatile n = nc1; - int32_t *volatile c_ptr, *volatile bias_ptr; - int8_t *volatile C_ptr; - c_ptr = c; - C_ptr = C; - bias_ptr = bias; - if (nc1 > 0) { - asm volatile( - "subs %[mc], %[mc], #1 \n\t" - "blt end_mc_%= \n\t" - "vdup.32 q15, %[scale] \n\t" - "vdup.32 q14, %[zero] \n\t" - "loop_mc_%=: \n\t" - "vld1.32 {d26[0]}, [%[bias_ptr]]!\n\t" - "vdup.32 q13, d26[0] \n\t" - "mov r6, %[C_ptr] \n\t" - "mov r5, %[nc1] \n\t" - "subs r5, r5, #1 \n\t" - "blt end_nc1_%= \n\t" - "loop_nc1_%=: \n\t" - "vld1.32 {q0, q1}, [%[c_ptr]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vqadd.s32 q1, q1, q13 \n\t" - "vmax.s32 q0, q0, q14 \n\t" - "vmax.s32 q1, q1, q14 \n\t" - "vcvt.f32.s32 q2, q0 \n\t" - "vcvt.f32.s32 q3, q1 \n\t" - "vmul.f32 q2, q2, q15 \n\t" - "vmul.f32 q3, q3, q15 \n\t" - "vcvt.s32.f32 q4, q2 \n\t" - "vcvt.s32.f32 q5, q3 \n\t" - "vqmovn.s32 d12, q4 \n\t" - "vqmovn.s32 d13, q5 \n\t" - "vqmovn.s16 d14, q6 \n\t" - "vst1.8 {d14}, [r6]! \n\t" - "subs r5, r5, #1 \n\t" - "bge loop_nc1_%= \n\t" - "end_nc1_%=: \n\t" - - "add %[C_ptr], %[C_ptr], %[step] \n\t" - "add %[c_ptr], %[c_ptr], %[step1] \n\t" - "subs %[mc], %[mc], #1 \n\t" - "bge loop_mc_%= \n\t" - "end_mc_%=: \n\t" - - : - : [C_ptr] "r"(C_ptr), [c_ptr] "r"(c_ptr), [mc] "r"(m), [nc1] "r"(n), - [step] "r"(step), [step1] "r"(step1), [bias_ptr] "r"(bias_ptr), - [scale] "r"(scale), [zero] "r"(zero) - : "cc", "memory", "r5", "r6", "q0", "q1", "q2", "q3", "q4", "q5", "q6", - "q7", "q13", "q14", "q15"); - } - - int32_t nc_left; - int32_t *c0; - int8_t *C0; - int32_t bias_v; - if (_nc1 != 0) { - for (int32_t i = 0; i < mc; i++) { - C0 = C_ptr + nc1 * 8 + i * ldc; - c0 = c_ptr + nc1 * 8 + i * NC; - bias_v = *(bias_ptr + i); - nc_left = _nc1; - asm volatile( - "vdup.32 q15, %[scale] \n\t" - "vdup.32 q14, %[zero] \n\t" - "vdup.32 q13, %[bias_v] \n\t" - "cmp %[_nc1], #4 \n\t" - "blt less_four_%= \n\t" - "vld1.32 {q0}, [%[c0]]! \n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vmax.s32 q0, q0, q14 \n\t" - "vcvt.f32.s32 q1, q0 \n\t" - "vmul.f32 q1, q1, q15 \n\t" - "vcvt.s32.f32 q2, q1 \n\t" - "vqmovn.s32 d6, q2 \n\t" - "vqmovn.s16 d8, q3 \n\t" - "vst1.8 {d8[0]}, [%[C0]]! \n\t" - "vst1.8 {d8[1]}, [%[C0]]! \n\t" - "vst1.8 {d8[2]}, [%[C0]]! \n\t" - "vst1.8 {d8[3]}, [%[C0]]! \n\t" - "subs %[_nc1], %[_nc1], #4 \n\t" - "beq process_over_%= \n\t" - "less_four_%=: \n\t" - "vld1.32 {q0}, [%[c0]]! 
\n\t" - "vqadd.s32 q0, q0, q13 \n\t" - "vmax.s32 q0, q0, q14 \n\t" - "vcvt.f32.s32 q1, q0 \n\t" - "vmul.f32 q1, q1, q15 \n\t" - "vcvt.s32.f32 q2, q1 \n\t" - "vqmovn.s32 d6, q2 \n\t" - "vqmovn.s16 d8, q3 \n\t" - "loop_save_%=: \n\t" - "vst1.8 {d8[0]}, [%[C0]]! \n\t" - "vext.8 d8, d8, d8, #1 \n\t" - "subs %[_nc1], %[_nc1], #1 \n\t" - "bgt loop_save_%= \n\t" - "process_over_%=: \n\t" - : - : [_nc1] "r"(nc_left), [C0] "r"(C0), [c0] "r"(c0), - [bias_v] "r"(bias_v), [scale] "r"(scale), [zero] "r"(zero) - : "cc", "memory", "q0", "q1", "q2", "q3", "q4", "q13", "q14", "q15"); - } - } -#endif // __aarch64__ -#endif // __ARM_NEON -} +// C = A * B + bias, relu(C) +void Gemm::WriteWithAddReluV1(int32_t mc, int32_t nc, int32_t *c, int32_t *C, + int32_t ldc, int8_t *bias) {} } // namespace math } // namespace operators diff --git a/src/operators/math/gemm_omp_int8.cpp b/src/operators/math/gemm_omp_int8.cpp index d4d4c294934191ba6717716486bf857477d73b55..21256cccfcc6dcc647f34a2129616b70804d398f 100644 --- a/src/operators/math/gemm_omp_int8.cpp +++ b/src/operators/math/gemm_omp_int8.cpp @@ -28,10 +28,10 @@ namespace operators { namespace math { // 8 bits int matrix product (m*k x k*n) -void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, +void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, int8_t alpha, const int8_t *A, int32_t lda, const int8_t *B, int32_t ldb, - float beta, int32_t *C, int32_t ldc, bool relu, - int32_t *bias) { + int8_t beta, int32_t *C, int32_t ldc, bool relu, + int8_t *bias) { #ifdef _OPENMP int32_t max_threads = omp_get_max_threads(); #else @@ -39,11 +39,10 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, #endif int32_t L1 = 64 / max_threads * 1024; - const int32_t k_complete = (k + 15) - ((k + 15) & 15); - KC = k_complete; + KC = k; zero_int8 = - static_cast(paddle_mobile::memory::Alloc(sizeof(int8_t) * k)); - memset(static_cast(zero_int8), 0, sizeof(int8_t) * k); + static_cast(paddle_mobile::memory::Alloc(sizeof(int8_t) * KC)); + memset(static_cast(zero_int8), 0, sizeof(int8_t) * KC); if (m > n) { // 对 A 分块 MC = L1 / (KC * sizeof(int8_t)); @@ -55,14 +54,14 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, MC = (MC + MR_INT8 - 1) / MR_INT8 * MR_INT8; } // 补齐 B - NC = (n + NR_INT8 - 1) / NR_INT8 * NR_INT8; + NC = (n + NR - 1) / NR * NR; packedB_int8 = static_cast( paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC)); #if __aarch64__ // TODO(wzzju) #else - PackMatrixB_omp_2c_16(k, n, n % NR_INT8, B, ldb, packedB_int8); + PackMatrixB_omp_8c(KC, n, n % NR, B, ldb, packedB_int8); #endif packedA_int8 = static_cast( paddle_mobile::memory::Alloc(sizeof(int8_t) * MC * KC * max_threads)); @@ -70,11 +69,11 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, // 对 B 分块 NC = L1 / (KC * sizeof(int8_t)); if (NC == 0) { - NC = NR_INT8; + NC = NR; } else { int32_t nblock_num = (n + NC - 1) / NC; NC = (n + nblock_num - 1) / nblock_num; - NC = (NC + NR_INT8 - 1) / NR_INT8 * NR_INT8; + NC = (NC + NR - 1) / NR * NR; } // 补齐 A MC = (m + MR_INT8 - 1) / MR_INT8 * MR_INT8; @@ -84,12 +83,12 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, #if __aarch64__ // TODO(wzzju) #else - PackMatrixA_omp_4r_16(m, k, m % MR_INT8, A, lda, packedA_int8); + PackMatrixA_omp_4r(m, KC, m % MR_INT8, A, lda, packedA_int8); #endif packedB_int8 = static_cast( paddle_mobile::memory::Alloc(sizeof(int8_t) * KC * NC * max_threads)); } - packedC_int32 = static_cast( + packedC_int8 = static_cast( paddle_mobile::memory::Alloc(sizeof(int32_t) 
* MC * NC * max_threads)); if (m > n) { @@ -104,19 +103,14 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, int32_t mc; mc = s_min(m - i, MC); int8_t *local_A = packedA_int8 + MC * KC * local_threads; - int32_t *local_C = packedC_int32 + MC * NC * local_threads; + int32_t *local_C = packedC_int8 + MC * NC * local_threads; #if __aarch64__ // TODO(wzzju) #else - PackMatrixA_4r_16(mc, k, mc % MR_INT8, &A(i, 0), lda, local_A); + PackMatrixA_4r(mc, KC, mc % MR_INT8, &A(i, 0), lda, local_A); #endif - // InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, - // local_C, - // &C(i, 0), ldc, relu, bias + i); - if (bias == nullptr) { - InnerKernel(mc, n, alpha, local_A, packedB_int8, beta, local_C, - &C(i, 0), ldc, relu); - } + InnerKernelWithBias(mc, n, alpha, local_A, packedB_int8, beta, local_C, + &C(i, 0), ldc, relu, bias + i); } } else { #pragma omp parallel for @@ -129,25 +123,20 @@ void Gemm::Sgemm_omp(int32_t m, int32_t n, int32_t k, float alpha, int32_t nc; nc = s_min(n - j, NC); int8_t *local_B = packedB_int8 + KC * NC * local_threads; - int32_t *local_C = packedC_int32 + MC * NC * local_threads; + int32_t *local_C = packedC_int8 + MC * NC * local_threads; #if __aarch64__ // TODO(wzzju) #else - PackMatrixB_2c_16(k, nc, nc % NR_INT8, &B(0, j), ldb, local_B); + PackMatrixB_8c(KC, nc, nc % NR, &B(0, j), ldb, local_B); #endif - // InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, - // local_C, - // &C(0, j), ldc, relu, bias); - if (bias == nullptr) { - InnerKernel(m, nc, alpha, packedA_int8, local_B, beta, local_C, - &C(0, j), ldc, relu); - } + InnerKernelWithBias(m, nc, alpha, packedA_int8, local_B, beta, local_C, + &C(0, j), ldc, relu, bias); } } paddle_mobile::memory::Free(packedA_int8); paddle_mobile::memory::Free(packedB_int8); - paddle_mobile::memory::Free(packedC_int32); + paddle_mobile::memory::Free(packedC_int8); paddle_mobile::memory::Free(zero_int8); } @@ -155,7 +144,7 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, const int8_t *B, int32_t ldb, int8_t *buffer) { const int32_t j_length = n - n_tail; #pragma omp parallel for - for (int32_t j = 0; j < j_length; j += 8) { + for (int32_t j = 0; j < j_length; j += NR) { int8_t *local_buffer = buffer + j * k; for (int32_t i = 0; i < k; ++i) { const int8_t *b0 = &B(i, j); @@ -190,7 +179,7 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, for (int32_t j = j_length; j < n; ++j) { *local_buffer++ = *b0++; } - for (int32_t j = n; j < j_length + 8; ++j) { + for (int32_t j = n; j < j_length + NR; ++j) { *local_buffer++ = 0; } } @@ -199,9 +188,9 @@ void Gemm::PackMatrixB_omp_8c(int32_t k, int32_t n, int32_t n_tail, void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, const int8_t *A, int32_t lda, int8_t *buffer) { - const int32_t i_length = m - m_tail; + const int i_length = m - m_tail; #pragma omp parallel for - for (int32_t i = 0; i < i_length; i += 4) { + for (int32_t i = 0; i < i_length; i += MR_INT8) { const int8_t *a0 = A + i * lda; const int8_t *a1 = A + (i + 1) * lda; const int8_t *a2 = A + (i + 2) * lda; @@ -232,7 +221,7 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, default: break; } - for (int32_t j = 0; j < k; ++j) { + for (int j = 0; j < k; ++j) { *local_buffer++ = *a0++; *local_buffer++ = *a1++; *local_buffer++ = *a2++; @@ -241,232 +230,6 @@ void Gemm::PackMatrixA_omp_4r(int32_t m, int32_t k, int32_t m_tail, } } -// 8 bits int PackMatrixA_4r -void Gemm::PackMatrixA_omp_4r_16(int32_t m, int32_t k, int32_t 
m_tail, - const int8_t *A, int32_t lda, int8_t *buffer) { - const int32_t i_length = m - m_tail; - const int32_t k_count = k >> 4; - const int32_t k_tail = k & 15; -#pragma omp parallel for - for (int32_t i = 0; i < i_length; i += 4) { - const int8_t *a0 = A + i * lda; - const int8_t *a1 = A + (i + 1) * lda; - const int8_t *a2 = A + (i + 2) * lda; - const int8_t *a3 = A + (i + 3) * lda; - int8_t *local_buffer = buffer + i * KC; - for (int32_t j = 0; j < k_count; ++j) { -#if __ARM_NEON -#if __aarch64__ - // TODO(wzzju) -#else - asm volatile( - "vld1.s8 {d0, d1}, [%[a0]]! \n\t" - "vld1.s8 {d2, d3}, [%[a1]]! \n\t" - "vld1.s8 {d4, d5}, [%[a2]]! \n\t" - "vld1.s8 {d6, d7}, [%[a3]]! \n\t" - "vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t" - "vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t" - "vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t" - "vst1.s8 {d6, d7}, [%[local_buffer]]! \n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "q0", "q1", "q2", "q3"); -#endif // __aarch64__ -#else - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a0++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a1++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a2++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a3++; - } -#endif // __ARM_NEON - } - if (k_tail != 0) { - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a0++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a1++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a2++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a3++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } - - if (m_tail != 0) { - const int8_t *a0 = &A(i_length, 0); - const int8_t *a1 = a0 + lda; - const int8_t *a2 = a0 + 2 * lda; - const int8_t *a3 = a0 + 3 * lda; - int8_t *local_buffer = buffer + i_length * KC; - switch (m_tail) { - case 1: - a1 = zero_int8; - case 2: - a2 = zero_int8; - case 3: - a3 = zero_int8; - break; - default: - break; - } - for (int32_t j = 0; j < k_count; ++j) { -#if __ARM_NEON -#if __aarch64__ - // TODO(wzzju) -#else - asm volatile( - "vld1.s8 {d0, d1}, [%[a0]]! \n\t" - "vld1.s8 {d2, d3}, [%[a1]]! \n\t" - "vld1.s8 {d4, d5}, [%[a2]]! \n\t" - "vld1.s8 {d6, d7}, [%[a3]]! \n\t" - "vst1.s8 {d0, d1}, [%[local_buffer]]! \n\t" - "vst1.s8 {d2, d3}, [%[local_buffer]]! \n\t" - "vst1.s8 {d4, d5}, [%[local_buffer]]! \n\t" - "vst1.s8 {d6, d7}, [%[local_buffer]]! 
\n\t" - : [local_buffer] "+r"(local_buffer), [a0] "+r"(a0), [a1] "+r"(a1), - [a2] "+r"(a2), [a3] "+r"(a3) - : - : "memory", "q0", "q1", "q2", "q3"); -#endif // __aarch64__ -#else - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a0++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a1++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a2++; - } - for (int32_t l = 0; l < 16; ++l) { - *local_buffer++ = *a3++; - } -#endif // __ARM_NEON - } - if (k_tail != 0) { - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a0++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a1++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a2++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *a3++; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } -} - -// 8 bits int PackMatrixB -void Gemm::PackMatrixB_omp_2c_16(int32_t k, int32_t n, int32_t n_tail, - const int8_t *B, int32_t ldb, int8_t *buffer) { - const int32_t j_length = n - n_tail; - const int32_t k_count = k >> 4; - const int32_t k_tail = k & 15; -#pragma omp parallel for - for (int32_t j = 0; j < j_length; j += 2) { - int8_t *local_buffer = buffer + j * KC; - for (int32_t i = 0; i < k_count; ++i) { - const int8_t *b0 = &B((i << 4), j); - const int8_t *b1 = &B((i << 4), j + 1); - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b1; - b1 += ldb; - } - } - if (k_tail != 0) { - const int8_t *b0 = &B((k_count << 4), j); - const int8_t *b1 = &B((k_count << 4), j + 1); - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b1; - b1 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } - if (n_tail != 0) { - int8_t *local_buffer = buffer + j_length * KC; - for (int32_t i = 0; i < k_count; ++i) { - const int8_t *b0 = &B((i << 4), j_length); - for (int m = 0; m < 16; ++m) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int m = 0; m < 16; ++m) { - *local_buffer++ = 0; - } - } - if (k_tail != 0) { - const int8_t *b0 = &B((k_count << 4), j_length); - for (int32_t j = k_count << 4; j < k; ++j) { - *local_buffer++ = *b0; - b0 += ldb; - } - for (int32_t j = k; j < KC; ++j) { - *local_buffer++ = 0; - } - for (int32_t j = k_count << 4; j < KC; ++j) { - *local_buffer++ = 0; - } - } - } -} - } // namespace math } // namespace operators } // namespace paddle_mobile diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h index 9661b2d4c22ed49ef0c078fac0872c7643057430..b91242c1868398e4541c3727567a905e5b0c8714 100644 --- a/src/operators/math/math_function.h +++ b/src/operators/math/math_function.h @@ -28,12 +28,7 @@ template void matmul(const framework::Tensor &matrix_a, bool trans_a, const framework::Tensor &matrix_b, bool trans_b, T alpha, framework::Tensor *matrix_out, T beta, bool relu = false, - float *bias = nullptr); - -void matmul_int8(const framework::Tensor &matrix_a, bool trans_a, - const framework::Tensor &matrix_b, bool trans_b, float alpha, - framework::Tensor *matrix_out, float beta, bool relu = false, - int32_t *bias = 
diff --git a/src/operators/math/math_function.h b/src/operators/math/math_function.h
index 9661b2d4c22ed49ef0c078fac0872c7643057430..b91242c1868398e4541c3727567a905e5b0c8714 100644
--- a/src/operators/math/math_function.h
+++ b/src/operators/math/math_function.h
@@ -28,12 +28,7 @@ template <typename T>
 void matmul(const framework::Tensor &matrix_a, bool trans_a,
             const framework::Tensor &matrix_b, bool trans_b, T alpha,
             framework::Tensor *matrix_out, T beta, bool relu = false,
-            float *bias = nullptr);
-
-void matmul_int8(const framework::Tensor &matrix_a, bool trans_a,
-                 const framework::Tensor &matrix_b, bool trans_b, float alpha,
-                 framework::Tensor *matrix_out, float beta, bool relu = false,
-                 int32_t *bias = nullptr);
+            T *bias = nullptr);
 
 template <typename T>
 void matmulWithBn(const framework::Tensor &matrix_a, bool trans_a,
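Folding `matmul_int8` into the `matmul<T>` template means callers now select the int8 path purely by template argument, with the bias pointer typed as `T *`. A self-contained toy mirror of the pattern (hypothetical signatures; the real functions take Tensors):

```cpp
#include <cstdint>
#include <iostream>

// Primary template declared once, as in math_function.h ...
template <typename T>
void matmul(T alpha, T beta);

// ... with an explicit specialization supplied in a separate translation
// unit, as math_function_int8.cpp now does for the real matmul.
template <>
void matmul<int8_t>(int8_t alpha, int8_t beta) {
  std::cout << "int8 path: alpha=" << int{alpha} << " beta=" << int{beta}
            << "\n";
}

int main() {
  matmul<int8_t>(1, 0);  // resolves to the int8 specialization
  return 0;
}
```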
diff --git a/src/operators/math/math_function_int8.cpp b/src/operators/math/math_function_int8.cpp
index ba0e5578cd32ff45620ddaa6feda9b31b2bcd68e..e02824b290ebc0080613e2ae2365626d79576c9e 100644
--- a/src/operators/math/math_function_int8.cpp
+++ b/src/operators/math/math_function_int8.cpp
@@ -20,10 +20,11 @@ limitations under the License. */
 namespace paddle_mobile {
 namespace operators {
 namespace math {
-void matmul_int8(const framework::Tensor &matrix_a, bool trans_a,
-                 const framework::Tensor &matrix_b, bool trans_b, float alpha,
-                 framework::Tensor *matrix_out, float beta, bool relu,
-                 int32_t *bias) {
+template <>
+void matmul<int8_t>(const framework::Tensor &matrix_a, bool trans_a,
+                    const framework::Tensor &matrix_b, bool trans_b,
+                    int8_t alpha, framework::Tensor *matrix_out, int8_t beta,
+                    bool relu, int8_t *bias) {
   auto dim_a = matrix_a.dims();
   auto dim_b = matrix_b.dims();
   auto dim_out = matrix_out->dims();
@@ -51,45 +52,21 @@ void matmul_int8(const framework::Tensor &matrix_a, bool trans_a,
   }
 
 #ifdef _OPENMP
-  if (bias != nullptr) {
-    // TODO(wzzju): gemm.Sgemm_omp_with_bias, now use single thread instead.
-    gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
-               matrix_out->data<int32_t>(), N, relu, bias);
-  } else {
-    gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
-                   matrix_out->data<int32_t>(), N, relu, bias);
-  }
+  gemm.Sgemm_omp(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
+                 matrix_out->data<int32_t>(), N, relu, bias);
 #else
-  if (bias != nullptr) {
-    gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
-               matrix_out->data<int32_t>(), N, relu, bias);
-  } else {
-    gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
-               matrix_out->data<int32_t>(), N, relu, bias);
-  }
+  gemm.Sgemm(M, N, K, alpha, a, K, matrix_b.data<int8_t>(), N, beta,
+             matrix_out->data<int32_t>(), N, relu, bias);
 #endif
   } else {
 #ifdef _OPENMP
-    if (bias != nullptr) {
-      // TODO(wzzju): gemm.Sgemm_omp_with_bias, now use single thread instead.
-      gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
-                 matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(),
-                 N, relu, bias);
-    } else {
-      gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
-                     matrix_b.data<int8_t>(), N, beta,
-                     matrix_out->data<int32_t>(), N, relu, bias);
-    }
+    gemm.Sgemm_omp(M, N, K, alpha, matrix_a.data<int8_t>(), K,
+                   matrix_b.data<int8_t>(), N, beta,
+                   matrix_out->data<int32_t>(), N, relu, bias);
 #else
-    if (bias != nullptr) {
-      gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
-                 matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(),
-                 N, relu, bias);
-    } else {
-      gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
-                 matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(),
-                 N, relu, bias);
-    }
+    gemm.Sgemm(M, N, K, alpha, matrix_a.data<int8_t>(), K,
+               matrix_b.data<int8_t>(), N, beta, matrix_out->data<int32_t>(), N,
+               relu, bias);
 #endif
   }
 }
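The specialization keeps int8 inputs with 32-bit accumulation, and bias and relu now flow straight through to `Sgemm`/`Sgemm_omp` instead of being special-cased at the call site. As a reference for the intended semantics only (the library computes the same result through packed panels and NEON inner kernels, and details such as where bias is applied are an assumption here), a naive version:

```cpp
#include <algorithm>
#include <cstdint>

// Naive reference for the int8 path: int8 A (m x k) and B (k x n), 32-bit
// accumulator C, optional per-row bias and relu. Row-major, no packing.
void gemm_int8_ref(int32_t m, int32_t n, int32_t k, const int8_t *A,
                   const int8_t *B, int32_t *C, const int8_t *bias,
                   bool relu) {
  for (int32_t i = 0; i < m; ++i) {
    for (int32_t j = 0; j < n; ++j) {
      int32_t acc = 0;
      for (int32_t p = 0; p < k; ++p) {
        acc += int32_t{A[i * k + p]} * int32_t{B[p * n + j]};
      }
      if (bias != nullptr) acc += bias[i];  // per-row, matching bias + i above
      if (relu) acc = std::max<int32_t>(acc, 0);
      C[i * n + j] = acc;
    }
  }
}
```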
diff --git a/src/operators/op_param.h b/src/operators/op_param.h
index c00369cec7a17ef742420d20bcad786665992136..607576e389c69c296f9b1721a632738425a5f7ae 100644
--- a/src/operators/op_param.h
+++ b/src/operators/op_param.h
@@ -1633,11 +1633,11 @@ class FusionFcParam : public OpParam {
     y_num_col_dims_ = GetAttr<int>("y_num_col_dims", attrs);
     axis_ = GetAttr<int>("axis", attrs);
   }
-  const GType *InputX() const { return input_x_; }
+  GType *InputX() const { return input_x_; }
 
-  const RType *InputY() const { return input_y_; }
+  RType *InputY() const { return input_y_; }
 
-  const RType *InputZ() const { return input_z_; }
+  RType *InputZ() const { return input_z_; }
 
   GType *Out() const { return out_; }
 
diff --git a/test/common/test_gemm_perf.cpp b/test/common/test_gemm_perf.cpp
index 5ca0b40cfcb20786ad69d1bbfbaca103b3e426e3..14da4ba284b5ac7b0660bd15de871fdf5ed04cdd 100644
--- a/test/common/test_gemm_perf.cpp
+++ b/test/common/test_gemm_perf.cpp
@@ -28,7 +28,7 @@ limitations under the License. */
 
 int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
-  paddle_mobile.SetThreadNum(4);
+  paddle_mobile.SetThreadNum(8);
   Tensor aa, bb, cc;
   auto aaptr = aa.mutable_data<float>({m, k});
   auto bbptr = bb.mutable_data<float>({k, n});
@@ -44,12 +44,10 @@ int main() {
     ccptr[i] = 2;
   }
 
-  Tensor aa_int8, bb_int8, cc_int32, cc_int8;
+  Tensor aa_int8, bb_int8, cc_int8;
   auto aaptr_int8 = aa_int8.mutable_data<int8_t>({m, k});
   auto bbptr_int8 = bb_int8.mutable_data<int8_t>({k, n});
-  auto ccptr_int32 = cc_int32.mutable_data<int32_t>({m, n});
-  auto ccptr_int8 = cc_int8.mutable_data<int8_t>({m, n});
-  int32_t* bias_data = new int32_t[m];
+  auto ccptr_int8 = cc_int8.mutable_data<int32_t>({m, n});
 
   for (int i = 0; i < m * k; ++i) {
     aaptr_int8[i] = static_cast<int8_t>(2);
@@ -58,11 +56,7 @@ int main() {
     bbptr_int8[i] = static_cast<int8_t>(2);
   }
   for (int i = 0; i < m * n; ++i) {
-    ccptr_int32[i] = static_cast<int32_t>(2);
-  }
-
-  for (int i = 0; i < m; ++i) {
-    bias_data[i] = 2;
+    ccptr_int8[i] = static_cast<int32_t>(2);
   }
 
   // float
@@ -82,41 +76,22 @@ int main() {
   auto time2 = time();
   std::cout << "float gemm cost :" << time_diff(time1, time2) / 10 << "ms\n";
 
-  // int8_t without bias
+  // int8_t
   // warm-up 10 times
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul_int8(
-        aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
-        static_cast<float>(0), false, nullptr);
+    paddle_mobile::operators::math::matmul<int8_t>(
+        aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
+        static_cast<int8_t>(0), false, nullptr);
   }
   auto time3 = time();
   for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul_int8(
-        aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int32,
-        static_cast<float>(0), false, nullptr);
+    paddle_mobile::operators::math::matmul<int8_t>(
+        aa_int8, false, bb_int8, false, static_cast<int8_t>(1), &cc_int8,
+        static_cast<int8_t>(0), false, nullptr);
   }
   auto time4 = time();
   std::cout << "int8_t gemm cost :" << time_diff(time3, time4) / 10 << "ms\n";
 
-  // int8_t with bias&relu
-  // warm-up 10 times
-  for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul_int8(
-        aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int8,
-        static_cast<float>(0), true, &bias_data[0]);
-  }
-  auto time5 = time();
-  for (int j = 0; j < 10; ++j) {
-    paddle_mobile::operators::math::matmul_int8(
-        aa_int8, false, bb_int8, false, static_cast<float>(1), &cc_int8,
-        static_cast<float>(0), true, &bias_data[0]);
-  }
-  auto time6 = time();
-  std::cout << "int8_t gemm_with_bias_relu cost :"
-            << time_diff(time5, time6) / 10 << "ms\n";
-
-  delete[] bias_data;
-
   return 0;
 }
diff --git a/test/fpga/test_resnet50.cpp b/test/fpga/test_resnet50.cpp
index 4d05328179fa2acc771e08a6dfddea4f770d9780..1a5daafe2b784b98b102fa2eab04f71c67260d9c 100644
--- a/test/fpga/test_resnet50.cpp
+++ b/test/fpga/test_resnet50.cpp
@@ -12,6 +12,8 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
 See the License for the specific language governing permissions and
 limitations under the License. */
 #include <fstream>
+#include <iostream>
+#include <string>
 #include "../test_include.h"
 #ifdef PADDLE_MOBILE_FPGA_V1
@@ -87,26 +89,29 @@ int main() {
   paddle_mobile::PaddleMobile<paddle_mobile::FPGA> paddle_mobile;
   if (paddle_mobile.Load(std::string(g_resnet50), true)) {
     Tensor input_tensor;
-    SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(0),
-                       static_cast<float>(1));
+    SetupTensor<float>(&input_tensor, {1, 3, 224, 224}, static_cast<float>(2),
+                       static_cast<float>(2));
     readStream(g_image_src_float,
                input_tensor.mutable_data<float>({1, 3, 224, 224}));
     paddle_mobile.FeedData(input_tensor);
     paddle_mobile.Predict_To(-1);
-    /*for(int i = 0; i < 73; i++)
-    {
+    for (int i = 0; i < 73; i++) {
       auto tensor_ptr = paddle_mobile.FetchResult(i);
-      std::string saveName = "resnet50_result_" + std::to_string (i);
+      std::string saveName = "resnet50_result_" + std::to_string(i);
       paddle_mobile::fpga::fpga_invalidate((*tensor_ptr).data<float>(),
-      tensor_ptr->numel()); dump_stride(saveName, (*tensor_ptr), 20);
-      //dump(saveName, (*tensor_ptr));
-    }*/
+                                           tensor_ptr->numel() * sizeof(half));
+      dump_stride(saveName, (*tensor_ptr), 20);
+      // dump(saveName, (*tensor_ptr));
+    }
 
-    /*std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(73);
-    (*output_tensor).dump("resnet50_result_73");
+    std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(73);
+    //(*output_tensor).dump("resnet50_result_73");
     output_tensor = paddle_mobile.FetchResult(74);
-    (*output_tensor).dump("resnet50_result_74");*/
-    std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(74);
+    //(*output_tensor).dump("resnet50_result_74");
+    // std::shared_ptr<Tensor> output_tensor = paddle_mobile.FetchResult(74);
+
+    // output_tensor = paddle_mobile.FetchResult(74);
+
     float max = 0;
     auto data_ptr = output_tensor->data<float>();
     int maximumIdx = 0;
@@ -116,7 +121,7 @@ int main() {
         max = data_ptr[i];
       }
     }
-    std::cout << "index : " << maximumIdx << ", value : " << max
+    std::cout << "index : " << std::dec << maximumIdx << ", value : " << max
               << std::endl;
     std::cout << "Computation done" << std::endl;
     return 0;
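Incidentally, the hand-rolled argmax loop at the end of the test could be expressed with `std::max_element`. A standalone equivalent, where `scores` stands in for the fetched output tensor's float data:

```cpp
#include <algorithm>
#include <iostream>
#include <vector>

int main() {
  // `scores` plays the role of output_tensor->data<float>() in the test.
  std::vector<float> scores = {0.1f, 0.7f, 0.2f};
  auto max_it = std::max_element(scores.begin(), scores.end());
  std::cout << "index : " << (max_it - scores.begin())
            << ", value : " << *max_it << std::endl;
  return 0;
}
```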