提交 6b14134f 编写于 作者: Z zhangyang

Add deconv op for V1 on the FPGA track

上级 467bbfe7
......@@ -196,19 +196,35 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
arg->conv_arg[i].filter_scale_address = filter->scale;
arg->conv_arg[i].filter_address = &(
(int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; // NOLINT
arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
// arg->conv_arg[i].filter_address = &(
// (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; //
// NOLINT
// arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].filter_num = (uint32_t)(
i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT
: filter_num_per_div);
size_t filter_size =
element_num * arg->conv_arg[i].filter_num * sizeof(int8_t);
auto filter_head =
&((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size);
size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float);
auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
fpga_flush(arg->conv_arg[i].sb_address, bs_size);
if (n > 1) {
arg->conv_arg[i].output.scale_address =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_arg[i].output.address =
fpga_malloc(input->dims()[2] *
align_to_x(input->dims()[3] * arg->conv_arg[i].filter_num,
fpga_malloc(out->dims()[2] *
align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num,
IMAGE_ALIGNMENT) *
sizeof(half));
} else {
......@@ -221,6 +237,8 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
}
filter->reset_data_ptr(nullptr);
fpga_free(bs_ptr);
}
} // namespace fpga
......
......@@ -137,24 +137,23 @@ void align_num(char **data_in, int num_per_div_before_alignment, int num,
int align_chw = align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
if (num_per_div_after_alignment != num_per_div_before_alignment) {
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
memset(data_tmp, 0, num_element * sizeof(char));
char *tmp = *data_in;
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_element = div_num * num_per_div_after_alignment * align_chw;
char *data_tmp = (char *)fpga_malloc(num_element * sizeof(char)); // NOLINT
for (i = 0; i < div_num; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
memset(data_tmp, 0, num_element * sizeof(char));
*data_in = data_tmp;
fpga_free(tmp);
for (i = 0; i < div_num; i++) {
memcpy(data_tmp + num_per_div_after_alignment * align_chw * i,
*data_in + num_per_div_before_alignment * align_chw * i,
num_per_div_before_alignment * align_chw);
}
*data_in = data_tmp;
fpga_free(tmp);
}
void reorder(char **data_in, int num_after_alignment, int chw) {
......@@ -223,7 +222,10 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw);
align_num(quantize_data, num_per_div_before_alignment, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
......@@ -254,15 +256,18 @@ void format_fc_filter(float **data_in, int num, int channel, int height,
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment * div_num;
int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT
convert_fc_filter(quantize_data, num, chw);
align_element(quantize_data, num, chw);
align_num(quantize_data, num_per_div_before_alignment, num, chw);
if (num_after_alignment != num) {
align_num(quantize_data, num_per_div_before_alignment, num, chw);
}
reorder(quantize_data, num_after_alignment, chw);
interleave(quantize_data, num_after_alignment, chw);
fpga_flush(*quantize_data, align_to_x(chw, FILTER_ELEMENT_ALIGNMENT) *
......
......@@ -13,16 +13,172 @@ See the License for the specific language governing permissions and
limitations under the License. */
#include "fpga/common/pe.h"
#include <unistd.h>
#include <iomanip>
#include <iostream>
#include "common/types.h"
#include "fpga/V1/filter.h"
#include "fpga/V1/image.h"
#include "fpga/common/config.h"
#include "fpga/common/driver.h"
using namespace std;
using namespace paddle_mobile::fpga::driver; // NOLINT
namespace paddle_mobile {
namespace fpga {
// Alignment granularities handed to align_to_x() by the compute routines in
// this file (image row lengths, per-filter element counts, filter counts).
#define IMAGE_ALIGN 16
#define FILTER_ALIGN 16
#define FILTER_NUM_ALIGN 32
// Flag bits OR-ed into the command word that is written to a PE's CMD
// register (see the `cmd` locals in the conv / element-wise routines).
#define USE_RELU 1
#define USE_BIAS 2
// Bypass command codes written to REG_CONVERT_CMD; PerformBypass picks one
// from the (input_data_type, output_data_type) pair.
#define CMD_FP16_TO_FP16 0
#define CMD_FP16_TO_FP32 1
#define CMD_FP32_TO_FP16 2
#define CMD_FP32_TO_FP32 3
// Per-element sizes PerformBypass stores in data_cell_in / data_cell_out
// (presumably bytes per FP16 / FP32 element).
#define SIZE_FP16 2
#define SIZE_FP32 4
// Budget passed as the `time` argument of fpga_regpoll() while waiting for a
// PE's interrupt bit.
#define PE_IRQ_TIMEOUT 1000000
/* Interrupt bits: the value fpga_regpoll() compares REG_INTERRUPT against
 * after a command has been issued to the corresponding PE. */
#define INTERRUPT_RSVD 0x0001
#define INTERRUPT_BYPASS 0x0002
#define INTERRUPT_CONV 0x0004
#define INTERRUPT_POOLING 0x0008
// NOTE(review): ComputeFpgaEWAdd polls INTERRUPT_POOLING rather than
// INTERRUPT_EW in the visible code -- confirm which bit the hardware raises.
#define INTERRUPT_EW 0x0010
//#define INTERRUPT_RESIZE 0x0020
/* Register offsets. reg_readq()/reg_writeq() add these to
 * g_fpgainfo.FpgaRegVirAddr and perform a 64-bit volatile access. */
#define REG_INTERRUPT 0x000
#define REG_VERSION 0x008
#define REG_TEMPERATURE 0x010
#define REG_FPGA_RESET 0x018
#define REG_TEST_REGISTER 0x048
#define REG_HARDWARE_STATUS 0x050
#define REG_TIMER_COUNTER 0x070
// Written before and read back after every PE operation; the read-back value
// has its 32-bit halves swapped and is copied to output.scale_address.
#define REG_SCALE_PARAMETER 0x080
#define REG_FLASH_CMD 0x200
#define REG_FLASH_DATA 0x208
#define REG_FLASH_CONFIG 0x210
#define REG_FLASH_STATUS 0x218
#define REG_SN 0x220
//#define REG_READ_SCALE
//#define REG_WRITE_SCALE
/* Bypass (data type conversion) registers, programmed by PerformBypass. */
#define REG_CONVERT_CMD 0x400
#define REG_CONVERT_SRC_ADDR 0x408
#define REG_CONVERT_DST_ADDR 0x410
#define REG_CONVERT_LENGTH 0x418
/* Resize registers. NOTE(review): not referenced by any routine visible in
 * this change. */
#define REG_RESIZE_CMD 0x600
#define REG_RESIZE_CHANNEL_NUMBER 0x608
#define REG_RESIZE_INPUT_IMAGE_PIXEL 0x610
#define REG_RESIZE_OUTPUT_IMAGE_PIXEL 0x618
#define REG_RESIZE_INPUT_BASE_ADDR 0x620
#define REG_RESIZE_WEIGHT_BASE_ADDR 0x628
#define REG_RESIZE_SRC_POS_BASE_ADDR 0x630
#define REG_RESIZE_OUTPUT_BASE_ADDR 0x638
/* Pooling registers, programmed by ComputeFpgaPool. */
#define REG_POOLING_CMD 0x800
#define REG_POOLING_IMAGE_BASE_ADDR 0x808
#define REG_POOLING_RESULT_BASE_ADDR 0x810
#define REG_POOLING_IMAGE_PIXEL 0x818
#define REG_POOLING_WINDOW_SIZE 0x820
#define REG_POOLING_RESULT_PIXEL 0x828
#define REG_POOLING_PAD_PIXEL 0x830
#define REG_POOLING_STEP_PIXEL 0x838
#define REG_POOLING_CHANNEL_NUMBER 0x840
#define REG_POOLING_IMAGE_AMOUNT_PER_ROW 0x848
#define REG_POOLING_IMAGE_ONE_PAD_PER_ROW 0x850
#define REG_POOLING_IMAGE_TWO_PAD_PER_ROW 0x858
#define REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT 0x860
#define REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT 0x868
#define REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT 0x870
#define REG_POOLING_RESULT_AMOUNT_ALIGN_32 0x878
#define REG_POOLING_RESULT_AMOUNT_ALIGN_64 0x880
#define REG_POOLING_IMAGE_CALCU_HEIGHT 0x888
// The next two entries are listed out of address order (0x898 before 0x890).
#define REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW 0x898
#define REG_POOLING_MODE_RECIPROCAL 0x890
/* Convolution registers, programmed by ComputeBasicConv. */
#define REG_CONV_CMD 0xC00
#define REG_CONV_IMAGE_BASE_ADDR 0xC08
#define REG_CONV_FILTER_BASE_ADDR 0xC10
#define REG_CONV_SB_BASE_ADDR 0xC18
#define REG_CONV_RESULT_BASE_ADDR 0xC20
#define REG_CONV_IMAGE_PIXEL 0xC28
#define REG_CONV_FILTER_PIXEL 0xC30
#define REG_CONV_RESULT_PIXEL 0xC38
#define REG_CONV_PAD_PIXEL 0xC40
#define REG_CONV_STEP_PIXEL 0xC48
#define REG_CONV_GROUP_NUMBER 0xC50
#define REG_CONV_FILTER_NUMBER 0xC58
#define REG_CONV_CHANNEL_NUMBER 0xC60
#define REG_CONV_FILTER_PER_GROUP 0xC68
#define REG_CONV_CHANNEL_PER_GROUP 0xC70
#define REG_CONV_IMAGE_AMOUNT_PER_ROW 0xC78
#define REG_CONV_IMAGE_ONE_PAD_PER_ROW 0xC80
#define REG_CONV_IMAGE_TWO_PAD_PER_ROW 0xC88
#define REG_CONV_FILTER_AMOUNT_ALL 0xC90
#define REG_CONV_RESULT_AMOUNT_PER_ROW 0xC98
#define REG_CONV_RESULT_LAST_VALID 0xCA0
// ComputeBasicConv writes offsets 0xCA8..0xD20 below as bare hex literals
// (0xca8, 0xcb0, ...) instead of through these names; keep both in sync when
// changing a value.
#define REG_CONV_BLOCK_AMOUNT_PER_ROW 0xCA8
#define REG_CONV_FILTER_PAD_WIDTH_MUL_CH 0xCB0
#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN_F 0xCB8
#define REG_CONV_IMAGE_AMOUNT_PER_ROW_MUL_WIN 0xCC0
#define REG_CONV_IMAGE_BLOCK_NUM 0xCC8
#define REG_CONV_IMAGE_BLOCK_LEN 0xCD0
#define REG_CONV_IMAGE_BLOCK_LEN_LAST 0xCD8
#define REG_CONV_IMAGE_WIN_CNT 0xCE0
#define REG_CONV_IMAGE_WIN_CNT_LAST 0xCE8
#define REG_CONV_RES_ROW_DATA_ALIGN4_PAD 0xCF8
#define REG_CONV_PROG_FULL_CNT 0xD08
#define REG_CONV_POST_PROG_FULL_CNT 0xD10
#define REG_CONV_FPGA_BIAS_SCALE_LEN 0xD20
#define REG_CONV_IMAGE_SCALE 0xD28
#define REG_CONV_FILTER_SCALE 0xD30
/* Element-wise add registers, programmed by ComputeFpgaEWAdd. */
#define REG_EW_CMD 0x0F00
#define REG_EW_IMAGE0_BASE_ADDR 0x0F08
#define REG_EW_IMAGE1_BASE_ADDR 0x0F10
#define REG_EW_RESULT_BASE_ADDR 0x0F18
#define REG_EW_DATA_LEN 0x0F20
#define REG_EW_COEFFICIENT 0x0F28
#define REG_EW_IMAGE_PIXEL 0x0F30
#define REG_EW_IMAGE_AMOUNT_PER_ROW 0x0F38
// Runs a (possibly split) convolution on the FPGA.
//
// Each entry args.conv_arg[0 .. split_num) is dispatched exactly once through
// ComputeBasicConv. When the convolution was split into more than one piece,
// the partial outputs are then merged with ComputeFPGAConcat(args.concat_arg).
//
// Returns 0 when every step succeeded, otherwise the first non-zero status
// reported by ComputeBasicConv / ComputeFPGAConcat. All steps are still
// executed after a failure so the side effects match the previous behaviour;
// only the return value (previously missing, i.e. undefined) is new.
int ComputeFpgaConv(const struct SplitConvArgs &args) {
#ifdef FPGA_PRINT_MODE
  DLOG << "=============ComputeFPGAConv===========";
  DLOG << " filter_num:" << args.filter_num
       << " group_num:" << args.group_num
       << " split_num:" << args.split_num;
#endif
  int status = 0;
  const int split_num = args.split_num;
  // Note: the loop already covers conv_arg[0]; it must not be issued a second
  // time ahead of the loop.
  for (int i = 0; i < split_num; i++) {
    const int ret = ComputeBasicConv(args.conv_arg[i]);
    if (status == 0) {
      status = ret;
    }
  }
  if (split_num > 1) {
    const int ret = ComputeFPGAConcat(args.concat_arg);
    if (status == 0) {
      status = ret;
    }
  }
  return status;
}
int ComputeBasicConv(const struct ConvArgs &args) {
......@@ -47,9 +203,237 @@ int ComputeBasicConv(const struct ConvArgs &args) {
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
cout << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
cout << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#ifndef PADDLE_MOBILE_ZU5
return 0;
#ifdef PADDLE_MOBILE_ZU5
DLOG << "Conv";
// return 0;
uint64_t timer_cnt;
uint64_t output_scale;
uint64_t image_scale;
uint64_t filter_scale;
uint64_t image_address_phy = 0;
uint64_t sb_address_phy = 0;
uint64_t filter_address_phy = 0;
uint64_t output_address_phy = 0;
int ret = 0;
fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float));
fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float));
cout << "image_scale :" << hex << (image_scale) << endl;
cout << "filter_scale :" << hex << (filter_scale) << endl;
uint64_t filterlen = (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height *
(uint64_t)args.image.channels;
filterlen = align_to_x(filterlen, FILTER_ALIGN);
filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGN);
uint64_t fpga_bias_scale_len =
align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
uint64_t output_height =
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
1;
uint64_t output_width =
(args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1;
uint64_t output_size =
output_height * output_width * (uint64_t)args.filter_num;
uint64_t filter_per_group = (uint64_t)(args.filter_num / args.group_num);
uint64_t channel_per_group = (uint64_t)(args.image.channels / args.group_num);
uint64_t image_row_count = ((uint64_t)args.image.width) *
((uint64_t)args.image.channels); // without align
uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGN);
uint64_t image_one_pad_per_row =
align_to_x(image_row_count, IMAGE_ALIGN) +
((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
uint64_t filter_amount_all =
align_to_x(((uint64_t)args.kernel.height) *
((uint64_t)args.kernel.width) * channel_per_group,
FILTER_ALIGN);
uint64_t output_amount_per_row =
align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGN);
// find the opt partition strategy
uint64_t res_win;
uint64_t res_fit = 0;
for (res_win = 1; res_win <= output_width; res_win = res_win + 1) {
if ((align_to_x(
(args.image.channels *
(args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
IMAGE_ALIGN) /
16 +
1) *
args.kernel.height >
2048) {
break;
}
}
if (res_win != output_width) {
res_win -= 1;
}
if (((res_win % 2) != 0) && (res_win != 1)) {
res_win = res_win - 1;
}
res_fit = res_win;
uint64_t block_num = (output_width + res_fit - 1) / res_fit;
uint64_t block_len = res_fit;
uint64_t block_last = output_width - res_fit * (block_num - 1);
uint64_t res_amount_per_row = output_width * args.filter_num;
uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
uint64_t image_block_amount_per_row =
args.kernel.stride_w * (res_fit)*args.image.channels;
uint64_t filter_pad_width_mul_channel =
args.image.pad_width * args.image.channels;
uint64_t image_amount_per_row_multi_win_first =
image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
uint64_t image_amount_per_row_multi_win =
image_amount_per_row * (4 * args.kernel.stride_h);
uint64_t image_block_num = block_num;
uint64_t image_block_len =
align_to_x((args.image.channels *
(args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
IMAGE_ALIGN) /
16 +
1;
uint64_t image_block_len_last =
align_to_x(
(args.image.channels *
(args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
IMAGE_ALIGN) /
16 +
1;
uint64_t image_win_cnt = block_len;
uint64_t image_win_cnt_last = block_last;
uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8;
uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
if (prog_full_cnt == 1023) {
prog_full_cnt--;
}
uint64_t post_prog_full_cnt =
(512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
: 0;
image_address_phy = vaddr_to_paddr(args.image.address);
sb_address_phy = vaddr_to_paddr(args.sb_address);
filter_address_phy = vaddr_to_paddr(args.filter_address);
output_address_phy = vaddr_to_paddr(args.output.address);
/*SDK刷Cache保证数据一致性*/
uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
ret = -EIO;
DLOG << "Conv Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR);
reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR);
reg_writeq(
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
REG_CONV_IMAGE_PIXEL);
reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
REG_CONV_FILTER_PIXEL);
reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL);
reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32),
REG_CONV_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32),
REG_CONV_STEP_PIXEL);
reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP);
reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
reg_writeq(image_block_amount_per_row, 0xca8);
reg_writeq(filter_pad_width_mul_channel, 0xcb0);
reg_writeq(image_amount_per_row_multi_win_first, 0xcb8);
reg_writeq(image_amount_per_row_multi_win, 0xcc0);
reg_writeq(image_block_num, 0xcc8);
reg_writeq(image_block_len, 0xcd0);
reg_writeq(image_block_len_last, 0xcd8);
reg_writeq(image_win_cnt, 0xce0);
reg_writeq(image_win_cnt_last, 0xce8);
reg_writeq(res_row_data_align4_pad, 0xcf8);
reg_writeq(prog_full_cnt, 0xd08);
reg_writeq(post_prog_full_cnt, 0xd10);
reg_writeq(fpga_bias_scale_len / 4, 0xd20);
/*write scale*/
reg_writeq(image_scale, REG_CONV_IMAGE_SCALE);
reg_writeq(filter_scale, REG_CONV_FILTER_SCALE);
reg_writeq(cmd, REG_CONV_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
ret = -EIO;
DLOG << "Conv Wait Irq Timeout!";
}
DLOG << "after reg poll";
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
cout << "output_scale :" << hex << (output_scale) << endl;
//*(args.output.scale_address) = output_scale;
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
......@@ -74,8 +458,135 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#ifdef PADDLE_MOBILE_ZU5
DLOG << "Polling";
// return 0;
uint64_t output_scale = 0;
uint64_t timer_cnt = 0;
int ret = 0;
uint64_t cmd = 0;
uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0;
image_physical_address = vaddr_to_paddr(args.image.address);
output_physical_address = vaddr_to_paddr(args.output.address);
uint32_t output_height = (uint32_t)(
(args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h +
1);
uint32_t output_width = (uint32_t)(
(args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w +
1);
uint64_t image_amount_per_row = align_to_x(
(uint64_t)args.image.width * (uint64_t)args.image.channels, IMAGE_ALIGN);
uint64_t image_one_pad_per_row =
align_to_x((uint64_t)args.image.width * (uint64_t)args.image.channels,
FILTER_ALIGN) +
(uint64_t)args.image.pad_width * (uint64_t)args.image.channels;
uint64_t image_two_pad_per_row = align_to_x(
((uint64_t)args.image.width + (uint64_t)args.image.pad_width * 2) *
(uint64_t)args.image.channels,
IMAGE_ALIGN);
uint64_t image_row_mul_pooling_hight =
image_amount_per_row * (uint64_t)args.kernel.height;
uint64_t image_row_mul_pad_hight =
image_amount_per_row * (uint64_t)args.image.pad_height;
uint64_t image_row_mul_step_hight =
image_amount_per_row * (uint64_t)args.kernel.stride_h;
uint64_t result_amount_align_32 = align_to_x(
(uint64_t)output_width * (uint64_t)args.image.channels, FILTER_ALIGN);
uint64_t result_amount_align_64 = align_to_x(
(uint64_t)output_width * (uint64_t)args.image.channels, IMAGE_ALIGN);
uint64_t image_calcu_height =
(uint64_t)args.kernel.height +
((uint64_t)output_height - 1) * (uint64_t)args.kernel.stride_h;
uint64_t image_pad_left = args.image.channels * args.image.pad_width;
uint64_t image_skip_window = args.image.channels * args.kernel.stride_w;
uint64_t image_padleft_skipwindow =
(image_skip_window << 32) | image_pad_left;
uint64_t mode_reciprocal = (uint64_t)0 | ((uint64_t)args.mode) << 16 |
(((uint64_t)args.kernel_reciprocal));
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO;
DLOG << "Conv Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, REG_POOLING_IMAGE_BASE_ADDR);
reg_writeq(output_physical_address, REG_POOLING_RESULT_BASE_ADDR);
reg_writeq(
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
REG_POOLING_IMAGE_PIXEL);
reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
REG_POOLING_WINDOW_SIZE);
reg_writeq(((uint64_t)output_height) | (((uint64_t)output_width) << 32),
REG_POOLING_RESULT_PIXEL);
reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32),
REG_POOLING_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32),
REG_POOLING_STEP_PIXEL);
reg_writeq((uint64_t)args.image.channels, REG_POOLING_CHANNEL_NUMBER);
reg_writeq(image_amount_per_row, REG_POOLING_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_one_pad_per_row, REG_POOLING_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(image_two_pad_per_row, REG_POOLING_IMAGE_TWO_PAD_PER_ROW);
reg_writeq(image_row_mul_pooling_hight,
REG_POOLING_IMAGE_ROW_MUL_WINDOW_HEIGHT);
reg_writeq(image_row_mul_pad_hight, REG_POOLING_IMAGE_ROW_MUL_PAD_HEIGHT);
reg_writeq(image_row_mul_step_hight, REG_POOLING_IMAGE_ROW_MUL_STEP_HEIGHT);
reg_writeq(result_amount_align_32, REG_POOLING_RESULT_AMOUNT_ALIGN_32);
reg_writeq(result_amount_align_64, REG_POOLING_RESULT_AMOUNT_ALIGN_64);
reg_writeq(image_calcu_height, REG_POOLING_IMAGE_CALCU_HEIGHT);
reg_writeq(image_padleft_skipwindow, REG_POOLING_IMAGE_PADLEFT_SKIPWINDOW);
reg_writeq(mode_reciprocal, REG_POOLING_MODE_RECIPROCAL);
/*SDK刷Cache保证数据一致性*/
reg_writeq(cmd, REG_POOLING_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
ret = -EIO;
DLOG << "Pooling Wait Irq Timeout!";
}
DLOG << "after reg poll";
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
}
......@@ -103,8 +614,73 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#ifdef PADDLE_MOBILE_ZU5
DLOG << "Conv";
// return 0;
int ret = 0;
uint64_t output_scale = 0;
uint64_t timer_cnt = 0;
uint64_t image0_address_phy = 0;
uint64_t image1_address_phy = 0;
uint64_t output_address_phy = 0;
uint64_t cmd = args.relu_enabled ? USE_RELU : 0;
uint64_t datalen = (uint64_t)args.image0.width *
(uint64_t)args.image0.height *
(uint64_t)args.image0.channels;
uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO;
DLOG << "Conv Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
image0_address_phy = vaddr_to_paddr(args.image0.address);
image1_address_phy = vaddr_to_paddr(args.image1.address);
output_address_phy = vaddr_to_paddr(args.output.address);
uint64_t image_amount_per_row =
align_to_x((uint64_t)args.image0.width * (uint64_t)args.image0.channels,
IMAGE_ALIGN);
uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
((uint64_t)args.image0.width << 16) |
(uint64_t)args.image0.height;
/*SDK刷Cache保证数据一致性*/
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR);
reg_writeq(datalen, REG_EW_DATA_LEN);
reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL);
reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW);
reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR);
reg_writeq(coefficient, REG_EW_COEFFICIENT);
reg_writeq(cmd, REG_EW_CMD);
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR;
ret = -EIO;
DLOG << "EW Wait Irq Timeout!";
}
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
}
......@@ -126,8 +702,117 @@ int PerformBypass(const struct BypassArgs &args) {
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#endif
#ifndef PADDLE_MOBILE_ZU5
return 0;
#ifdef PADDLE_MOBILE_ZU5
DLOG << "Bypass";
// return 0;
struct fpga_pe *pe;
uint64_t output_scale = 0;
uint64_t timer_cnt = 0;
uint64_t cmd = 0;
uint64_t datalen = 0;
uint64_t input_address_phy = 0;
uint64_t output_address_phy = 0;
uint8_t data_cell_in = 0;
uint8_t data_cell_out = 0;
int ret = 0;
datalen = (uint64_t)args.image.width * (uint64_t)args.image.height *
(uint64_t)args.image.channels;
datalen = align_to_x(datalen, 16);
input_address_phy = vaddr_to_paddr(args.image.address);
output_address_phy = vaddr_to_paddr(args.output.address);
DLOG << "input_phy:" << input_address_phy;
DLOG << "output_phy:" << output_address_phy;
switch (args.input_data_type) {
case DATA_TYPE_FP16: {
switch (args.output_data_type) {
case DATA_TYPE_FP16:
data_cell_in = SIZE_FP16;
data_cell_out = SIZE_FP16;
cmd = CMD_FP16_TO_FP16;
break;
case DATA_TYPE_FP32:
data_cell_in = SIZE_FP16;
data_cell_out = SIZE_FP32;
cmd = CMD_FP16_TO_FP32;
break;
default:
break;
}
} break;
case DATA_TYPE_FP32: {
switch (args.output_data_type) {
case DATA_TYPE_FP16:
data_cell_in = SIZE_FP32;
data_cell_out = SIZE_FP16;
cmd = CMD_FP32_TO_FP16;
break;
case DATA_TYPE_FP32:
data_cell_in = SIZE_FP32;
data_cell_out = SIZE_FP32;
cmd = CMD_FP32_TO_FP32;
break;
default:
break;
}
} break;
default:
break;
}
if (cmd != CMD_FP16_TO_FP16 && cmd != CMD_FP16_TO_FP32 &&
cmd != CMD_FP32_TO_FP16 && cmd != CMD_FP32_TO_FP32) {
return -EFAULT;
}
if ((data_cell_in != SIZE_FP16 && data_cell_in != SIZE_FP32) ||
(data_cell_out != SIZE_FP16 && data_cell_out != SIZE_FP32)) {
return -EFAULT;
}
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status) {
ret = -EIO;
DLOG << "Bypass Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(input_address_phy, REG_CONVERT_SRC_ADDR);
reg_writeq(output_address_phy, REG_CONVERT_DST_ADDR);
reg_writeq(datalen, REG_CONVERT_LENGTH);
/*SDK刷Cache保证数据一致性*/
reg_writeq(cmd, REG_CONVERT_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_BYPASS, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_BYPASS]->status = ERROR;
ret = -EIO;
DLOG << "BYPASS Wait Irq Timeout!";
}
DLOG << "after reg poll";
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
#endif
return 0;
......@@ -138,11 +823,14 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
DLOG << "=============ComputeFpgaConcat===========";
DLOG << " Image_num: " << args.image_num
<< " out_address:" << args.image_out
<< " out_scale_address:" << args.scale_out;
<< " out_scale_address:" << args.scale_out
<< " out_channel:" << args.out_channel;
DLOG << " image_height:" << args.height << " image_width:" << args.width;
for (int i = 0; i < args.image_num; i++) {
DLOG << " " << i << "th: ";
DLOG << " channel_num:" << args.channel_num[i]
DLOG << " channel_num:"
<< args.channel_num[i]
// << " aligned_channel_num:" << args.aligned_channel_num[i]
<< " image_address:" << args.images_in[i]
<< " image_scale_address:" << args.scales_in[i];
}
......@@ -154,6 +842,82 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
return 0;
}
// Reassembles the output of the deconvolution sub-convolutions into a single
// trimmed image and swaps it in for *data_in.
//
// The untrimmed result has (sub_height * sub_conv_n) rows of
// (sub_width * sub_conv_n) * channel elements, each row padded to a multiple
// of 16 elements. Row hh of that result is read from sub-block
// (hh % sub_conv_n), row (hh / sub_conv_n) of the input buffer. The first and
// last `omit_size` rows are dropped, and `omit_size * channel` elements are
// skipped at the start of every kept row; each output row holds
// (origin_w - 2 * omit_size) * channel elements, again padded to 16.
//
// A new zero-filled buffer is obtained with fpga_malloc, *data_in is pointed
// at it, and the previous buffer is released with fpga_free.
void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel,
                         int sub_height, int sub_width, int omit_size) {
  // Geometry of the untrimmed (source) image.
  const int full_h = sub_height * sub_conv_n;
  const int full_w = sub_width * sub_conv_n;
  const int src_stride = align_to_x(full_w * channel, 16);

  // Geometry of the trimmed (destination) image.
  const int out_h = full_h - 2 * omit_size;
  const int out_w = full_w - 2 * omit_size;
  const int out_row_len = out_w * channel;
  const int out_stride = align_to_x(out_row_len, 16);

  half *src = *data_in;
  half *dst = (half *)fpga_malloc(num * out_stride * out_h * sizeof(half));
  memset(dst, 0, num * out_stride * out_h * sizeof(half));

  int dst_offset = 0;
  for (int nn = 0; nn < num; ++nn) {
    for (int hh = 0; hh < full_h; ++hh) {
      // Only rows strictly inside the omitted top/bottom border are copied.
      const bool kept = (hh >= omit_size) && (hh < (full_h - omit_size));
      if (!kept) {
        continue;
      }
      const int block = hh % sub_conv_n;       // which sub-convolution
      const int block_row = hh / sub_conv_n;   // row inside that sub-block
      half *block_base = src + block * sub_height * src_stride;
      const int row_start = (nn * full_h * src_stride +
                             block_row * src_stride + omit_size * channel);
      fpga_copy(dst + dst_offset, block_base + row_start,
                sizeof(half) * out_row_len);
      dst_offset += out_stride;
    }
  }

  *data_in = dst;
  fpga_free(src);
}
// Runs a deconvolution that has been decomposed into args.sub_conv_num
// ordinary convolutions.
//
// Every args.conv_args[i] is executed with ComputeBasicConv. When there is
// more than one sub-convolution, the two-float scale record of the
// sub-convolution with the largest scale_address[0] is copied to
// args.output.scale_address, and deconv_post_process() then rebuilds
// args.output.address from the sub-results (it replaces the buffer the
// pointer refers to).
//
// Always returns 0.
// NOTE(review): the logging guard here is FPGA_TEST_MODE while the sibling
// routines use FPGA_PRINT_MODE -- confirm which macro is intended.
int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#ifdef FPGA_TEST_MODE
  DLOG << "=============ComputeFPGADeConv===========";
  DLOG << " filter_num:" << args.filter_num
       << " group_num:" << args.group_num
       << " sub_conv_num:" << args.sub_conv_num;
#endif

  int sub_conv_num = args.sub_conv_num;

  for (int i = 0; i < sub_conv_num; i++) {
    ComputeBasicConv(args.conv_args[i]);
  }

  if (sub_conv_num > 1) {
    float max_scale = -1.0;
    for (int i = 0; i < sub_conv_num; i++) {
      float ptr_scale = (args.conv_args[i].output.scale_address)[0];
      if (ptr_scale > max_scale) {
        // Track the running maximum; without this update the last
        // sub-convolution with a scale above -1 would win instead of the
        // largest one.
        max_scale = ptr_scale;
        args.output.scale_address[0] = ptr_scale;
        args.output.scale_address[1] =
            (args.conv_args[i].output.scale_address)[1];
      }
    }

    // The cast drops the constness of `args` on purpose: post-processing
    // swaps the output buffer pointer stored in the argument struct.
    deconv_post_process((half **)(&(args.output.address)), args.sub_conv_num, 1,
                        args.filter_num, (args.sub_output_height),
                        (args.sub_output_width), args.omit_size);
  }
  return 0;
}
int ComputeFPGASplit(const struct SplitArgs &args) {
#ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFpgaSplit===========";
......@@ -173,6 +937,5 @@ int ComputeFPGASplit(const struct SplitArgs &args) {
args.height, args.width);
return 0;
}
} // namespace fpga
} // namespace paddle_mobile
......@@ -137,11 +137,13 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
for (i = 0; i < timeout; i++) {
if (val == reg_readq(reg)) {
std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg
<< std::endl;
break;
}
}
if (i <= timeout) {
if (i < timeout) {
return 0;
} else {
return -1;
......@@ -153,6 +155,12 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
unsigned int nr = (unsigned int)_nr;
int ret = 0;
DLOG << size;
DLOG << _nr;
DLOG << nr;
uint64_t a_size = FPGA_PAGE_SIZE * nr;
DLOG << a_size;
pthread_mutex_lock(&memory->mutex);
......@@ -166,6 +174,7 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
*addr = address_ofset;
} else {
DLOG << "memory request failed!";
ret = -ENOMEM;
}
......@@ -282,7 +291,7 @@ uint64_t vaddr_to_paddr(void *address) {
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
paddr = iter->second;
} else {
DLOG << "Invalid pointer";
DLOG << "Invalid pointer: " << address;
}
return paddr;
......@@ -348,6 +357,11 @@ void fpga_free_driver(void *ptr) {
fpga_bitmap::bitmap_clear(g_fpgainfo.memory_info->bitmap, pos,
g_fpgainfo.memory_info->nr[pos]);
pthread_mutex_unlock(&g_fpgainfo.memory_info->mutex);
auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(ptr);
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
g_fpgainfo.fpga_vaddr2paddr_map.erase(iter);
}
} else {
DLOG << "Invalid pointer";
}
......
......@@ -17,6 +17,7 @@ limitations under the License. */
#include <ctype.h>
#include <stdio.h>
#include <stdlib.h>
#include <unistd.h>
#include <cstring>
#include <map>
......@@ -44,7 +45,7 @@ const int PE_IDX_POOLING = 1;
const int PE_IDX_EW = 2;
const int PE_IDX_BYPASS = 3;
// State of a processing engine. The compute routines set ERROR when an
// interrupt poll times out and refuse to start new work while it is set.
// (Single definition: the two-value and three-value forms must not coexist.)
enum pe_status { IDLE = 0, BUSY = 1, ERROR = 2 };
struct MemoryCacheArgs {
void *offset;
......@@ -58,7 +59,7 @@ struct MemoryCacheArgs {
// Bookkeeping record for one FPGA processing engine (PE).
struct fpga_pe {
  // Name of the PE type; the extra element presumably holds the terminating
  // NUL.
  char type_name[MAX_TYPE_NAME_LENTH + 1];
  // Presumably a back-pointer to the owning pe_data_s -- confirm against the
  // driver initialisation code.
  struct pe_data_s *outer;
  // Current state (see pe_status). Declared exactly once; the compute
  // routines check it for ERROR before programming the PE and set it to
  // ERROR when the interrupt poll times out.
  pe_status status;
  // NOTE(review): not referenced by the code visible in this change.
  uint64_t interrupt_cnt;
};
......@@ -106,6 +107,8 @@ inline uint64_t reg_readq(uint32_t offset) {
uint64_t value =
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset); // NOLINT
// DLOG << "read end";
usleep(10);
return value;
}
......@@ -114,6 +117,8 @@ inline void reg_writeq(uint64_t value, uint32_t offset) {
// DLOG << "offset : " << offset << ", value : " << value;
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset) = value;
// DLOG << "write end";
usleep(10);
}
int open_device_driver();
......
......@@ -74,12 +74,21 @@ struct ConcatArgs {
void* image_out;
float* scale_out;
uint32_t* channel_num;
// uint32_t* aligned_channel_num;
// uint32_t out_channel;
uint32_t* aligned_channel_num;
uint32_t out_channel;
uint32_t height;
uint32_t width;
};
// Arguments for ComputeFpgaConv: one convolution expressed as `split_num`
// basic convolutions plus the concat step that merges their outputs.
struct SplitConvArgs {
// Number of entries in conv_arg; ComputeFpgaConv runs each of them and only
// performs the concat when this is greater than 1.
uint32_t split_num;
// group_num / filter_num are only logged by ComputeFpgaConv in the visible
// code.
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
// Array of per-split convolution arguments, indexed 0 .. split_num-1.
struct ConvArgs* conv_arg;
// Passed to ComputeFPGAConcat when split_num > 1.
struct ConcatArgs concat_arg;
};
struct SplitArgs {
uint32_t image_num;
int16_t* image_in;
......@@ -91,15 +100,6 @@ struct SplitArgs {
uint32_t width;
};
struct SplitConvArgs {
uint32_t split_num;
uint32_t group_num;
uint32_t filter_num;
struct ImageOutputArgs output;
struct ConvArgs* conv_arg;
struct ConcatArgs concat_arg;
};
struct PoolingArgs {
int16_t mode; // mode: 0:max, 1:avg
int16_t kernel_reciprocal;
......@@ -127,7 +127,14 @@ struct BypassArgs {
};
// Arguments for ComputeFpgaDeconv: a deconvolution expressed as
// `sub_conv_num` ordinary convolutions whose outputs are stitched together.
struct DeconvArgs {
// NOTE(review): ComputeFpgaDeconv only reads conv_args (plural) in the
// visible code; this single-conv member looks like a leftover -- confirm
// whether anything still uses it.
struct ConvArgs conv_arg;
// Number of entries in conv_args; also passed to deconv_post_process as the
// sub-block count.
uint32_t sub_conv_num;
// Only logged by ComputeFpgaDeconv in the visible code.
uint32_t group_num;
// Passed to deconv_post_process as its `channel` argument.
uint32_t filter_num;
// Number of border rows / pixels trimmed by deconv_post_process.
uint32_t omit_size;
// Width and height of one sub-convolution output, as consumed by
// deconv_post_process.
uint32_t sub_output_width;
uint32_t sub_output_height;
// Final output; its scale is taken from the sub-convolutions and its
// address is replaced by deconv_post_process when sub_conv_num > 1.
struct ImageOutputArgs output;
// Array of per-sub-convolution arguments, indexed 0 .. sub_conv_num-1.
struct ConvArgs* conv_args;
};
// Rounds `num` up to the next multiple of `x` using truncating integer
// division, i.e. ((num + x - 1) / x) * x. Callers pass positive alignments.
static inline int align_to_x(int num, int x) {
  const int blocks = (num + x - 1) / x;  // number of x-sized blocks needed
  return blocks * x;
}
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册