提交 2c7af720 编写于 作者: qnqinan's avatar qnqinan 提交者: Jiaying Zhao

fix bugs of concat, reshape and slice op and add usleep in fpga regpoll,...

fix bugs of concat, reshape and slice op and add usleep in fpga regpoll, test=develop,  close #2501 (#2502)

* update proposal and psroipool kernel file in FPGA V2 track

* update, test=develop

* update FPGA v2 pe cpp file and ew kernel files, test=develop

* fix a bug of sigmoid kernel in FPGA v2 track, test=develop

* fix bugs of concat, reshape and slice op and add usleep in fpga regpoll, test=develop

* add interrupt clear operation before op compute in FPGA V2 track, test=develop
上级 ab362576
...@@ -83,11 +83,6 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, ...@@ -83,11 +83,6 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
height * height *
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
sizeof(int8_t)); sizeof(int8_t));
for (j = 0;
j < height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
j++) {
images_in_tmp[i][j] = (int8_t)(images_in[i][j] * Ck + 0.5);
}
} }
align_each_out_area_cw = align_each_out_area_cw =
align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT); align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT);
...@@ -102,7 +97,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, ...@@ -102,7 +97,7 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
memcpy( memcpy(
(int8_t *)image_out + tmp_channel + // NOLINT (int8_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ, k * align_each_out_area_cw_differ,
images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw, images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int8_t)); channel_num[i] * sizeof(int8_t));
tmp_channel += channel_num[i]; tmp_channel += channel_num[i];
...@@ -110,6 +105,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out, ...@@ -110,6 +105,10 @@ void concat_images(int8_t **images_in, float **scales_in, void *image_out,
} }
} }
fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t)); fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t));
for (i = 0; i < image_num; i++) {
fpga_free(images_in_tmp[i]);
}
fpga_free(images_in_tmp);
} }
void split_image(int8_t *image_in, void **images_out, int image_num, void split_image(int8_t *image_in, void **images_out, int image_num,
......
...@@ -248,8 +248,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -248,8 +248,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
// DLOG << " activation_type:" << active_args.activation_type // DLOG << " activation_type:" << active_args.activation_type
// << " leaky_relu_negative_slope:" // << " leaky_relu_negative_slope:"
// << active_args.leaky_relu_negative_slope; // << active_args.leaky_relu_negative_slope;
// DLOG << " reg_ActivationArgs:" << reg_ActivationArgs; DLOG << " reg_ActivationArgs:";
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
ret = -EIO; ret = -EIO;
...@@ -257,6 +257,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -257,6 +257,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
} }
// reg_writeq(reg_ActivationArgs,
// REG_ACTIVATION_MODE_AND_LEAKY_RELU_FACTOR); // active functoion
reg_writeq(output_scale, REG_SCALE_PARAMETER);
// new // new
reg_writeq((args.driver.row_padding_down << 45) | reg_writeq((args.driver.row_padding_down << 45) |
(args.driver.row_padding_up << 34) | (args.driver.row_padding_up << 34) |
...@@ -365,7 +369,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -365,7 +369,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
uint64_t cmd = 0; uint64_t cmd = 0;
uint64_t image_physical_address = 0; uint64_t image_physical_address = 0;
uint64_t output_physical_address = 0; uint64_t output_physical_address = 0;
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
image_physical_address = vaddr_to_paddr(args.image.address); image_physical_address = vaddr_to_paddr(args.image.address);
output_physical_address = vaddr_to_paddr(args.output.address); output_physical_address = vaddr_to_paddr(args.output.address);
uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64); uint64_t C_paral_64 = align_to_x((uint64_t)args.image.channels, 64);
...@@ -440,7 +444,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -440,7 +444,7 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
} }
reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, 0x808); reg_writeq(image_physical_address, 0x808);
reg_writeq(result_addr_row, 0x810); reg_writeq(result_addr_row, 0x810);
reg_writeq(kernel_padding_step, 0x818); reg_writeq(kernel_padding_step, 0x818);
...@@ -497,7 +501,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -497,7 +501,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#endif #endif
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
int ret = 0; int ret = 0;
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
...@@ -534,6 +538,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -534,6 +538,7 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
uint32_t* ew_scale = reinterpret_cast<uint32_t*>(&quantParam); uint32_t* ew_scale = reinterpret_cast<uint32_t*>(&quantParam);
uint64_t ew_scale_mult_factor = (*ew_scale) | uint64_t ew_scale_mult_factor = (*ew_scale) |
((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40); ((uint64_t)args.const0 << 32) | ((uint64_t)args.const1 << 40);
reg_writeq(0ul, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, 0x808); reg_writeq(image_physical_address, 0x808);
reg_writeq(result_addr_row, 0x810); reg_writeq(result_addr_row, 0x810);
reg_writeq(kernel_padding_step, 0x818); reg_writeq(kernel_padding_step, 0x818);
...@@ -928,6 +933,7 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -928,6 +933,7 @@ int ComputeDWConv(const struct DWconvArgs &args) {
#endif #endif
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
DLOG << "DWConv"; DLOG << "DWConv";
uint64_t bypass_interrupt = reg_readq(REG_INTERRUPT);
// return 0; // return 0;
uint64_t timer_cnt = 0; uint64_t timer_cnt = 0;
int ret = 0; int ret = 0;
...@@ -1011,7 +1017,7 @@ int ComputeDWConv(const struct DWconvArgs &args) { ...@@ -1011,7 +1017,7 @@ int ComputeDWConv(const struct DWconvArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
} }
reg_writeq(0ul, REG_SCALE_PARAMETER);
reg_writeq(image_physical_address, 0x808); reg_writeq(image_physical_address, 0x808);
reg_writeq(result_addr_row, 0x810); reg_writeq(result_addr_row, 0x810);
reg_writeq(kernel_padding_step, 0x818); reg_writeq(kernel_padding_step, 0x818);
......
...@@ -134,9 +134,9 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { ...@@ -134,9 +134,9 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
uint64_t i = 0; uint64_t i = 0;
/*timeout精确性待确认*/ /*timeout精确性待确认*/
int64_t timeout = time * 6; int64_t timeout = time * 6;
usleep(1);
for (i = 0; i < timeout; i++) { for (i = 0; i < timeout; i++) {
usleep(1);
if (val == reg_readq(reg)) { if (val == reg_readq(reg)) {
break; break;
} }
......
...@@ -211,6 +211,7 @@ struct ConcatArgs { ...@@ -211,6 +211,7 @@ struct ConcatArgs {
uint32_t out_channel; uint32_t out_channel;
uint32_t height; uint32_t height;
uint32_t width; uint32_t width;
std::vector<std::shared_ptr<char>> vector_concat_space;
}; };
struct SplitConvArgs { struct SplitConvArgs {
......
...@@ -53,6 +53,15 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) { ...@@ -53,6 +53,15 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
concatArgs.channel_num = channel_num; concatArgs.channel_num = channel_num;
concatArgs.height = height; concatArgs.height = height;
concatArgs.width = width; concatArgs.width = width;
auto deleter = [](void *p) { fpga::fpga_free(p); };
concatArgs.vector_concat_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(concatArgs.images_in), deleter));
concatArgs.vector_concat_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(concatArgs.scales_in), deleter));
concatArgs.vector_concat_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(concatArgs.channel_num), deleter));
param->SetFpgaArgs(concatArgs); param->SetFpgaArgs(concatArgs);
return true; return true;
} }
......
...@@ -110,7 +110,27 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) { ...@@ -110,7 +110,27 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
} }
} }
output->Resize(framework::make_ddim(shape)); output->Resize(framework::make_ddim(shape));
bool reshapeNeedFlg = 1;
if (output->dims() == input->dims()) { if (output->dims() == input->dims()) {
reshapeNeedFlg = 0;
} else if (output->dims().size() != input->dims().size()) {
auto inputdimsize = input->dims().size();
auto outputdimsize = output->dims().size();
int smallersize =
inputdimsize > outputdimsize ? outputdimsize : inputdimsize;
int i = 0;
for (i = 0; i < smallersize; i++) {
if ((input->dims())[i] != (output->dims())[i])
break;
}
if (i == smallersize) {
reshapeNeedFlg = 0;
}
}
if (reshapeNeedFlg) {
reshape(input, output);
} else {
DLOG << "No need to reshape"; DLOG << "No need to reshape";
output->ShareDataWith(*input); output->ShareDataWith(*input);
framework::LoD lod = input->lod(); framework::LoD lod = input->lod();
...@@ -118,9 +138,6 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) { ...@@ -118,9 +138,6 @@ void Reshape2Kernel<FPGA, float>::Compute(const Reshape2Param<FPGA> &param) {
output->scale[0] = input->scale[0]; output->scale[0] = input->scale[0];
return; return;
} }
reshape(input, output);
//
} }
} // namespace operators } // namespace operators
......
...@@ -30,6 +30,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) { ...@@ -30,6 +30,7 @@ bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
} }
return true; return true;
} }
template <> template <>
void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) { void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
// Only support slicing in channel dimension // Only support slicing in channel dimension
...@@ -38,6 +39,8 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) { ...@@ -38,6 +39,8 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
auto input = param.input_; auto input = param.input_;
auto output = param.output_; auto output = param.output_;
int H = input->dims()[2];
int W = input->dims()[3];
int HW = input->dims()[2] * input->dims()[3]; int HW = input->dims()[2] * input->dims()[3];
int channel = input->dims()[1]; int channel = input->dims()[1];
auto input_ptr = input->data<int8_t>(); auto input_ptr = input->data<int8_t>();
...@@ -53,10 +56,32 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) { ...@@ -53,10 +56,32 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
end = end > channel ? channel : end; end = end > channel ? channel : end;
int len = end - start; int len = end - start;
size_t size = len * sizeof(int8_t); size_t size = len * sizeof(int8_t);
DLOG << input->fpga_data_num;
fpga::fpga_invalidate(input_ptr, input->fpga_data_num*sizeof(int8_t));
DLOG << output->fpga_data_num;
fpga::fpga_invalidate(output_ptr, output->fpga_data_num*sizeof(int8_t));
int unalignedWC = len * W;
int alignedWC = fpga::align_to_x(W * len, IMAGE_ALIGNMENT);
for (int i = 0; i < HW; i++) { if (unalignedWC != alignedWC) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start, size); auto tmpOutput = reinterpret_cast<int8_t*>
(fpga::fpga_malloc(len*HW * sizeof(int8_t)));
for (int i = 0; i < HW; i++) {
memcpy(tmpOutput + len * i, input_ptr + i * channel + start, size);
}
for (int i = 0; i < H; i++) {
for (int j = 0; j < unalignedWC; j++) {
*(output_ptr + alignedWC * i + j) =
*(tmpOutput + unalignedWC * i + j);
}
}
fpga::fpga_free(tmpOutput);
} else {
for (int i = 0; i < HW; i++) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
}
} }
fpga::fpga_flush(output_ptr, output->fpga_data_num*sizeof(int8_t));
} }
} // namespace operators } // namespace operators
} // namespace paddle_mobile } // namespace paddle_mobile
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册