提交 1e8dd938 编写于 作者: Z zhangyang

Remove redundant code in V1 for the FPGA track

上级 d74fdd19
...@@ -10,6 +10,7 @@ option(LOG_PROFILE "log profile" OFF) ...@@ -10,6 +10,7 @@ option(LOG_PROFILE "log profile" OFF)
option(CPU "armv7 with neon" ON) option(CPU "armv7 with neon" ON)
option(GPU_MALI "mali gpu" OFF) option(GPU_MALI "mali gpu" OFF)
option(GPU_CL "opencl gpu" OFF) option(GPU_CL "opencl gpu" OFF)
option(FPGA "fpga" OFF) option(FPGA "fpga" OFF)
if(FPGA) if(FPGA)
option(FPGAV1 "fpga v1" ON) option(FPGAV1 "fpga v1" ON)
...@@ -144,7 +145,7 @@ if(FPGA) ...@@ -144,7 +145,7 @@ if(FPGA)
endforeach() endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h) file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V2/*.h src/fpga/V2/*.h)
foreach(f ${_tmp_list}) foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach() endforeach()
endif() endif()
if(FPGAV2) if(FPGAV2)
...@@ -156,7 +157,7 @@ if(FPGA) ...@@ -156,7 +157,7 @@ if(FPGA)
endforeach() endforeach()
file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h) file(GLOB_RECURSE _tmp_list src/operators/kernel/fpga/V1/*.h src/fpga/V1/*.h)
foreach(f ${_tmp_list}) foreach(f ${_tmp_list})
list(REMOVE_ITEM PADDLE_MOBILE_CC ${f}) list(REMOVE_ITEM PADDLE_MOBILE_H ${f})
endforeach() endforeach()
endif() endif()
......
...@@ -24,8 +24,6 @@ namespace fpga { ...@@ -24,8 +24,6 @@ namespace fpga {
#define USE_RELU 1 #define USE_RELU 1
#define USE_BIAS 2 #define USE_BIAS 2
// Returns `cw` rounded up to a multiple of IMAGE_ALIGNMENT, as computed by
// align_to_x().
int get_align_image_cw(int cw) {
  const auto aligned_cw = align_to_x(cw, IMAGE_ALIGNMENT);
  return aligned_cw;
}
void format_image(framework::Tensor *image_tensor) { void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims(); auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3]; auto channel = dims[1], height = dims[2], width = dims[3];
...@@ -96,10 +94,6 @@ int get_aligned_filter_element_num(int chw) { ...@@ -96,10 +94,6 @@ int get_aligned_filter_element_num(int chw) {
return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT); return align_to_x(chw, FILTER_ELEMENT_ALIGNMENT);
} }
// Returns `num` rounded up to a multiple of FILTER_NUM_ALIGNMENT, as computed
// by align_to_x().
int get_aligned_filter_num(int num) {
  const auto aligned_num = align_to_x(num, FILTER_NUM_ALIGNMENT);
  return aligned_num;
}
void format_filter(framework::Tensor *filter_tensor, float max_value, void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) { int group_num) {
filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT filter_tensor->scale[0] = float(max_value / 127.0); // NOLINT
...@@ -177,46 +171,37 @@ void format_concat_output(framework::Tensor *out, int height, int width, ...@@ -177,46 +171,37 @@ void format_concat_output(framework::Tensor *out, int height, int width,
void expand_conv_arg(ConvArgs *arg) { void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg; ConvArgs args = *arg;
uint64_t filterlen = (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height * auto fpga_bias_scale_len =
(uint64_t)args.image.channels;
filterlen = align_to_x(filterlen, FILTER_ELEMENT_ALIGNMENT);
filterlen *= align_to_x((uint64_t)args.filter_num, FILTER_NUM_ALIGNMENT);
uint64_t fpga_bias_scale_len =
align_to_x(args.filter_num / args.group_num, 8) * args.group_num; align_to_x(args.filter_num / args.group_num, 8) * args.group_num;
uint64_t output_height = auto output_height =
(args.image.height + args.image.pad_height * 2 - args.kernel.height) / (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
args.kernel.stride_h + args.kernel.stride_h +
1; 1;
uint64_t output_width = auto output_width =
(args.image.width + args.image.pad_width * 2 - args.kernel.width) / (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
args.kernel.stride_w + args.kernel.stride_w +
1; 1;
uint64_t output_size =
output_height * output_width * (uint64_t)args.filter_num; auto filter_per_group = args.filter_num / args.group_num;
auto channel_per_group = args.image.channels / args.group_num;
auto filter_per_group = (uint64_t)(args.filter_num / args.group_num);
auto channel_per_group = (uint64_t)(args.image.channels / args.group_num); auto image_row_count = args.image.width * args.image.channels;
auto image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
uint64_t image_row_count = ((uint64_t)args.image.width) * auto image_one_pad_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT) +
((uint64_t)args.image.channels); // without align args.image.pad_width * args.image.channels;
uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT); auto filter_amount_all =
uint64_t image_one_pad_per_row = align_to_x(args.kernel.height * args.kernel.width * channel_per_group,
align_to_x(image_row_count, IMAGE_ALIGNMENT) +
((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
uint64_t filter_amount_all =
align_to_x(((uint64_t)args.kernel.height) *
((uint64_t)args.kernel.width) * channel_per_group,
FILTER_ELEMENT_ALIGNMENT); FILTER_ELEMENT_ALIGNMENT);
uint64_t output_amount_per_row = auto output_amount_per_row =
align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGNMENT); align_to_x(output_width * args.filter_num, IMAGE_ALIGNMENT);
// find the opt partition strategy // find the opt partition strategy
uint64_t res_win; uint64_t res_win;
uint64_t res_fit = 0; uint64_t res_fit = 0;
for (res_win = 1; res_win <= output_width; res_win = res_win + 1) { for (res_win = 1; res_win <= output_width; res_win++) {
if ((align_to_x( if ((align_to_x(
(args.image.channels * (args.image.channels *
(args.kernel.width + (res_win - 1) * args.kernel.stride_w)), (args.kernel.width + (res_win - 1) * args.kernel.stride_w)),
...@@ -238,48 +223,48 @@ void expand_conv_arg(ConvArgs *arg) { ...@@ -238,48 +223,48 @@ void expand_conv_arg(ConvArgs *arg) {
} }
res_fit = res_win; res_fit = res_win;
uint64_t block_num = (output_width + res_fit - 1) / res_fit; auto block_num = (output_width + res_fit - 1) / res_fit;
uint64_t block_len = res_fit; auto block_len = res_fit;
uint64_t block_last = output_width - res_fit * (block_num - 1); auto block_last = output_width - res_fit * (block_num - 1);
uint64_t res_amount_per_row = output_width * args.filter_num; auto res_amount_per_row = output_width * args.filter_num;
uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row; auto res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;
uint64_t image_block_amount_per_row = auto image_block_amount_per_row =
args.kernel.stride_w * (res_fit)*args.image.channels; args.kernel.stride_w * res_fit * args.image.channels;
uint64_t filter_pad_width_mul_channel = auto filter_pad_width_mul_channel =
args.image.pad_width * args.image.channels; args.image.pad_width * args.image.channels;
uint64_t image_amount_per_row_multi_win_first = auto image_amount_per_row_multi_win_first =
image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height); image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
uint64_t image_amount_per_row_multi_win = auto image_amount_per_row_multi_win =
image_amount_per_row * (4 * args.kernel.stride_h); image_amount_per_row * (4 * args.kernel.stride_h);
uint64_t image_block_num = block_num; auto image_block_num = block_num;
uint64_t image_block_len = auto image_block_len =
align_to_x((args.image.channels * align_to_x((args.image.channels *
(args.kernel.width + (block_len - 1) * args.kernel.stride_w)), (args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) / IMAGE_ALIGNMENT) /
16 + 16 +
1; 1;
uint64_t image_block_len_last = auto image_block_len_last =
align_to_x( align_to_x(
(args.image.channels * (args.image.channels *
(args.kernel.width + (block_last - 1) * args.kernel.stride_w)), (args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
IMAGE_ALIGNMENT) / IMAGE_ALIGNMENT) /
16 + 16 +
1; 1;
uint64_t image_win_cnt = block_len; auto image_win_cnt = block_len;
uint64_t image_win_cnt_last = block_last; auto image_win_cnt_last = block_last;
uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8; auto res_row_data_align4_pad = res_amount_per_row_pad / 8;
uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1; auto prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
if (prog_full_cnt == 1023) { if (prog_full_cnt == 1023) {
prog_full_cnt--; prog_full_cnt--;
} }
uint64_t post_prog_full_cnt = auto post_prog_full_cnt =
(512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2) (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2) ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
: 0; : 0;
uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; auto cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
(*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address); (*arg).driver.image_address_phy = vaddr_to_paddr(args.image.address);
(*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address); (*arg).driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
...@@ -449,7 +434,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -449,7 +434,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->sub_conv_num = (uint32_t)stride_h; arg->sub_conv_num = (uint32_t)stride_h;
arg->filter_num = (uint32_t)filter->dims()[0]; arg->filter_num = (uint32_t)filter->dims()[0];
int sub_conv_num = arg->sub_conv_num; int sub_conv_num = arg->sub_conv_num;
int sub_stride = 1;
int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3], int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],
padding_w, stride_w); padding_w, stride_w);
int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis( int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis(
...@@ -466,16 +450,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -466,16 +450,12 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
stride_w, (int)filter->dims()[3], padding_w); stride_w, (int)filter->dims()[3], padding_w);
arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs)); arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));
int sub_channels = (int)input->dims()[1]; auto sub_channels = (int)input->dims()[1];
int omit_size = arg->omit_size;
int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
int sub_filter_num = sub_conv_num * (arg->filter_num); int sub_filter_num = sub_conv_num * (arg->filter_num);
int conv_output_size = int conv_output_size =
(align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) * (align_to_x(sub_output_width * sub_filter_num, IMAGE_ALIGNMENT)) *
sub_output_height; sub_output_height;
int ouput_size = conv_output_size * sub_conv_num;
int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT); int align_sub_filter_num = align_to_x(sub_filter_num, FILTER_NUM_ALIGNMENT);
int align_sub_filter_count = int align_sub_filter_count =
...@@ -485,7 +465,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -485,7 +465,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
align_sub_filter_count * align_sub_filter_num; align_sub_filter_count * align_sub_filter_num;
for (int i = 0; i < sub_conv_num; ++i) { for (int i = 0; i < sub_conv_num; ++i) {
arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num); arg->conv_args[i].filter_num = arg->sub_conv_num * arg->filter_num;
arg->conv_args[i].group_num = (uint32_t)group_num; arg->conv_args[i].group_num = (uint32_t)group_num;
arg->conv_args[i].filter_scale_address = filter->scale; arg->conv_args[i].filter_scale_address = filter->scale;
...@@ -496,7 +476,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -496,7 +476,6 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->conv_args[i].kernel.stride_w = 1; arg->conv_args[i].kernel.stride_w = 1;
arg->conv_args[i].kernel.stride_h = 1; arg->conv_args[i].kernel.stride_h = 1;
// DeconvParam.conv_args[i].image.address = (void*)ptr_image;
arg->conv_args[i].image.scale_address = input->scale; arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.channels = (uint32_t)sub_channels; arg->conv_args[i].image.channels = (uint32_t)sub_channels;
arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
...@@ -504,30 +483,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -504,30 +483,31 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->conv_args[i].image.pad_width = (uint32_t)sub_pad; arg->conv_args[i].image.pad_width = (uint32_t)sub_pad;
arg->conv_args[i].image.pad_height = (uint32_t)sub_pad; arg->conv_args[i].image.pad_height = (uint32_t)sub_pad;
arg->conv_args[i].image.address = input_ptr; arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].sb_address = (void *)bs_ptr; arg->conv_args[i].sb_address = bs_ptr;
auto filter_sub_space = auto filter_sub_space =
(char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char)); (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
fpga_copy(filter_sub_space, fpga_copy(filter_sub_space,
(char *)filter_ptr + i * align_conv_sub_filter_count, (char *)filter_ptr + i * align_conv_sub_filter_count,
(size_t)align_conv_sub_filter_count); (size_t)align_conv_sub_filter_count);
arg->conv_args[i].filter_address = (void *)(filter_sub_space); arg->conv_args[i].filter_address = filter_sub_space;
fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count); fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count);
if (sub_conv_num == 1) { if (sub_conv_num == 1) {
arg->conv_args[i].output.address = out_ptr; arg->conv_args[i].output.address = out_ptr;
arg->conv_args[i].output.scale_address = out->scale; arg->conv_args[i].output.scale_address = out->scale;
} else { } else {
auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); auto ptr_output = fpga_malloc(conv_output_size * sizeof(half));
arg->conv_args[i].output.address = (void *)((half *)ptr_output); arg->conv_args[i].output.address = ptr_output;
auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.scale_address = ptr_output_scale; arg->conv_args[i].output.scale_address = ptr_output_scale;
} }
expand_conv_arg(&arg->conv_args[i]);
} }
arg->output.address = out_ptr; arg->output.address = out_ptr;
arg->output.scale_address = out->scale; arg->output.scale_address = out->scale;
// fpga_free(filter_ptr); filter->reset_data_ptr(nullptr);
} // fill_deconv_arg } // fill_deconv_arg
} // namespace fpga } // namespace fpga
......
...@@ -21,7 +21,6 @@ limitations under the License. */ ...@@ -21,7 +21,6 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
int get_align_image_cw(int cw);
void format_image(framework::Tensor* image_tensor); void format_image(framework::Tensor* image_tensor);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp32_ofm(framework::Tensor* ofm_tensor); void format_fp32_ofm(framework::Tensor* ofm_tensor);
...@@ -30,7 +29,6 @@ float filter_find_max(framework::Tensor* filter_tensor); ...@@ -30,7 +29,6 @@ float filter_find_max(framework::Tensor* filter_tensor);
int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num); int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num);
int get_plit_num(framework::Tensor* filter_tensor); int get_plit_num(framework::Tensor* filter_tensor);
int get_aligned_filter_element_num(int chw); int get_aligned_filter_element_num(int chw);
int get_aligned_filter_num(int num);
void format_filter(framework::Tensor* filter_tensor, float max_value, void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num); int group_num);
void format_fc_filter(framework::Tensor* filter_tensor, float max_value); void format_fc_filter(framework::Tensor* filter_tensor, float max_value);
......
...@@ -40,10 +40,9 @@ inverse kernel weights of each channel for every filter ...@@ -40,10 +40,9 @@ inverse kernel weights of each channel for every filter
void deconv_inverse_filter(float** data_in, int num, int channel, int width, void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height) { int height) {
float* tmp = *data_in; float* tmp = *data_in;
// float fix_range = 127;// float scale = fix_range / max;
int data_size = num * channel * width * height; int data_size = num * channel * width * height;
int hw_len = height * width; int hw_len = height * width;
float* tmp_data = (float*)fpga_malloc(data_size * sizeof(float)); auto tmp_data = (float*)fpga_malloc(data_size * sizeof(float));
for (int i = 0; i < num; ++i) { for (int i = 0; i < num; ++i) {
for (int j = 0; j < channel; ++j) { for (int j = 0; j < channel; ++j) {
for (int k = 0; k < hw_len; ++k) { for (int k = 0; k < hw_len; ++k) {
...@@ -52,7 +51,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width, ...@@ -52,7 +51,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
} }
} }
} }
*data_in = (float*)tmp_data; // *data_in = tmp_data;
fpga_free(tmp); fpga_free(tmp);
} }
...@@ -61,8 +60,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width, ...@@ -61,8 +60,7 @@ void deconv_inverse_filter(float** data_in, int num, int channel, int width,
*/ */
int deconv_calc_sub_pad(int filter_axis, int pad, int stride) { int deconv_calc_sub_pad(int filter_axis, int pad, int stride) {
if (stride == 0 || ((filter_axis - pad - 1) < 0)) { if (stride == 0 || ((filter_axis - pad - 1) < 0)) {
// error PADDLE_MOBILE_ENFORCE(false, "Wrong deconv parameters");
return 0;
} }
return (filter_axis - pad - 1) / stride; return (filter_axis - pad - 1) / stride;
} }
...@@ -79,11 +77,8 @@ int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) { ...@@ -79,11 +77,8 @@ int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis) {
position. so the omit rows or columns is (stride - ) position. so the omit rows or columns is (stride - )
*/ */
int deconv_get_omit(int stride, int filter_width, int pad) { int deconv_get_omit(int stride, int filter_width, int pad) {
if (((filter_width - pad) <= 0)) { // ((filter_width-pad) > stride) || PADDLE_MOBILE_ENFORCE(filter_width > pad, "Wrong deconv parameters");
// error int idx;
return 0;
}
int idx = 1;
bool flag = false; bool flag = false;
for (idx = 1; idx <= stride; ++idx) { for (idx = 1; idx <= stride; ++idx) {
int j = idx; int j = idx;
...@@ -102,10 +97,6 @@ int deconv_get_omit(int stride, int filter_width, int pad) { ...@@ -102,10 +97,6 @@ int deconv_get_omit(int stride, int filter_width, int pad) {
return (stride - idx); return (stride - idx);
} }
// Returns the product of `filter_num` and `stride`.
int deconv_get_sub_filter_num(int filter_num, int stride) {
  const int sub_filter_num = stride * filter_num;
  return sub_filter_num;
}
void deconv_get_sub_filter(char** data_in, int height, int width, void deconv_get_sub_filter(char** data_in, int height, int width,
int sub_conv_n, int kernel_num, int channel) { int sub_conv_n, int kernel_num, int channel) {
char* ptr_tmp = *data_in; char* ptr_tmp = *data_in;
...@@ -245,7 +236,6 @@ void deconv_format_filter(float** data_in, int num, int channel, int height, ...@@ -245,7 +236,6 @@ void deconv_format_filter(float** data_in, int num, int channel, int height,
char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset * char* ptr_space = (char*)fpga_malloc(sub_conv_n * align_offset *
sizeof(char)); // continuous space sizeof(char)); // continuous space
for (int i = 0; i < sub_conv_n; ++i) { for (int i = 0; i < sub_conv_n; ++i) {
int offset = i * origin_offset;
char* ptr_tmp = (ptr_ptr_data)[i]; char* ptr_tmp = (ptr_ptr_data)[i];
filter::align_element(&ptr_tmp, sub_num, sub_chw); filter::align_element(&ptr_tmp, sub_num, sub_chw);
......
...@@ -21,7 +21,6 @@ namespace deconv_filter { ...@@ -21,7 +21,6 @@ namespace deconv_filter {
void deconv_inverse_filter(float** data_in, int num, int channel, int width, void deconv_inverse_filter(float** data_in, int num, int channel, int width,
int height); int height);
int deconv_calc_sub_pad(int filter_axis, int pad, int stride); int deconv_calc_sub_pad(int filter_axis, int pad, int stride);
int deconv_get_sub_filter_num(int filter_num, int stride);
int deconv_get_sub_filter_axis(int filter_axis, int stride); int deconv_get_sub_filter_axis(int filter_axis, int stride);
int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis); int deconv_get_sub_out_axis(int image_axis, int sub_pad, int sub_filter_axis);
int deconv_get_omit(int stride, int filter_width, int pad); int deconv_get_omit(int stride, int filter_width, int pad);
......
此差异已折叠。
...@@ -153,10 +153,6 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) { ...@@ -153,10 +153,6 @@ int memory_request(struct fpga_memory *memory, size_t size, uint64_t *addr) {
uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE); uint64_t _nr = DIV_ROUND_UP(size, FPGA_PAGE_SIZE);
unsigned int nr = (unsigned int)_nr; unsigned int nr = (unsigned int)_nr;
int ret = 0; int ret = 0;
DLOG << size;
DLOG << _nr;
DLOG << nr;
uint64_t a_size = FPGA_PAGE_SIZE * nr; uint64_t a_size = FPGA_PAGE_SIZE * nr;
DLOG << a_size; DLOG << a_size;
...@@ -283,7 +279,7 @@ int fpga_memory_add() { ...@@ -283,7 +279,7 @@ int fpga_memory_add() {
return 0; return 0;
} }
uint64_t vaddr_to_paddr(void *address) { uint64_t vaddr_to_paddr_driver(void *address) {
uint64_t paddr = 0; uint64_t paddr = 0;
auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address); auto iter = g_fpgainfo.fpga_vaddr2paddr_map.find(address);
if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) { if (iter != g_fpgainfo.fpga_vaddr2paddr_map.end()) {
...@@ -315,7 +311,7 @@ void *fpga_reg_free(void *ptr) { ...@@ -315,7 +311,7 @@ void *fpga_reg_free(void *ptr) {
g_fpgainfo.fpga_addr2size_map.erase(iter); g_fpgainfo.fpga_addr2size_map.erase(iter);
munmap(ptr, size); munmap(ptr, size);
} else { } else {
DLOG << "Invalid pointer"; DLOG << "Invalid pointer" << ptr;
} }
} }
...@@ -347,7 +343,7 @@ void fpga_free_driver(void *ptr) { ...@@ -347,7 +343,7 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo.fpga_addr2size_map.erase(iter); g_fpgainfo.fpga_addr2size_map.erase(iter);
munmap(ptr, size); munmap(ptr, size);
p_addr = vaddr_to_paddr(ptr); p_addr = vaddr_to_paddr_driver(ptr);
pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE; pos = (p_addr - g_fpgainfo.memory_info->mem_start) / FPGA_PAGE_SIZE;
/*clear bitmap*/ /*clear bitmap*/
...@@ -361,7 +357,7 @@ void fpga_free_driver(void *ptr) { ...@@ -361,7 +357,7 @@ void fpga_free_driver(void *ptr) {
g_fpgainfo.fpga_vaddr2paddr_map.erase(iter); g_fpgainfo.fpga_vaddr2paddr_map.erase(iter);
} }
} else { } else {
DLOG << "Invalid pointer"; DLOG << "Invalid pointer" << ptr;
} }
} }
...@@ -373,7 +369,7 @@ int fpga_flush_driver(void *address, size_t size) { ...@@ -373,7 +369,7 @@ int fpga_flush_driver(void *address, size_t size) {
struct MemoryCacheArgs args; struct MemoryCacheArgs args;
uint64_t p_addr; uint64_t p_addr;
p_addr = vaddr_to_paddr(address); p_addr = vaddr_to_paddr_driver(address);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT
args.size = size; args.size = size;
...@@ -385,7 +381,7 @@ int fpga_invalidate_driver(void *address, size_t size) { ...@@ -385,7 +381,7 @@ int fpga_invalidate_driver(void *address, size_t size) {
struct MemoryCacheArgs args; struct MemoryCacheArgs args;
uint64_t p_addr; uint64_t p_addr;
p_addr = vaddr_to_paddr(address); p_addr = vaddr_to_paddr_driver(address);
args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT args.offset = (void *)(p_addr - FPGA_MEM_PHY_ADDR); // NOLINT
args.size = size; args.size = size;
......
...@@ -31,8 +31,8 @@ namespace driver { ...@@ -31,8 +31,8 @@ namespace driver {
#define FPGA_REG_PHY_ADDR 0xa0000000 #define FPGA_REG_PHY_ADDR 0xa0000000
#define FPGA_REG_SIZE 0x1000 #define FPGA_REG_SIZE 0x1000
#define FPGA_MEM_PHY_ADDR 0x20000000 #define FPGA_MEM_PHY_ADDR 0x40000000
#define FPGA_MEM_SIZE 0x20000000 #define FPGA_MEM_SIZE 0x80000000
#define FPGA_PAGE_SIZE (16UL * 1024UL) #define FPGA_PAGE_SIZE (16UL * 1024UL)
...@@ -122,15 +122,11 @@ void *fpga_malloc_driver(size_t size); ...@@ -122,15 +122,11 @@ void *fpga_malloc_driver(size_t size);
void fpga_free_driver(void *ptr); void fpga_free_driver(void *ptr);
void fpga_copy_driver(void *dest, const void *src, size_t num);
int fpga_flush_driver(void *address, size_t size); int fpga_flush_driver(void *address, size_t size);
int fpga_invalidate_driver(void *address, size_t size); int fpga_invalidate_driver(void *address, size_t size);
/*pe*/ uint64_t vaddr_to_paddr_driver(void *address);
uint64_t vaddr_to_paddr(void *address);
int fpga_regpoll(uint64_t reg, uint64_t val, int time); int fpga_regpoll(uint64_t reg, uint64_t val, int time);
......
...@@ -37,6 +37,18 @@ enum LayoutType { ...@@ -37,6 +37,18 @@ enum LayoutType {
LAYOUT_HWC = 0, LAYOUT_HWC = 0,
}; };
// Identifies which activation function is selected; used as the
// `activation_type` field of ActivationArgs.
// NOTE(review): the numeric values are written out explicitly, presumably
// because they are consumed outside this code as raw integers -- confirm
// before renumbering.
enum ActivationType {
NONE = 0,       // no activation selected
LEAKYRELU = 1,  // leaky ReLU; slope comes from ActivationArgs
SIGMOID = 2,
TANH = 3,
};
// Activation settings: which activation to apply plus the parameter used by
// the leaky-ReLU variant.
struct ActivationArgs {
// Selected activation function.
enum ActivationType activation_type;
// Negative-side slope for LEAKYRELU.
// NOTE(review): stored as int16_t -- presumably an fp16 bit pattern such as
// fp32_2_fp16() returns; confirm the expected encoding with the caller.
int16_t leaky_relu_negative_slope;
};
struct KernelArgs { struct KernelArgs {
uint32_t width; uint32_t width;
uint32_t height; uint32_t height;
...@@ -58,7 +70,10 @@ struct ImageOutputArgs { ...@@ -58,7 +70,10 @@ struct ImageOutputArgs {
void* address; // output result address; void* address; // output result address;
float* scale_address; // output scale address; float* scale_address; // output scale address;
uint64_t timer_cnt; // time counter for FPGA computation uint64_t timer_cnt; // time counter for FPGA computation
struct ActivationArgs
activation; // To select activation and specify (Leaky)Relu parameter.
}; };
#ifdef PADDLE_MOBILE_FPGA_V1 #ifdef PADDLE_MOBILE_FPGA_V1
struct ConvDriverParam { struct ConvDriverParam {
uint64_t image_address_phy; uint64_t image_address_phy;
...@@ -198,7 +213,11 @@ struct DeconvArgs { ...@@ -198,7 +213,11 @@ struct DeconvArgs {
struct ConvArgs* conv_args; struct ConvArgs* conv_args;
}; };
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; } // static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// }
// Rounds `num` up to the nearest multiple of `x` and returns it as uint32_t.
// The sum `num + x` is formed in 64 bits and then truncated to 32 bits; the
// remaining arithmetic is unsigned 32-bit. `x` must be non-zero because the
// result is obtained by dividing by it.
static inline uint32_t align_to_x(int64_t num, int64_t x) {
  const uint32_t step = static_cast<uint32_t>(x);
  const uint32_t biased = static_cast<uint32_t>(num + x) - 1;
  const uint32_t whole_steps = biased / step;
  return whole_steps * step;
}
int16_t fp32_2_fp16(float fp32_num); int16_t fp32_2_fp16(float fp32_num);
float fp16_2_fp32(int16_t fp16_num); float fp16_2_fp32(int16_t fp16_num);
......
...@@ -14,7 +14,6 @@ limitations under the License. */ ...@@ -14,7 +14,6 @@ limitations under the License. */
#ifdef TRANSPOSE2_OP #ifdef TRANSPOSE2_OP
#include "operators/kernel/transpose2_kernel.h" #include "operators/kernel/transpose2_kernel.h"
#include "operators/kernel/central-arm-func/transpose2_arm_func.h"
namespace paddle_mobile { namespace paddle_mobile {
namespace operators { namespace operators {
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册