提交 5f1058ea 编写于 作者: C chenhoujiang

Merge branch 'develop' of https://github.com/PaddlePaddle/paddle-mobile into dev-latest

...@@ -21,6 +21,9 @@ limitations under the License. */ ...@@ -21,6 +21,9 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
// Command-word flags. expand_conv_arg and expand_EW_arg OR these into the
// `cmd` value they store in the driver arguments.
#define USE_RELU 1  // set when the args' relu_enabled field is true
#define USE_BIAS 2  // always set by expand_conv_arg
int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); } int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); }
void format_image(framework::Tensor *image_tensor) { void format_image(framework::Tensor *image_tensor) {
...@@ -172,6 +175,170 @@ void format_concat_output(framework::Tensor *out, int height, int width, ...@@ -172,6 +175,170 @@ void format_concat_output(framework::Tensor *out, int height, int width,
out->reset_data_ptr(data_ptr); out->reset_data_ptr(data_ptr);
} }
// Derives the low-level convolution parameters (output geometry, aligned row
// lengths, the width-wise block partition, FIFO thresholds, addresses returned
// by vaddr_to_paddr and the command word) from the convolution description in
// *arg and stores them in arg->driver.
//
// @param arg  Convolution arguments. Every field outside `driver` is only
//             read; `driver` is overwritten.
//
// NOTE(review): the constants 2048, 512, 16, 8 and 4 below are taken verbatim
// from the original code; they presumably mirror hardware buffer depths and
// bus widths -- confirm against the FPGA specification.
void expand_conv_arg(ConvArgs *arg) {
  // Read through a reference instead of copying the whole struct; none of the
  // fields read below live inside arg->driver.
  const ConvArgs &args = *arg;

  uint64_t fpga_bias_scale_len =
      align_to_x(args.filter_num / args.group_num, 8) * args.group_num;

  // Standard convolution output size: (in + 2 * pad - kernel) / stride + 1.
  uint64_t output_height =
      (args.image.height + args.image.pad_height * 2 - args.kernel.height) /
          args.kernel.stride_h +
      1;
  uint64_t output_width =
      (args.image.width + args.image.pad_width * 2 - args.kernel.width) /
          args.kernel.stride_w +
      1;

  auto filter_per_group = (uint64_t)(args.filter_num / args.group_num);
  auto channel_per_group = (uint64_t)(args.image.channels / args.group_num);

  uint64_t image_row_count = ((uint64_t)args.image.width) *
                             ((uint64_t)args.image.channels);  // without align
  uint64_t image_amount_per_row = align_to_x(image_row_count, IMAGE_ALIGNMENT);
  uint64_t image_one_pad_per_row =
      image_amount_per_row +
      ((uint64_t)args.image.pad_width) * ((uint64_t)args.image.channels);
  uint64_t filter_amount_all =
      align_to_x(((uint64_t)args.kernel.height) *
                     ((uint64_t)args.kernel.width) * channel_per_group,
                 FILTER_ELEMENT_ALIGNMENT);
  uint64_t output_amount_per_row =
      align_to_x(output_width * ((uint64_t)args.filter_num), IMAGE_ALIGNMENT);

  // Find the partition strategy: the largest number of output columns per
  // block whose input footprint still satisfies the 2048 limit. The last
  // value that passed the check is tracked explicitly, so a failure exactly at
  // win == output_width can no longer select the over-limit window.
  uint64_t res_win = 0;
  for (uint64_t win = 1; win <= output_width; ++win) {
    if ((align_to_x(
             (args.image.channels *
              (args.kernel.width + (win - 1) * args.kernel.stride_w)),
             IMAGE_ALIGNMENT) /
             16 +
         1) *
            args.kernel.height >
        2048) {
      break;
    }
    res_win = win;
  }
  // Even a single column may not fit (or output_width may be 0). Fall back to
  // one column per block so the divisions below never divide by zero.
  if (res_win == 0) {
    res_win = 1;
  }
  // Use an even window count unless only a single column fits.
  if (((res_win % 2) != 0) && (res_win != 1)) {
    res_win = res_win - 1;
  }
  uint64_t res_fit = res_win;

  // Split the output row into block_num blocks of block_len columns; the last
  // block takes whatever remains.
  uint64_t block_num = (output_width + res_fit - 1) / res_fit;
  uint64_t block_len = res_fit;
  uint64_t block_last = output_width - res_fit * (block_num - 1);

  uint64_t res_amount_per_row = output_width * args.filter_num;
  uint64_t res_amount_per_row_pad = output_amount_per_row - res_amount_per_row;

  uint64_t image_block_amount_per_row =
      args.kernel.stride_w * (res_fit)*args.image.channels;
  uint64_t filter_pad_width_mul_channel =
      args.image.pad_width * args.image.channels;
  uint64_t image_amount_per_row_multi_win_first =
      image_amount_per_row * (4 * args.kernel.stride_h - args.image.pad_height);
  uint64_t image_amount_per_row_multi_win =
      image_amount_per_row * (4 * args.kernel.stride_h);

  uint64_t image_block_num = block_num;
  uint64_t image_block_len =
      align_to_x((args.image.channels *
                  (args.kernel.width + (block_len - 1) * args.kernel.stride_w)),
                 IMAGE_ALIGNMENT) /
          16 +
      1;
  uint64_t image_block_len_last =
      align_to_x(
          (args.image.channels *
           (args.kernel.width + (block_last - 1) * args.kernel.stride_w)),
          IMAGE_ALIGNMENT) /
          16 +
      1;
  uint64_t image_win_cnt = block_len;
  uint64_t image_win_cnt_last = block_last;
  uint64_t res_row_data_align4_pad = res_amount_per_row_pad / 8;

  uint64_t prog_full_cnt = 2048 / (filter_amount_all / 16 * 2) - 1;
  if (prog_full_cnt == 1023) {
    prog_full_cnt--;
  }
  // Clamp to 0 instead of letting the unsigned subtraction wrap around.
  uint64_t post_prog_full_cnt =
      (512 / (align_to_x(args.filter_num, 4) / 4 * 2) > 2)
          ? (512 / (align_to_x(args.filter_num, 4) / 4 * 2) - 2)
          : 0;

  // Bias is always enabled; ReLU only when requested.
  uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;

  auto &driver = arg->driver;
  driver.image_address_phy = vaddr_to_paddr(args.image.address);
  driver.sb_address_phy = vaddr_to_paddr(args.sb_address);
  driver.filter_address_phy = vaddr_to_paddr(args.filter_address);
  driver.output_address_phy = vaddr_to_paddr(args.output.address);
  driver.output_height = output_height;
  driver.output_width = output_width;
  driver.filter_per_group = filter_per_group;
  driver.channel_per_group = channel_per_group;
  driver.image_amount_per_row = image_amount_per_row;
  driver.image_one_pad_per_row = image_one_pad_per_row;
  driver.filter_amount_all = filter_amount_all;
  driver.output_amount_per_row = output_amount_per_row;
  driver.image_block_amount_per_row = image_block_amount_per_row;
  driver.filter_pad_width_mul_channel = filter_pad_width_mul_channel;
  driver.image_amount_per_row_multi_win_first =
      image_amount_per_row_multi_win_first;
  driver.image_amount_per_row_multi_win = image_amount_per_row_multi_win;
  driver.image_block_num = image_block_num;
  driver.image_block_len = image_block_len;
  driver.image_block_len_last = image_block_len_last;
  driver.image_win_cnt = image_win_cnt;
  driver.image_win_cnt_last = image_win_cnt_last;
  driver.res_row_data_align4_pad = res_row_data_align4_pad;
  driver.prog_full_cnt = prog_full_cnt;
  driver.post_prog_full_cnt = post_prog_full_cnt;
  driver.fpga_bias_scale_len = fpga_bias_scale_len;
  driver.cmd = cmd;
}  // expand_conv_arg()
// Derives the low-level element-wise-add parameters from *arg and stores them
// in arg->driver: the addresses returned by vaddr_to_paddr for both inputs and
// the output, the element count of image0, the two constants packed into one
// 64-bit coefficient, the packed image0 geometry, the aligned row length and
// the command word.
//
// @param arg  Element-wise add arguments. Every field outside `driver` is only
//             read; `driver` is overwritten.
void expand_EW_arg(EWAddArgs *arg) {
  const EWAddArgs &src = *arg;

  const uint64_t width = (uint64_t)src.image0.width;
  const uint64_t height = (uint64_t)src.image0.height;
  const uint64_t channels = (uint64_t)src.image0.channels;

  // Only the ReLU flag is relevant for this operation.
  const uint64_t command = src.relu_enabled ? USE_RELU : 0;
  const uint64_t element_count = width * height * channels;

  // const0 goes in the upper 32 bits, const1 in the lower bits.
  const uint64_t packed_consts =
      ((uint64_t)src.const0 << 32) | (uint64_t)src.const1;

  const uint64_t in0_phy = vaddr_to_paddr(src.image0.address);
  const uint64_t in1_phy = vaddr_to_paddr(src.image1.address);
  const uint64_t out_phy = vaddr_to_paddr(src.output.address);

  const uint64_t row_amount = align_to_x(width * channels, IMAGE_ALIGNMENT);

  // Layout: channels from bit 32 up, width from bit 16 up, height from bit 0.
  const uint64_t packed_geometry = (channels << 32) | (width << 16) | height;

  auto &driver = arg->driver;
  driver.image0_address_phy = in0_phy;
  driver.image1_address_phy = in1_phy;
  driver.datalen = element_count;
  driver.image_image_pixel = packed_geometry;
  driver.image_amount_per_row = row_amount;
  driver.output_address_phy = out_phy;
  driver.coefficient = packed_consts;
  driver.cmd = command;
}  // expand_EW_arg
void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter, framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h, bool relu_enabled, int group_num, int stride_h,
...@@ -206,7 +373,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -206,7 +373,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
auto channel = (int)out->dims()[1]; // NOLINT auto channel = (int)out->dims()[1]; // NOLINT
int filter_num_per_div = get_filter_num_per_div(filter, group_num); int filter_num_per_div = get_filter_num_per_div(filter, group_num);
int element_num = get_aligned_filter_element_num( int element_num = get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]); (int)(filter->dims()[1] * filter->dims()[2] * filter->dims()[3]));
for (int i = 0; i < n; i++) { for (int i = 0; i < n; i++) {
arg->conv_arg[i].relu_enabled = relu_enabled; arg->conv_arg[i].relu_enabled = relu_enabled;
...@@ -223,24 +390,23 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -223,24 +390,23 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
arg->conv_arg[i].image.pad_height = (uint32_t)padding_h; arg->conv_arg[i].image.pad_height = (uint32_t)padding_h;
arg->conv_arg[i].image.pad_width = (uint32_t)padding_w; arg->conv_arg[i].image.pad_width = (uint32_t)padding_w;
arg->conv_arg[i].filter_scale_address = filter->scale; arg->conv_arg[i].filter_scale_address = filter->scale;
// arg->conv_arg[i].filter_address = &(
// (int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; //
// NOLINT
// arg->conv_arg[i].sb_address = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].filter_num = (uint32_t)( arg->conv_arg[i].filter_num = (uint32_t)(
i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT i == n - 1 ? channel - (n - 1) * filter_num_per_div // NOLINT
: filter_num_per_div); : filter_num_per_div);
size_t filter_size = size_t filter_size =
element_num * arg->conv_arg[i].filter_num * sizeof(int8_t); element_num *
align_to_x(arg->conv_arg[i].filter_num, FILTER_NUM_ALIGNMENT) *
sizeof(int8_t);
auto filter_head = auto filter_head =
&((int8_t *)filter_ptr)[i * element_num * filter_num_per_div]; &((int8_t *)filter_ptr)[i * element_num * filter_num_per_div];
arg->conv_arg[i].filter_address = fpga_malloc(filter_size); arg->conv_arg[i].filter_address = fpga_malloc(filter_size);
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size); memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size); fpga_flush(arg->conv_arg[i].filter_address, filter_size);
size_t bs_size = 2 * arg->conv_arg[i].filter_num * sizeof(float); size_t bs_size = 2 *
align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
sizeof(float);
auto bs_head = &bs_ptr[i * filter_num_per_div * 2]; auto bs_head = &bs_ptr[i * filter_num_per_div * 2];
arg->conv_arg[i].sb_address = fpga_malloc(bs_size); arg->conv_arg[i].sb_address = fpga_malloc(bs_size);
memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size); memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
...@@ -249,9 +415,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -249,9 +415,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
if (n > 1) { if (n > 1) {
arg->conv_arg[i].output.scale_address = arg->conv_arg[i].output.scale_address =
(float *)fpga_malloc(2 * sizeof(float)); // NOLINT (float *)fpga_malloc(2 * sizeof(float)); // NOLINT
arg->conv_arg[i].output.address = arg->conv_arg[i].output.address = fpga_malloc(
fpga_malloc(out->dims()[2] * out->dims()[2] *
align_to_x(out->dims()[3] * arg->conv_arg[i].filter_num, align_to_x((int)(out->dims()[3] * arg->conv_arg[i].filter_num),
IMAGE_ALIGNMENT) * IMAGE_ALIGNMENT) *
sizeof(half)); sizeof(half));
} else { } else {
...@@ -263,10 +429,13 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input, ...@@ -263,10 +429,13 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
(half *)arg->conv_arg[i].output.address; // NOLINT (half *)arg->conv_arg[i].output.address; // NOLINT
arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address; arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num; arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
expand_conv_arg(&arg->conv_arg[i]);
} }
filter->reset_data_ptr(nullptr); filter->reset_data_ptr(nullptr);
fpga_free(bs_ptr); fpga_free(bs_ptr);
} } // fill_split_arg
void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework::Tensor *out, framework::Tensor *filter, framework::Tensor *out, framework::Tensor *filter,
bool relu_enabled, int group_num, int stride_h, bool relu_enabled, int group_num, int stride_h,
...@@ -277,28 +446,27 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -277,28 +446,27 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
auto out_ptr = out->data<float>(); auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num; arg->group_num = (uint32_t)group_num;
arg->sub_conv_num = stride_h; arg->sub_conv_num = (uint32_t)stride_h;
arg->filter_num = (uint32_t)filter->dims()[0]; arg->filter_num = (uint32_t)filter->dims()[0];
int sub_conv_num = arg->sub_conv_num; int sub_conv_num = arg->sub_conv_num;
int sub_stride = 1; int sub_stride = 1;
int sub_pad = deconv_filter::deconv_calc_sub_pad(filter->dims()[3], padding_w, int sub_pad = deconv_filter::deconv_calc_sub_pad((int)filter->dims()[3],
stride_w); padding_w, stride_w);
int sub_filter_width = int sub_filter_width = deconv_filter::deconv_get_sub_filter_axis(
deconv_filter::deconv_get_sub_filter_axis(filter->dims()[3], stride_w); (int)filter->dims()[3], stride_w);
int sub_output_width = deconv_filter::deconv_get_sub_out_axis( int sub_output_width = deconv_filter::deconv_get_sub_out_axis(
input->dims()[3], sub_pad, sub_filter_width); (int)input->dims()[3], sub_pad, sub_filter_width);
int sub_output_height = deconv_filter::deconv_get_sub_out_axis( int sub_output_height = deconv_filter::deconv_get_sub_out_axis(
input->dims()[2], sub_pad, sub_filter_width); (int)input->dims()[2], sub_pad, sub_filter_width);
arg->sub_output_width = sub_output_width; arg->sub_output_width = (uint32_t)sub_output_width;
arg->sub_output_height = sub_output_height; arg->sub_output_height = (uint32_t)sub_output_height;
arg->omit_size = arg->omit_size = (uint32_t)deconv_filter::deconv_get_omit(
deconv_filter::deconv_get_omit(stride_w, filter->dims()[3], padding_w); stride_w, (int)filter->dims()[3], padding_w);
arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs)); arg->conv_args = (ConvArgs *)fpga_malloc(sub_conv_num * sizeof(ConvArgs));
int sub_channels = (int32_t)input->dims()[1]; int sub_channels = (int)input->dims()[1];
int omit_size = arg->omit_size; int omit_size = arg->omit_size;
int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size; int real_out_width = sub_output_width * sub_conv_num - 2 * omit_size;
int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size; int real_out_height = sub_output_height * sub_conv_num - 2 * omit_size;
...@@ -318,42 +486,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -318,42 +486,41 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
for (int i = 0; i < sub_conv_num; ++i) { for (int i = 0; i < sub_conv_num; ++i) {
arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num); arg->conv_args[i].filter_num = (arg->sub_conv_num) * (arg->filter_num);
arg->conv_args[i].group_num = group_num; arg->conv_args[i].group_num = (uint32_t)group_num;
arg->conv_args[i].filter_scale_address = filter->scale; arg->conv_args[i].filter_scale_address = filter->scale;
arg->conv_args[i].relu_enabled = relu_enabled; arg->conv_args[i].relu_enabled = relu_enabled;
arg->conv_args[i].kernel.width = sub_filter_width; arg->conv_args[i].kernel.width = (uint32_t)sub_filter_width;
arg->conv_args[i].kernel.height = sub_filter_width; arg->conv_args[i].kernel.height = (uint32_t)sub_filter_width;
arg->conv_args[i].kernel.stride_w = 1; arg->conv_args[i].kernel.stride_w = 1;
arg->conv_args[i].kernel.stride_h = 1; arg->conv_args[i].kernel.stride_h = 1;
// DeconvParam.conv_args[i].image.address = (void*)ptr_image; // DeconvParam.conv_args[i].image.address = (void*)ptr_image;
arg->conv_args[i].image.scale_address = input->scale; arg->conv_args[i].image.scale_address = input->scale;
arg->conv_args[i].image.channels = sub_channels; arg->conv_args[i].image.channels = (uint32_t)sub_channels;
arg->conv_args[i].image.width = (uint32_t)input->dims()[3]; arg->conv_args[i].image.width = (uint32_t)input->dims()[3];
arg->conv_args[i].image.height = (uint32_t)input->dims()[2]; arg->conv_args[i].image.height = (uint32_t)input->dims()[2];
arg->conv_args[i].image.pad_width = sub_pad; arg->conv_args[i].image.pad_width = (uint32_t)sub_pad;
arg->conv_args[i].image.pad_height = sub_pad; arg->conv_args[i].image.pad_height = (uint32_t)sub_pad;
arg->conv_args[i].image.address = input_ptr; arg->conv_args[i].image.address = input_ptr;
arg->conv_args[i].sb_address = (void *)bs_ptr; arg->conv_args[i].sb_address = (void *)bs_ptr;
char *filter_sub_space = auto filter_sub_space =
(char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char)); (char *)fpga_malloc(align_conv_sub_filter_count * sizeof(char));
fpga_copy(filter_sub_space, fpga_copy(filter_sub_space,
(char *)filter_ptr + i * align_conv_sub_filter_count, (char *)filter_ptr + i * align_conv_sub_filter_count,
align_conv_sub_filter_count); (size_t)align_conv_sub_filter_count);
arg->conv_args[i].filter_address = (void *)(filter_sub_space); arg->conv_args[i].filter_address = (void *)(filter_sub_space);
fpga_flush(filter_sub_space, align_conv_sub_filter_count); fpga_flush(filter_sub_space, (size_t)align_conv_sub_filter_count);
if (sub_conv_num == 1) { if (sub_conv_num == 1) {
arg->conv_args[i].output.address = out_ptr; arg->conv_args[i].output.address = out_ptr;
arg->conv_args[i].output.scale_address = out->scale; arg->conv_args[i].output.scale_address = out->scale;
} else { } else {
half *ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half)); auto ptr_output = (half *)fpga_malloc(conv_output_size * sizeof(half));
arg->conv_args[i].output.address = (void *)((half *)ptr_output); arg->conv_args[i].output.address = (void *)((half *)ptr_output);
float *ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float)); auto ptr_output_scale = (float *)fpga_malloc(2 * sizeof(float));
arg->conv_args[i].output.scale_address = ptr_output_scale; arg->conv_args[i].output.scale_address = ptr_output_scale;
} }
} }
...@@ -361,6 +528,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input, ...@@ -361,6 +528,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->output.address = out_ptr; arg->output.address = out_ptr;
arg->output.scale_address = out->scale; arg->output.scale_address = out->scale;
// fpga_free(filter_ptr); // fpga_free(filter_ptr);
} } // fill_deconv_arg
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -14,8 +14,6 @@ limitations under the License. */ ...@@ -14,8 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#define BS_NUM_ALIGNMENT 8
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace bias_scale { namespace bias_scale {
......
...@@ -14,8 +14,6 @@ limitations under the License. */ ...@@ -14,8 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#define BS_NUM_ALIGNMENT 8
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace deconv_bias_scale { namespace deconv_bias_scale {
......
...@@ -14,9 +14,6 @@ limitations under the License. */ ...@@ -14,9 +14,6 @@ limitations under the License. */
#pragma once #pragma once
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace filter { namespace filter {
......
...@@ -111,25 +111,37 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out, ...@@ -111,25 +111,37 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t)); fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
} }
void split_image(int16_t *image_in, float *scale_in, void **images_out, void split_image(int16_t *image_in, const float *scale_in, void **images_out,
float **scales_out, int image_num, uint32_t *channel_nums, float **scales_out, int image_num,
int height, int width) { const uint32_t *channel_nums, int height, int width) {
int total_channel = 0; int total_channel = 0;
for (int i = 0; i < image_num; i++) { for (int i = 0; i < image_num; i++) {
scales_out[i][0] = scale_in[0]; scales_out[i][0] = scale_in[0];
scales_out[i][1] = scale_in[1]; scales_out[i][1] = scale_in[1];
total_channel += channel_nums[i]; total_channel += channel_nums[i];
} }
int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT);
fpga_invalidate(image_in, element_num * sizeof(int16_t));
int src_offset = 0, des_offset = 0;
for (int h = 0; h < height; h++) { for (int h = 0; h < height; h++) {
int src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT); for (int w = 0; w < width; w++) {
src_offset = h * align_to_x(total_channel * width, IMAGE_ALIGNMENT) +
w * total_channel;
for (int i = 0; i < image_num; i++) { for (int i = 0; i < image_num; i++) {
int des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT); des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
w * channel_nums[i];
memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset, memcpy((int16_t *)images_out[i] + des_offset, image_in + src_offset,
channel_nums[i] * sizeof(int16_t)); channel_nums[i] * sizeof(int16_t));
src_offset += channel_nums[i]; src_offset += channel_nums[i];
} }
} }
}
for (int i = 0; i < image_num; i++) {
element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT);
fpga_flush(images_out[i], element_num * sizeof(int16_t));
}
} }
} // namespace image } // namespace image
......
...@@ -14,9 +14,8 @@ limitations under the License. */ ...@@ -14,9 +14,8 @@ limitations under the License. */
#pragma once #pragma once
#include <stdint.h> #include <cstdint>
#define IMAGE_ALIGNMENT 16 // Aligned to 16
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
namespace image { namespace image {
...@@ -24,13 +23,16 @@ namespace image { ...@@ -24,13 +23,16 @@ namespace image {
void convert_to_hwc(float** data_in, int channel, int height, int width); void convert_to_hwc(float** data_in, int channel, int height, int width);
void align_element_conv(float** data_in, int height, int cw); void align_element_conv(float** data_in, int height, int cw);
void format_image(float** data_in, int channel, int height, int width); void format_image(float** data_in, int channel, int height, int width);
// Concat featuremaps along channel direction
void concat_images(int16_t** images_in, float** scales_in, void* image_out, void concat_images(int16_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num, float* scale_out, int image_num, uint32_t* channel_num,
int height,
int width); // Concat featuremaps along channel direction
void split_image(int16_t* image_in, float* scale_in, void** images_out,
float** scales_out, int image_num, uint32_t* channel_nums,
int height, int width); int height, int width);
// Split featuremap along channel direction
void split_image(int16_t* image_in, const float* scale_in, void** images_out,
float** scales_out, int image_num,
const uint32_t* channel_nums, int height, int width);
} // namespace image } // namespace image
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -203,29 +203,11 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -203,29 +203,11 @@ int ComputeBasicConv(const struct ConvArgs &args) {
DLOG << " out_address:" << args.output.address DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address; << " out_scale_address:" << args.output.scale_address;
#endif #endif
cout << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address
<< " filter_address:" << args.filter_address
<< " filter_num:" << args.filter_num
<< " group_num:" << args.group_num;
cout << " image_address:" << args.image.address
<< " image_scale_address:" << args.image.scale_address
<< " image_channels:" << args.image.channels
<< " image_height:" << args.image.height
<< " image_width:" << args.image.width
<< " pad_height:" << args.image.pad_height
<< " pad_width:" << args.image.pad_width;
cout << " kernel_height:" << args.kernel.height
<< " kernel_width:" << args.kernel.width
<< " stride_h:" << args.kernel.stride_h
<< " stride_w:" << args.kernel.stride_w;
cout << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
DLOG << "Conv"; int ret = 0;
// return 0; uint64_t output_scale = 0;
uint64_t timer_cnt; /*
uint64_t output_scale; uint64_t output_scale;
uint64_t image_scale; uint64_t image_scale;
uint64_t filter_scale; uint64_t filter_scale;
...@@ -233,14 +215,10 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -233,14 +215,10 @@ int ComputeBasicConv(const struct ConvArgs &args) {
uint64_t sb_address_phy = 0; uint64_t sb_address_phy = 0;
uint64_t filter_address_phy = 0; uint64_t filter_address_phy = 0;
uint64_t output_address_phy = 0; uint64_t output_address_phy = 0;
int ret = 0;
fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float)); fpga_copy(&image_scale, args.image.scale_address, 2 * sizeof(float));
fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float)); fpga_copy(&filter_scale, args.filter_scale_address, 2 * sizeof(float));
cout << "image_scale :" << hex << (image_scale) << endl;
cout << "filter_scale :" << hex << (filter_scale) << endl;
uint64_t filterlen = (uint64_t)args.kernel.width * uint64_t filterlen = (uint64_t)args.kernel.width *
(uint64_t)args.kernel.height * (uint64_t)args.kernel.height *
(uint64_t)args.image.channels; (uint64_t)args.image.channels;
...@@ -349,8 +327,8 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -349,8 +327,8 @@ int ComputeBasicConv(const struct ConvArgs &args) {
filter_address_phy = vaddr_to_paddr(args.filter_address); filter_address_phy = vaddr_to_paddr(args.filter_address);
output_address_phy = vaddr_to_paddr(args.output.address); output_address_phy = vaddr_to_paddr(args.output.address);
/*SDK刷Cache保证数据一致性*/
uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS; uint64_t cmd = 0UL | (args.relu_enabled ? USE_RELU : 0) | USE_BIAS;
*/
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex); pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) { if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status) {
...@@ -359,78 +337,63 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -359,78 +337,63 @@ int ComputeBasicConv(const struct ConvArgs &args) {
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
} }
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(sb_address_phy, REG_CONV_SB_BASE_ADDR);
reg_writeq(output_address_phy, REG_CONV_RESULT_BASE_ADDR);
reg_writeq( reg_writeq(
((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32), ((uint64_t)args.image.height) | (((uint64_t)args.image.width) << 32),
REG_CONV_IMAGE_PIXEL); REG_CONV_IMAGE_PIXEL);
reg_writeq( reg_writeq(
((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32), ((uint64_t)args.kernel.height) | (((uint64_t)args.kernel.width) << 32),
REG_CONV_FILTER_PIXEL); REG_CONV_FILTER_PIXEL);
reg_writeq(output_height | (output_width << 32), REG_CONV_RESULT_PIXEL); reg_writeq(args.driver.output_height | (args.driver.output_width << 32),
REG_CONV_RESULT_PIXEL);
reg_writeq(((uint64_t)args.image.pad_height) | reg_writeq(((uint64_t)args.image.pad_height) |
(((uint64_t)args.image.pad_width) << 32), (((uint64_t)args.image.pad_width) << 32),
REG_CONV_PAD_PIXEL); REG_CONV_PAD_PIXEL);
reg_writeq(((uint64_t)args.kernel.stride_h) | reg_writeq(((uint64_t)args.kernel.stride_h) |
(((uint64_t)args.kernel.stride_w) << 32), (((uint64_t)args.kernel.stride_w) << 32),
REG_CONV_STEP_PIXEL); REG_CONV_STEP_PIXEL);
reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER); reg_writeq((uint64_t)args.group_num, REG_CONV_GROUP_NUMBER);
reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER); reg_writeq((uint64_t)args.filter_num, REG_CONV_FILTER_NUMBER);
reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER); reg_writeq((uint64_t)args.image.channels, REG_CONV_CHANNEL_NUMBER);
reg_writeq(*(uint64_t *)args.image.scale_address, REG_CONV_IMAGE_SCALE);
reg_writeq(*(uint64_t *)args.filter_scale_address, REG_CONV_FILTER_SCALE);
reg_writeq(args.driver.image_address_phy, REG_CONV_IMAGE_BASE_ADDR);
reg_writeq(args.driver.filter_address_phy, REG_CONV_FILTER_BASE_ADDR);
reg_writeq(args.driver.sb_address_phy, REG_CONV_SB_BASE_ADDR);
reg_writeq(args.driver.output_address_phy, REG_CONV_RESULT_BASE_ADDR);
reg_writeq(args.driver.filter_per_group, REG_CONV_FILTER_PER_GROUP);
reg_writeq(args.driver.channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
reg_writeq(args.driver.image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
reg_writeq(args.driver.image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(args.driver.filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
reg_writeq(args.driver.output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
reg_writeq(args.driver.image_block_amount_per_row, 0xca8);
reg_writeq(args.driver.filter_pad_width_mul_channel, 0xcb0);
reg_writeq(args.driver.image_amount_per_row_multi_win_first, 0xcb8);
reg_writeq(args.driver.image_amount_per_row_multi_win, 0xcc0);
reg_writeq(args.driver.image_block_num, 0xcc8);
reg_writeq(args.driver.image_block_len, 0xcd0);
reg_writeq(args.driver.image_block_len_last, 0xcd8);
reg_writeq(args.driver.image_win_cnt, 0xce0);
reg_writeq(args.driver.image_win_cnt_last, 0xce8);
reg_writeq(args.driver.res_row_data_align4_pad, 0xcf8);
reg_writeq(args.driver.prog_full_cnt, 0xd08);
reg_writeq(args.driver.post_prog_full_cnt, 0xd10);
reg_writeq(args.driver.fpga_bias_scale_len / 4, 0xd20);
reg_writeq(args.driver.cmd, REG_CONV_CMD);
reg_writeq(filter_per_group, REG_CONV_FILTER_PER_GROUP);
reg_writeq(channel_per_group, REG_CONV_CHANNEL_PER_GROUP);
reg_writeq(image_amount_per_row, REG_CONV_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_one_pad_per_row, REG_CONV_IMAGE_ONE_PAD_PER_ROW);
reg_writeq(filter_amount_all, REG_CONV_FILTER_AMOUNT_ALL);
reg_writeq(output_amount_per_row, REG_CONV_RESULT_AMOUNT_PER_ROW);
reg_writeq(image_block_amount_per_row, 0xca8);
reg_writeq(filter_pad_width_mul_channel, 0xcb0);
reg_writeq(image_amount_per_row_multi_win_first, 0xcb8);
reg_writeq(image_amount_per_row_multi_win, 0xcc0);
reg_writeq(image_block_num, 0xcc8);
reg_writeq(image_block_len, 0xcd0);
reg_writeq(image_block_len_last, 0xcd8);
reg_writeq(image_win_cnt, 0xce0);
reg_writeq(image_win_cnt_last, 0xce8);
reg_writeq(res_row_data_align4_pad, 0xcf8);
reg_writeq(prog_full_cnt, 0xd08);
reg_writeq(post_prog_full_cnt, 0xd10);
reg_writeq(fpga_bias_scale_len / 4, 0xd20);
/*write scale*/
reg_writeq(image_scale, REG_CONV_IMAGE_SCALE);
reg_writeq(filter_scale, REG_CONV_FILTER_SCALE);
reg_writeq(cmd, REG_CONV_CMD);
DLOG << "before reg poll";
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) { if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_CONV, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR; g_fpgainfo.pe_data->pes[PE_IDX_CONV]->status = ERROR;
ret = -EIO; ret = -EIO;
DLOG << "Conv Wait Irq Timeout!"; DLOG << "Conv Wait Irq Timeout!";
} }
DLOG << "after reg poll";
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
cout << "output_scale :" << hex << (output_scale) << endl;
//*(args.output.scale_address) = output_scale;
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret; return ret;
...@@ -575,9 +538,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -575,9 +538,6 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
DLOG << "Pooling Wait Irq Timeout!"; DLOG << "Pooling Wait Irq Timeout!";
} }
DLOG << "after reg poll"; DLOG << "after reg poll";
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
// *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); // *(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
...@@ -615,11 +575,9 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -615,11 +575,9 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
<< " out_scale_address:" << args.output.scale_address; << " out_scale_address:" << args.output.scale_address;
#endif #endif
#ifdef PADDLE_MOBILE_ZU5 #ifdef PADDLE_MOBILE_ZU5
DLOG << "Conv";
// return 0;
int ret = 0; int ret = 0;
uint64_t output_scale = 0; uint64_t output_scale = 0;
uint64_t timer_cnt = 0; /*uint64_t timer_cnt = 0;
uint64_t image0_address_phy = 0; uint64_t image0_address_phy = 0;
uint64_t image1_address_phy = 0; uint64_t image1_address_phy = 0;
uint64_t output_address_phy = 0; uint64_t output_address_phy = 0;
...@@ -629,15 +587,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -629,15 +587,6 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
(uint64_t)args.image0.height * (uint64_t)args.image0.height *
(uint64_t)args.image0.channels; (uint64_t)args.image0.channels;
uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1; uint64_t coefficient = (uint64_t)args.const0 << 32 | (uint64_t)args.const1;
pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status) {
ret = -EIO;
DLOG << "Conv Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
image0_address_phy = vaddr_to_paddr(args.image0.address); image0_address_phy = vaddr_to_paddr(args.image0.address);
image1_address_phy = vaddr_to_paddr(args.image1.address); image1_address_phy = vaddr_to_paddr(args.image1.address);
output_address_phy = vaddr_to_paddr(args.output.address); output_address_phy = vaddr_to_paddr(args.output.address);
...@@ -647,36 +596,35 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) { ...@@ -647,36 +596,35 @@ int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
IMAGE_ALIGN); IMAGE_ALIGN);
uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) | uint64_t image_image_pixel = ((uint64_t)args.image0.channels << 32) |
((uint64_t)args.image0.width << 16) | ((uint64_t)args.image0.width << 16) |
(uint64_t)args.image0.height; (uint64_t)args.image0.height;*/
/*SDK刷Cache保证数据一致性*/ pthread_mutex_lock(&g_fpgainfo.pe_data->mutex);
if (ERROR == g_fpgainfo.pe_data->pes[PE_IDX_EW]->status) {
ret = -EIO;
DLOG << "EW Status Error!";
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
return ret;
}
/*restart scale*/
reg_writeq(output_scale, REG_SCALE_PARAMETER); reg_writeq(output_scale, REG_SCALE_PARAMETER);
reg_writeq(args.driver.image0_address_phy, REG_EW_IMAGE0_BASE_ADDR);
reg_writeq(image0_address_phy, REG_EW_IMAGE0_BASE_ADDR); reg_writeq(args.driver.image1_address_phy, REG_EW_IMAGE1_BASE_ADDR);
reg_writeq(image1_address_phy, REG_EW_IMAGE1_BASE_ADDR); reg_writeq(args.driver.datalen, REG_EW_DATA_LEN);
reg_writeq(datalen, REG_EW_DATA_LEN); reg_writeq(args.driver.image_image_pixel, REG_EW_IMAGE_PIXEL);
reg_writeq(image_image_pixel, REG_EW_IMAGE_PIXEL); reg_writeq(args.driver.image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW);
reg_writeq(image_amount_per_row, REG_EW_IMAGE_AMOUNT_PER_ROW); reg_writeq(args.driver.output_address_phy, REG_EW_RESULT_BASE_ADDR);
reg_writeq(args.driver.coefficient, REG_EW_COEFFICIENT);
reg_writeq(output_address_phy, REG_EW_RESULT_BASE_ADDR); reg_writeq(args.driver.cmd, REG_EW_CMD);
reg_writeq(coefficient, REG_EW_COEFFICIENT);
reg_writeq(cmd, REG_EW_CMD);
if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) { if (0 != fpga_regpoll(REG_INTERRUPT, INTERRUPT_POOLING, PE_IRQ_TIMEOUT)) {
g_fpgainfo.pe_data->pes[PE_IDX_POOLING]->status = ERROR; g_fpgainfo.pe_data->pes[PE_IDX_EW]->status = ERROR;
ret = -EIO; ret = -EIO;
DLOG << "EW Wait Irq Timeout!"; DLOG << "EW Wait Irq Timeout!";
} }
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
//*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER); //*(args.output.scale_address) = reg_readq(REG_SCALE_PARAMETER);
//*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER); //*(args.output.timer_cnt) = reg_readq(REG_TIMER_COUNTER);
pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex); pthread_mutex_unlock(&g_fpgainfo.pe_data->mutex);
...@@ -802,9 +750,7 @@ int PerformBypass(const struct BypassArgs &args) { ...@@ -802,9 +750,7 @@ int PerformBypass(const struct BypassArgs &args) {
DLOG << "BYPASS Wait Irq Timeout!"; DLOG << "BYPASS Wait Irq Timeout!";
} }
DLOG << "after reg poll"; DLOG << "after reg poll";
usleep(40);
/*SDK 无效 Cache保证数据一致性*/
output_scale = reg_readq(REG_SCALE_PARAMETER); output_scale = reg_readq(REG_SCALE_PARAMETER);
output_scale = (output_scale << 32) | (output_scale >> 32); output_scale = (output_scale << 32) | (output_scale >> 32);
fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2); fpga_copy(args.output.scale_address, &output_scale, sizeof(float) * 2);
...@@ -883,8 +829,9 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel, ...@@ -883,8 +829,9 @@ void deconv_post_process(half **data_in, int sub_conv_n, int num, int channel,
*data_in = ptr_deconv; *data_in = ptr_deconv;
fpga_free(ptr_tmp); fpga_free(ptr_tmp);
} }
int ComputeFpgaDeconv(const struct DeconvArgs &args) { int ComputeFpgaDeconv(const struct DeconvArgs &args) {
#ifdef FPGA_TEST_MODE #ifdef FPGA_PRINT_MODE
DLOG << "=============ComputeFPGADeConv==========="; DLOG << "=============ComputeFPGADeConv===========";
DLOG << " filter_num:" << args.filter_num DLOG << " filter_num:" << args.filter_num
<< " group_num:" << args.group_num << " group_num:" << args.group_num
......
...@@ -137,8 +137,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) { ...@@ -137,8 +137,6 @@ int fpga_regpoll(uint64_t reg, uint64_t val, int time) {
for (i = 0; i < timeout; i++) { for (i = 0; i < timeout; i++) {
if (val == reg_readq(reg)) { if (val == reg_readq(reg)) {
std::cout << "fpga_regpoll:" << i << "val:" << val << "reg:" << reg
<< std::endl;
break; break;
} }
} }
...@@ -401,8 +399,6 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) { ...@@ -401,8 +399,6 @@ void fpga_copy_driver(void *dest, const void *src, size_t num) {
DLOG << "dest:" << dest << " src:" << src << " size:" << num; DLOG << "dest:" << dest << " src:" << src << " size:" << num;
for (i = 0; i < num; i++) { for (i = 0; i < num; i++) {
// DLOG << "i:" << i << " val:" << *((int8_t *)src + i);
// usleep(1);
*((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT *((int8_t *)dest + i) = *((int8_t *)src + i); // NOLINT
} }
......
...@@ -103,22 +103,15 @@ struct FPGA_INFO { ...@@ -103,22 +103,15 @@ struct FPGA_INFO {
extern struct FPGA_INFO g_fpgainfo; extern struct FPGA_INFO g_fpgainfo;
inline uint64_t reg_readq(uint32_t offset) { inline uint64_t reg_readq(uint32_t offset) {
// DLOG << "offset : " << offset;
uint64_t value = uint64_t value =
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset); // NOLINT offset); // NOLINT
// DLOG << "read end";
usleep(10);
return value; return value;
} }
inline void reg_writeq(uint64_t value, uint32_t offset) { inline void reg_writeq(uint64_t value, uint32_t offset) {
// DLOG << "offset : " << offset << ", value : " << value;
*(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT *(volatile uint64_t *)((uint8_t *)g_fpgainfo.FpgaRegVirAddr + // NOLINT
offset) = value; offset) = value;
// DLOG << "write end";
usleep(10);
} }
int open_device_driver(); int open_device_driver();
......
...@@ -20,6 +20,13 @@ limitations under the License. */ ...@@ -20,6 +20,13 @@ limitations under the License. */
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
// Alignment constants that are only defined for the V1 FPGA build
// (PADDLE_MOBILE_FPGA_V1).
#ifdef PADDLE_MOBILE_FPGA_V1
// Image rows (width * channels elements) are rounded up to a multiple of 16.
#define IMAGE_ALIGNMENT 16 // Aligned to 16
#define FILTER_NUM_ALIGNMENT 32 // Filter number aligned to 32
#define FILTER_ELEMENT_ALIGNMENT 16 // Filter element number aligned to 16
// NOTE(review): presumably the alignment of the per-group bias/scale count
// (expand_conv_arg currently hard-codes 8 for that) -- confirm.
#define BS_NUM_ALIGNMENT 8
#endif
enum DataType { enum DataType {
DATA_TYPE_FP32 = 1, DATA_TYPE_FP32 = 1,
DATA_TYPE_FP16 = 0, DATA_TYPE_FP16 = 0,
...@@ -52,19 +59,70 @@ struct ImageOutputArgs { ...@@ -52,19 +59,70 @@ struct ImageOutputArgs {
float* scale_address; // output scale address; float* scale_address; // output scale address;
uint64_t timer_cnt; // time counter for FPGA computation uint64_t timer_cnt; // time counter for FPGA computation
}; };
#ifdef PADDLE_MOBILE_FPGA_V1
// Register-level parameters for one convolution launch on the V1 FPGA driver.
// It is embedded in ConvArgs as `driver`; the conv driver code writes members
// of it straight to hardware registers (`cmd` -> REG_CONV_CMD,
// `fpga_bias_scale_len / 4` -> register 0xd20).
// NOTE(review): presumably filled once by expand_conv_arg(), whose local
// variables carry the same names as the fields below -- confirm.
struct ConvDriverParam {
// NOTE(review): the *_address_phy fields presumably hold physical addresses
// obtained with vaddr_to_paddr() -- confirm.
uint64_t image_address_phy;
uint64_t filter_address_phy;
uint64_t sb_address_phy;
uint64_t output_address_phy;
// In expand_conv_arg the locals of these names are
// (in + 2 * pad - kernel) / stride + 1 along each axis.
uint64_t output_height;
uint64_t output_width;
// In expand_conv_arg: filter_num / group_num and image.channels / group_num.
uint64_t filter_per_group;
uint64_t channel_per_group;
// In expand_conv_arg: image.width * image.channels rounded up to
// IMAGE_ALIGNMENT.
uint64_t image_amount_per_row;
uint64_t image_one_pad_per_row;
uint64_t filter_amount_all;
uint64_t output_amount_per_row;
uint64_t image_block_amount_per_row;
uint64_t filter_pad_width_mul_channel;
uint64_t image_amount_per_row_multi_win_first;
uint64_t image_amount_per_row_multi_win;
uint64_t image_block_num;
uint64_t image_block_len;
uint64_t image_block_len_last;
uint64_t image_win_cnt;
uint64_t image_win_cnt_last;
uint64_t res_row_data_align4_pad;
uint64_t prog_full_cnt;
uint64_t post_prog_full_cnt;
// In expand_conv_arg: align_to_x(filter_num / group_num, 8) * group_num.
// The driver divides it by 4 before writing it to register 0xd20.
uint64_t fpga_bias_scale_len;
// Value written to REG_CONV_CMD to start the convolution.
uint64_t cmd;
};
// Register-level parameters for one element-wise add launch on the V1 FPGA
// driver. It is embedded in EWAddArgs as `driver`; ComputeFpgaEWAdd writes
// each member to the REG_EW_* register named in the comments below.
// NOTE(review): presumably filled by expand_EW_arg(), which the element-wise
// add kernels call from Init() -- confirm.
struct EWAddDriverParam {
uint64_t image0_address_phy; // -> REG_EW_IMAGE0_BASE_ADDR
uint64_t image1_address_phy; // -> REG_EW_IMAGE1_BASE_ADDR
uint64_t datalen; // -> REG_EW_DATA_LEN
// -> REG_EW_IMAGE_PIXEL. The previous in-function computation packed it as
// channels << 32 | width << 16 | height.
uint64_t image_image_pixel;
uint64_t image_amount_per_row; // -> REG_EW_IMAGE_AMOUNT_PER_ROW
uint64_t output_address_phy; // -> REG_EW_RESULT_BASE_ADDR
// -> REG_EW_COEFFICIENT. The previous in-function computation packed it as
// const0 << 32 | const1.
uint64_t coefficient;
uint64_t cmd; // -> REG_EW_CMD
};
#endif
struct ConvArgs { struct ConvArgs {
bool relu_enabled; bool relu_enabled;
void* sb_address; // scale and bias void* sb_address; // scale and bias
void* filter_address; void* filter_address;
float* filter_scale_address; float* filter_scale_address;
void* free_space; // used by FPGA logic
uint32_t filter_num; uint32_t filter_num;
uint32_t group_num; uint32_t group_num;
struct KernelArgs kernel; struct KernelArgs kernel;
struct ImageInputArgs image; // input image; struct ImageInputArgs image; // input image;
struct ImageOutputArgs output; struct ImageOutputArgs output;
#ifdef PADDLE_MOBILE_FPGA_V2
void* free_space; // used by FPGA logic
#endif
#ifdef PADDLE_MOBILE_FPGA_V1
struct ConvDriverParam driver;
#endif
}; };
struct ConcatArgs { struct ConcatArgs {
...@@ -115,6 +173,9 @@ struct EWAddArgs { ...@@ -115,6 +173,9 @@ struct EWAddArgs {
struct ImageInputArgs image0; struct ImageInputArgs image0;
struct ImageInputArgs image1; struct ImageInputArgs image1;
struct ImageOutputArgs output; struct ImageOutputArgs output;
#ifdef PADDLE_MOBILE_FPGA_V1
struct EWAddDriverParam driver;
#endif
}; };
struct BypassArgs { struct BypassArgs {
...@@ -150,5 +211,9 @@ void fpga_copy(void* dest, const void* src, size_t num); ...@@ -150,5 +211,9 @@ void fpga_copy(void* dest, const void* src, size_t num);
int fpga_flush(void* address, size_t size); int fpga_flush(void* address, size_t size);
int fpga_invalidate(void* address, size_t size); int fpga_invalidate(void* address, size_t size);
uint64_t vaddr_to_paddr(void* address);
void expand_conv_arg(ConvArgs* arg);
void expand_EW_arg(EWAddArgs* arg);
} // namespace fpga } // namespace fpga
} // namespace paddle_mobile } // namespace paddle_mobile
...@@ -27,7 +27,11 @@ bool ConcatKernel<CPU, float>::Init(ConcatParam<CPU> *param) { ...@@ -27,7 +27,11 @@ bool ConcatKernel<CPU, float>::Init(ConcatParam<CPU> *param) {
template <> template <>
void ConcatKernel<CPU, float>::Compute(const ConcatParam<CPU> &param) { void ConcatKernel<CPU, float>::Compute(const ConcatParam<CPU> &param) {
if (param.Inputs()[0]->type() == typeid(int8_t)) {
ConcatCompute<int8_t>(param);
} else {
ConcatCompute<float>(param); ConcatCompute<float>(param);
}
param.Out()->set_lod(param.Inputs()[0]->lod()); param.Out()->set_lod(param.Inputs()[0]->lod());
} }
......
...@@ -57,8 +57,8 @@ template <typename P> ...@@ -57,8 +57,8 @@ template <typename P>
void ConcatCompute(const ConcatParam<CPU> &param) { void ConcatCompute(const ConcatParam<CPU> &param) {
auto inputs = param.Inputs(); auto inputs = param.Inputs();
auto *out = param.Out(); auto *out = param.Out();
int64_t axis = param.Axis(); int axis = param.Axis();
out->mutable_data<float>(); out->mutable_data<P>();
/// Sometimes direct copies will be faster, this maybe need deeply analysis. /// Sometimes direct copies will be faster, this maybe need deeply analysis.
if (axis == 0 && inputs.size() < 10) { if (axis == 0 && inputs.size() < 10) {
...@@ -66,12 +66,12 @@ void ConcatCompute(const ConcatParam<CPU> &param) { ...@@ -66,12 +66,12 @@ void ConcatCompute(const ConcatParam<CPU> &param) {
for (auto *in : inputs) { for (auto *in : inputs) {
auto in_stride = framework::stride_numel(in->dims()); auto in_stride = framework::stride_numel(in->dims());
auto out_stride = framework::stride_numel(out->dims()); auto out_stride = framework::stride_numel(out->dims());
auto dst = out->data<float>() + output_offset; auto dst = out->data<P>() + output_offset;
auto src = in->data<float>(); auto src = in->data<P>();
PADDLE_MOBILE_ENFORCE( PADDLE_MOBILE_ENFORCE(
in_stride.size() == out_stride.size(), in_stride.size() == out_stride.size(),
"src and dst tensor should have the same dims size."); "src and dst tensor should have the same dims size.");
memory::Copy(dst, src, sizeof(float) * in_stride[0]); memory::Copy(dst, src, sizeof(P) * in_stride[0]);
output_offset += in_stride[0]; output_offset += in_stride[0];
} }
} else { } else {
...@@ -79,8 +79,8 @@ void ConcatCompute(const ConcatParam<CPU> &param) { ...@@ -79,8 +79,8 @@ void ConcatCompute(const ConcatParam<CPU> &param) {
for (int j = 0; j < inputs.size(); ++j) { for (int j = 0; j < inputs.size(); ++j) {
inputs_concat[j] = *inputs[j]; inputs_concat[j] = *inputs[j];
} }
ConcatFunctor<float> concat_functor; ConcatFunctor<P> concat_functor;
concat_functor(inputs_concat, static_cast<int>(axis), out); concat_functor(inputs_concat, axis, out);
} }
} }
......
...@@ -49,6 +49,7 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) { ...@@ -49,6 +49,7 @@ bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
ewaddArgs.image1.pad_width = 0; ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale; ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr; ewaddArgs.output.address = out_ptr;
fpga::expand_EW_arg(&ewaddArgs);
param->SetFpgaArgs(ewaddArgs); param->SetFpgaArgs(ewaddArgs);
return true; return true;
} }
......
...@@ -50,6 +50,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init( ...@@ -50,6 +50,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
ewaddArgs.image1.pad_width = 0; ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale; ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr; ewaddArgs.output.address = out_ptr;
fpga::expand_EW_arg(&ewaddArgs);
param->SetFpgaArgs(ewaddArgs); param->SetFpgaArgs(ewaddArgs);
return true; return true;
} }
......
...@@ -24,8 +24,12 @@ template <> ...@@ -24,8 +24,12 @@ template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) { bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX()); auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
auto out = param->Out();
fpga::format_fp32_ofm(out);
auto float_input = new Tensor; auto float_input = new Tensor;
float_input->mutable_data<float>({1, input->dims()[1]}); float_input->mutable_data<float>(
{1, input->dims()[2], input->dims()[3], input->dims()[1]});
fpga::format_fp32_ofm(float_input); fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
...@@ -34,8 +38,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) { ...@@ -34,8 +38,8 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
args.input_data_type = fpga::DATA_TYPE_FP16; args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32; args.output_data_type = fpga::DATA_TYPE_FP32;
args.image.address = input_ptr; args.image.address = input_ptr;
args.image.height = 1; args.image.height = (uint32_t)input->dims()[2];
args.image.width = 1; args.image.width = (uint32_t)input->dims()[3];
args.image.channels = (uint32_t)input->dims()[1]; args.image.channels = (uint32_t)input->dims()[1];
args.output.address = float_input->data<float>(); args.output.address = float_input->data<float>();
args.output.scale_address = float_input->scale; args.output.scale_address = float_input->scale;
...@@ -50,9 +54,9 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) { ...@@ -50,9 +54,9 @@ void SoftmaxKernel<FPGA, float>::Compute(const SoftmaxParam<FPGA> &param) {
Tensor *out = param.Out(); Tensor *out = param.Out();
fpga::PerformBypass(param.FpgaArgs()); fpga::PerformBypass(param.FpgaArgs());
fpga::fpga_invalidate( fpga::fpga_invalidate((void *)in_x->data<float>(), // NOLINT
(void *)in_x->data<float>(), // NOLINT in_x->numel() * sizeof(float));
fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float)); // TODO: In general case, 0 should be squeezed before softmax input
math::SoftmaxFuntor<CPU, float>()(in_x, out); math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size()); fpga::fpga_flush(out->data<float>(), out->memory_size());
} }
......
...@@ -12,76 +12,125 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ...@@ -12,76 +12,125 @@ WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
See the License for the specific language governing permissions and See the License for the specific language governing permissions and
limitations under the License. */ limitations under the License. */
#include <cstring>
#include <iostream>
#include <vector>
#include "../test_helper.h"
#include "../test_include.h" #include "../test_include.h"
#include "operators/concat_op.h" #include "operators/concat_op.h"
namespace paddle_mobile {
using framework::AttributeMap;
using framework::DDim;
using framework::LoDTensor;
using framework::Scope;
using framework::make_ddim;
/**
 * Reference concatenation used to cross-check the operator under test.
 *
 * Each input is treated as a row-major matrix with `rows` rows, where `rows`
 * is the product of the dimensions of input[0] that come before `axis`. The
 * matrices are written side by side, in order, into `output`.
 *
 * @param input   tensors to concatenate; must not be empty (input[0] is read).
 * @param output  destination; its data<T>() buffer is written directly, so
 *                the caller has to allocate it beforehand.
 * @param axis    concatenation axis.
 */
template <typename T>
void concat(const std::vector<LoDTensor> &input, LoDTensor &output, int axis) {
  const int tensor_count = input.size();

  // Collapse every dimension in front of `axis` into a single row count.
  int rows = 1;
  auto leading_dims = input[0].dims();
  for (int d = 0; d < axis; ++d) {
    rows *= leading_dims[d];
  }

  // Per-input row width, plus the width of one full output row.
  std::vector<int> widths(input.size());
  int total_width = 0;
  for (int t = 0; t < tensor_count; ++t) {
    const int width = input[t].numel() / rows;
    widths[t] = width;
    total_width += width;
  }

  // Copy each input row by row into its column window of the output.
  auto dst_base = output.data<T>();
  int column_offset = 0;
  for (int t = 0; t < tensor_count; ++t) {
    const int width = widths[t];
    auto src_base = input[t].data<T>();
    auto dst_window = dst_base + column_offset;
    for (int r = 0; r < rows; ++r) {
      memcpy(dst_window + r * total_width, src_base + r * width,
             sizeof(T) * width);
    }
    column_offset += width;
  }
}
template <typename T>
int TestConcatOP() {
DDim inputA_shape = make_ddim({10, 4, 2, 2});
DDim inputB_shape = make_ddim({20, 4, 2, 2});
DDim inputC_shape = make_ddim({30, 4, 2, 2});
DDim inputD_shape = make_ddim({40, 4, 2, 2});
DDim output_shape = make_ddim({100, 4, 2, 2});
int axis_v = 0;
VariableNameMap inputs;
VariableNameMap outputs;
std::vector<LoDTensor> input_tensors;
auto scope = std::make_shared<Scope>();
inputs["X"] =
std::vector<std::string>({"inputA", "inputB", "inputC", "inputD"});
outputs["Out"] = std::vector<std::string>({"output"});
auto inputA_var = scope.get()->Var("inputA");
auto inputA = inputA_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputA, inputA_shape, -127, 127);
input_tensors.push_back(std::move(*inputA));
auto inputB_var = scope.get()->Var("inputB");
auto inputB = inputB_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputB, inputB_shape, -127, 127);
input_tensors.push_back(std::move(*inputB));
auto inputC_var = scope.get()->Var("inputC");
auto inputC = inputC_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputC, inputC_shape, -127, 127);
input_tensors.push_back(std::move(*inputC));
auto inputD_var = scope.get()->Var("inputD");
auto inputD = inputD_var->template GetMutable<framework::LoDTensor>();
SetupTensor<T>(inputD, inputD_shape, -127, 127);
input_tensors.push_back(std::move(*inputD));
auto output_var = scope.get()->Var("output");
AttributeMap attrs;
attrs["axis"].Set<int>(axis_v);
auto *op = new operators::ConcatOp<CPU, float>("concat", inputs, outputs,
attrs, scope);
op->InferShape();
op->Run();
auto output = output_var->template Get<framework::LoDTensor>();
const T *output_data = output->data<T>();
LoDTensor output_cmp;
output_cmp.mutable_data<T>(output_shape);
concat<T>(input_tensors, output_cmp, axis_v);
const T *output_cmp_data = output_cmp.data<T>();
// compare
int eq = 0;
int neq = 0;
for (int i = 0; i < output->numel(); ++i) {
PADDLE_MOBILE_ENFORCE(output_data[i] == output_cmp_data[i],
"The execution of test_concat_op is failed!");
if (output_data[i] == output_cmp_data[i]) {
++eq;
} else {
++neq;
}
}
std::cout << "eq = " << eq << ", neq = " << neq << std::endl;
delete op;
return 0;
}
} // namespace paddle_mobile
int main() { int main() {
paddle_mobile::framework::Loader<paddle_mobile::CPU> loader; paddle_mobile::PaddleMobile<paddle_mobile::CPU> paddle_mobile;
auto program = loader.Load(g_googlenet); paddle_mobile.SetThreadNum(4);
PADDLE_MOBILE_ENFORCE(program.originProgram != nullptr, paddle_mobile::TestConcatOP<float>();
"program file read fail"); paddle_mobile::TestConcatOP<int8_t>();
Executor4Test<paddle_mobile::CPU,
paddle_mobile::operators::ConcatOp<paddle_mobile::CPU, float>>
executor(program, "concat");
// 1. input_tensors;
vector<Tensor> input_tensors;
Tensor input1;
auto input1_data = CreateInput<float>(&input1, {4, 10, 2, 2}, 0, 1);
input_tensors.push_back(input1);
Tensor input2;
auto input2_data = CreateInput<float>(&input2, {4, 20, 2, 2}, 0, 1);
input_tensors.push_back(input2);
Tensor input3;
auto input3_data = CreateInput<float>(&input3, {4, 30, 2, 2}, 0, 1);
input_tensors.push_back(input3);
Tensor input4;
auto input4_data = CreateInput<float>(&input4, {4, 40, 2, 2}, 0, 1);
input_tensors.push_back(input4);
// 2. input_names
vector<string> input_names({
"conv2d_3.tmp_1",
"conv2d_5.tmp_1",
"conv2d_7.tmp_1",
"conv2d_8.tmp_1",
});
// 3. output_names
vector<string> output_names({"concat_0.tmp_0"});
// 4. out_dims;
vector<DDim> out_ddims;
auto out_ddim = paddle_mobile::framework::make_ddim({3, 100, 2, 2});
out_ddims.push_back(out_ddim);
auto output = executor.Predict<LoDTensor>(input_tensors, input_names,
output_names, out_ddims);
auto output0_data = output[0]->data<float>();
// 5. test one example.
int input_n = 1;
int input_c = 2;
int input_h = 0;
int input_w = 1;
int stride0 = input3.numel() / input3.dims()[0];
int stride1 = input3.numel() / input3.dims()[0] / input3.dims()[1];
int stride2 = input3.dims()[3];
/// inputx1 (4,10,2,2),
/// inputx2 (4,20,2,2),
/// inputx3 (4,30,2,2),
/// inputx4 (4,40,2,2),
/// axis = 1
/// output (4,100,2,2)
int input_index =
input_n * stride0 + input_c * stride1 + input_h * stride2 + input_w;
int output_index = input_n * 100 * 2 * 2 +
(input_c + input1.dims()[1] + input2.dims()[1]) * 2 * 2 +
input_h * 2 + input_w;
DLOG << " input3 [1, 2,0,1] = " << input3_data[input_index];
DLOG << " output [1,32,0,1] = " << output0_data[output_index];
return 0; return 0;
} }
...@@ -18,6 +18,9 @@ limitations under the License. */ ...@@ -18,6 +18,9 @@ limitations under the License. */
#include "../test_include.h" #include "../test_include.h"
#include "framework/operator.h" #include "framework/operator.h"
#include "operators/fusion_fc_op.h" #include "operators/fusion_fc_op.h"
#ifdef FUSION_FC_INT8_OP
#include "operators/fusion_fc_int8_op.h"
#endif
#define a(i, j) a[(i)*lda + (j)] #define a(i, j) a[(i)*lda + (j)]
#define b(i, j) b[(i)*ldb + (j)] #define b(i, j) b[(i)*ldb + (j)]
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册