Commit 99cccf3b authored by qnqinan, committed by jameswu2014

Update kernels and related files for static quantization in the FPGA v2 track; fixes #1584 (#1585)

* update concat and split kernels and related files in the FPGA v2 (v3) track

* update

* update

* update kernels and related files in the FPGA v2 track

* update

* update
Parent 515c42ec
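This change replaces the fp16 data path with statically quantized int8 tensors, so every conv/FC kernel in the diff folds the input scale Si, the output scale So and the filter scale Sf = filter_find_max(filter) / 127 into a per-channel bias/scale array. A minimal standalone C++ sketch of that folding, with made-up scale values (not the project's API), is:

#include <cstdio>
#include <vector>

// Hedged illustration of the scale folding used throughout this patch:
// bs_ptr[i + channel] = Si / So * Sf / 127 and bs_ptr[i] = bias[i] * 127 / So,
// with Sf taken as the maximum absolute filter value divided by 127.
int main() {
  const float Si = 0.05f;             // input tensor scale (hypothetical)
  const float So = 0.08f;             // output tensor scale (hypothetical)
  const float filter_max = 1.27f;     // what filter_find_max would return (hypothetical)
  const float Sf = filter_max / 127;  // matches the Sf = filter_find_max(filter) / 127 lines below

  std::vector<float> bias = {0.1f, -0.2f};
  const int channel = static_cast<int>(bias.size());
  std::vector<float> bs(2 * channel);
  for (int i = 0; i < channel; i++) {
    bs[i + channel] = Si / So * Sf / 127.0f;  // per-channel scale handed to the hardware
    bs[i] = bias[i] * 127.0f / So;            // bias expressed in output quantization steps
  }
  for (int i = 0; i < channel; i++) {
    std::printf("channel %d: scale %g, bias %g\n", i, bs[i + channel], bs[i]);
  }
  return 0;
}

The same two lines reappear, with batch-norm statistics folded in first, in the ConvAddBN/ConvBN kernels further down.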
......@@ -22,79 +22,85 @@ limitations under the License. */
namespace paddle_mobile {
namespace fpga {
#define USE_RELU 1
#define USE_BIAS 2
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3];
kTypeId_t input_type = image_tensor->type();
if (input_type == type_id<float>()) {
auto data_ptr = image_tensor->data<float>();
auto external_ptr = reinterpret_cast<float *>(image_tensor->external_data);
float *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
image::format_image<float>(&p_data, channel, height, width);
if (p_data != data_ptr && external_ptr == nullptr) {
image_tensor->reset_data_ptr(p_data);
}
} else {
auto data_ptr = image_tensor->data<int8_t>();
auto external_ptr = reinterpret_cast<int8_t *>(image_tensor->external_data);
int8_t *p_data = external_ptr == nullptr ? data_ptr : external_ptr;
image::format_image<int8_t>(&p_data, channel, height, width);
if (p_data != data_ptr && external_ptr == nullptr) {
image_tensor->reset_data_ptr(p_data);
}
}
}
void format_ofm(framework::Tensor *ofm_tensor) {
if (ofm_tensor->type() == type_id<float>()) {
format_fp32_ofm(ofm_tensor);
} else if (ofm_tensor->type() == type_id<half>()) {
format_fp16_ofm(ofm_tensor);
} else {
format_int8_ofm(ofm_tensor);
}
}
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
void format_int8_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
sizeof(half);
sizeof(int8_t);
} else if (dims.size() == 2) {
memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t);
} else {
DLOG << "Wrong ofm dimension";
}
auto p = fpga_malloc(memory_size);
// memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(type_id<half>().hash_code());
ofm_tensor->fpga_data_num = memory_size / sizeof(half);
ofm_tensor->set_type(type_id<int8_t>().hash_code());
ofm_tensor->fpga_data_num = memory_size / sizeof(int8_t);
fpga::fpga_flush(p, memory_size);
}
void format_fp16_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
// auto dims = ofm_tensor->dims();
void format_int8_ofm(framework::Tensor *ofm_tensor, framework::DDim dims) {
size_t memory_size = 0;
if (dims.size() == 4) {
auto channel = dims[1], height = dims[2], width = dims[3];
memory_size =
height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(int8_t);
} else if (dims.size() == 2) {
memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(int8_t);
} else {
DLOG << "Wrong ofm dimension";
}
auto p = fpga_malloc(memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(type_id<int8_t>().hash_code());
ofm_tensor->fpga_data_num = memory_size / sizeof(int8_t);
fpga::fpga_flush(p, memory_size);
}
void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
auto channel = dims[1], height = dims[2], width = dims[3], num = dims[0];
memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) *
sizeof(half);
} else if (dims.size() == 2) {
memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(half);
} else {
DLOG << "Wrong ofm dimension";
}
auto p = fpga_malloc(memory_size);
// memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(type_id<half>().hash_code());
ofm_tensor->fpga_data_num = memory_size / sizeof(half);
fpga::fpga_flush(p, memory_size);
}
void format_fp32_ofm(framework::Tensor *ofm_tensor) {
......@@ -110,7 +116,6 @@ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
DLOG << "Wrong ofm dimension";
}
auto p = fpga_malloc(memory_size);
// memset(p, 0, memory_size);
ofm_tensor->reset_data_ptr(p);
ofm_tensor->set_type(type_id<float>().hash_code());
ofm_tensor->fpga_data_num = memory_size / sizeof(float);
......@@ -269,11 +274,11 @@ void format_concat_output(framework::Tensor *out, int height, int width,
}
sum_cw = align_to_x(width * sum_channel, IMAGE_ALIGNMENT);
auto data_ptr = fpga_malloc(height * sum_cw * sizeof(half));
auto data_ptr = fpga_malloc(height * sum_cw * sizeof(int8_t));
auto ddim = framework::make_ddim({1, sum_channel, height, width});
out->Resize(ddim);
out->reset_data_ptr(data_ptr);
out->set_type(type_id<half>().hash_code());
out->set_type(type_id<int8_t>().hash_code());
}
void format_conv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float **bs_ptr,
......@@ -283,7 +288,7 @@ void format_conv_data(framework::Tensor *filter_tensor,
int element_num_per_div = fpga::get_filter_num_per_div(filter_tensor, group);
fpga::format_bias_scale_array(bs_ptr, element_num_per_div,
ofm_tensor->dims()[1]);
fpga::format_fp16_ofm(ofm_tensor);
fpga::format_ofm(ofm_tensor);
}
void format_deconv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float **bs_ptr,
......@@ -294,7 +299,7 @@ void format_deconv_data(framework::Tensor *filter_tensor,
int element_num_per_div =
get_deconv_filter_num_per_div(filter_tensor, group, sub_conv_n);
format_bias_scale_array(bs_ptr, element_num_per_div, channel * sub_conv_n);
format_fp16_ofm(ofm_tensor);
format_ofm(ofm_tensor);
}
void format_dwconv_data(framework::Tensor *filter_tensor,
......@@ -303,7 +308,7 @@ void format_dwconv_data(framework::Tensor *filter_tensor,
auto channel = ofm_tensor->dims()[1];
format_dwconv_filter(filter_tensor, scale_ptr);
format_bias_array(bias_ptr, channel);
format_fp16_ofm(ofm_tensor);
format_ofm(ofm_tensor);
}
void format_DWDeconv_data(framework::Tensor *filter_tensor,
framework::Tensor *ofm_tensor, float **bs_ptr,
......@@ -314,7 +319,7 @@ void format_DWDeconv_data(framework::Tensor *filter_tensor,
filter_tensor,
(reinterpret_cast<float *>(*bs_ptr) + sub_conv_n * channel), sub_conv_n);
format_bias_array(bs_ptr, channel);
format_fp16_ofm(ofm_tensor);
format_ofm(ofm_tensor);
}
void expand_conv_arg(ConvArgs *arg) {
ConvArgs args = *arg;
......@@ -486,9 +491,9 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<half>();
auto input_ptr = input->data<int8_t>();
auto filter_ptr = filter->data<int8_t>();
auto out_ptr = out->data<half>();
auto out_ptr = out->data<int8_t>();
auto deleter = [](void *p) { fpga_free(p); };
arg->group_num = (uint32_t)group_num;
......@@ -512,7 +517,7 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
int n = arg->split_num;
arg->concat_arg.images_in =
static_cast<int16_t **>(fpga_malloc(n * sizeof(int *)));
static_cast<int8_t **>(fpga_malloc(n * sizeof(int *)));
arg->concat_arg.scales_in =
static_cast<float **>(fpga_malloc(n * sizeof(float *)));
arg->concat_arg.channel_num =
......@@ -531,7 +536,6 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
filter->dims()[3]));
for (int i = 0; i < n; i++) {
// arg->conv_arg[i].relu_enabled = relu_enabled;
arg->conv_arg[i].output.activation.activation_type = activation_enable;
arg->conv_arg[i].output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
......@@ -563,18 +567,6 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
reinterpret_cast<char *>(arg->conv_arg[i].filter_address), deleter));
memcpy(arg->conv_arg[i].filter_address, filter_head, filter_size);
fpga_flush(arg->conv_arg[i].filter_address, filter_size);
// for test
// {
// static int cnt = 0;
// if(cnt == 4){
// int8_t result = 0;
// std::string str = "fc_filter";
// fpga::savefile<int8_t>(str, arg->conv_arg[i].filter_address,
// filter_size, result);
//
// }
// cnt++;
//}
size_t bs_size = 2 *
align_to_x(arg->conv_arg[i].filter_num, BS_NUM_ALIGNMENT) *
......@@ -585,18 +577,6 @@ void fill_split_arg(struct SplitConvArgs *arg, framework::Tensor *input,
reinterpret_cast<char *>(arg->conv_arg[i].sb_address), deleter));
memcpy(arg->conv_arg[i].sb_address, bs_head, bs_size);
fpga_flush(arg->conv_arg[i].sb_address, bs_size);
// for test
/*{
static int cnt = 0;
if(cnt == 4){
float result = 0;
std::string str = "fc_bs";
fpga::savefile<float>(str, arg->conv_arg[i].sb_address, bs_size/4,
result);
}
cnt++;
}*/
if (n > 1) {
arg->conv_arg[i].output.scale_address =
......@@ -606,7 +586,7 @@ result);
align_to_x((int)(out->dims()[3] * // NOLINT
arg->conv_arg[i].filter_num),
IMAGE_ALIGNMENT) *
sizeof(half));
sizeof(int8_t));
arg->vector_conv_space.push_back(std::shared_ptr<char>(
reinterpret_cast<char *>(arg->conv_arg[i].output.scale_address),
deleter));
......@@ -618,7 +598,7 @@ result);
}
arg->concat_arg.images_in[i] =
(half *)arg->conv_arg[i].output.address; // NOLINT
(int8_t *)arg->conv_arg[i].output.address; // NOLINT
arg->concat_arg.scales_in[i] = arg->conv_arg[i].output.scale_address;
arg->concat_arg.channel_num[i] = arg->conv_arg[i].filter_num;
......@@ -634,7 +614,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
int16_t leaky_relu_negative_slope, int group_num,
int stride_h, int stride_w, int padding_h, int padding_w,
float *bs_ptr) {
auto input_ptr = input->data<half>();
auto input_ptr = input->data<int8_t>();
auto filter_ptr = filter->data<int8_t>();
auto deleter = [](void *p) { fpga_free(p); };
......@@ -665,11 +645,11 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
framework::DDim dims_out_new = framework::make_ddim(
{1, arg->filter_num, sub_output_height * sub_conv_num, real_out_width});
fpga::format_fp16_ofm(out, dims_out_new);
auto out_ptr = out->data<half>();
fpga::format_int8_ofm(out, dims_out_new);
auto out_ptr = out->data<int8_t>();
arg->output.address =
(half *)out_ptr + // NOLINT
omit_size * sizeof(half) *
(int8_t *)out_ptr + // NOLINT
omit_size * sizeof(int8_t) *
(align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
arg->output.scale_address = out->scale;
......@@ -692,7 +672,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->split_conv_args[i]->conv_arg =
static_cast<ConvArgs *>(fpga_malloc(split_num * sizeof(ConvArgs)));
arg->split_conv_args[i]->concat_arg.images_in =
static_cast<int16_t **>(fpga_malloc(split_num * sizeof(int16_t *)));
static_cast<int8_t **>(fpga_malloc(split_num * sizeof(int8_t *)));
arg->split_conv_args[i]->concat_arg.scales_in =
static_cast<float **>(fpga_malloc(split_num * sizeof(float *)));
arg->split_conv_args[i]->concat_arg.channel_num =
......@@ -744,7 +724,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
} else {
out_addr_offset =
sizeof(int16_t) * (sub_conv_num - 1 - i) *
sizeof(int8_t) * (sub_conv_num - 1 - i) *
(align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
arg->split_conv_args[i]->output.address = out_ptr;
......@@ -841,7 +821,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->split_conv_args[i]->output.scale_address;
} else {
arg->split_conv_args[i]->conv_arg[j].output.address =
fpga_malloc(conv_output_size * sizeof(int16_t));
fpga_malloc(conv_output_size * sizeof(int8_t));
arg->split_conv_args[i]->conv_arg[j].output.scale_address =
static_cast<float *>(fpga_malloc(2 * sizeof(float)));
arg->split_conv_args[i]->vector_conv_space.push_back(
......@@ -855,7 +835,7 @@ void fill_deconv_arg(struct DeconvArgs *arg, framework::Tensor *input,
arg->split_conv_args[i]->conv_arg[j].output.scale_address),
deleter));
}
arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<half *>(
arg->split_conv_args[i]->concat_arg.images_in[j] = static_cast<int8_t *>(
arg->split_conv_args[i]->conv_arg[j].output.address);
arg->split_conv_args[i]->concat_arg.scales_in[j] =
arg->split_conv_args[i]->conv_arg[j].output.scale_address;
......@@ -885,10 +865,9 @@ void fill_dwconv_arg(struct DWconvArgs *arg, framework::Tensor *input,
std::shared_ptr<char>(reinterpret_cast<char *>(bias_ptr), deleter));
auto filter_ptr = filter->data<int16_t>();
auto input_ptr = input->data<half>();
auto output_ptr = out->mutable_data<half>();
auto input_ptr = input->data<int8_t>();
auto output_ptr = out->mutable_data<int8_t>();
arg->sub_conv_num = 1;
// arg->relu_enabled = relu_enabled;
arg->output.activation.activation_type = activation_enable;
arg->output.activation.leaky_relu_negative_slope = leaky_relu_negative_slope;
arg->bias_address = bias_ptr;
......@@ -915,7 +894,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
int stride_w, int padding_h, int padding_w,
float *bias_ptr) {
auto filter_ptr = filter->data<int8_t>();
auto input_ptr = input->data<half>();
auto input_ptr = input->data<int8_t>();
auto deleter = [](void *p) { fpga_free(p); };
......@@ -949,15 +928,9 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
framework::DDim dims_out_new = framework::make_ddim(
{1, arg->filter_num, real_out_height, real_out_width});
fpga::format_fp16_ofm(out, dims_out_new);
auto out_ptr = out->data<half>();
fpga::format_int8_ofm(out, dims_out_new);
auto out_ptr = out->data<int8_t>();
/*====For Addition
arg->output.address =
(half *)out_ptr + // NOLINT
omit_size * sizeof(half) *
(align_to_x(real_out_width * arg->filter_num, IMAGE_ALIGNMENT));
*/
arg->output.address = out_ptr;
arg->output.scale_address = out->scale;
......@@ -1002,7 +975,7 @@ void fill_DWDeconv_arg(struct DWDeconvArgs *arg, framework::Tensor *input,
fpga_malloc(sub_output_height *
align_to_x(sub_output_width * sub_channels * sub_conv_num,
IMAGE_ALIGNMENT) *
sizeof(int16_t));
sizeof(int8_t));
arg->dw_conv_args[i]->output.scale_address =
static_cast<float *>(fpga_malloc(2 * sizeof(float)));
arg->vector_dw_conv_space.push_back(std::shared_ptr<char>(
......
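The format_int8_ofm helpers in the hunk above size their buffers by padding each channel * width row of int8 elements up to IMAGE_ALIGNMENT. A short hedged sketch of that arithmetic (IMAGE_ALIGNMENT = 16 is an assumption here, and align_to_x is written in the conventional round-up form rather than the project's exact cast sequence):

#include <cstdint>
#include <cstdio>

// Round num up to the next multiple of x (conventional form, for illustration).
static inline uint32_t align_to_x(int64_t num, int64_t x) {
  return static_cast<uint32_t>((num + x - 1) / x * x);
}

int main() {
  const int IMAGE_ALIGNMENT = 16;  // assumed alignment for the FPGA image layout
  const int num = 1, channel = 3, height = 100, width = 100;
  // Mirrors memory_size = num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(int8_t)
  const size_t memory_size =
      num * height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(int8_t);
  std::printf("aligned row: %u elements, buffer: %zu bytes\n",
              align_to_x(channel * width, IMAGE_ALIGNMENT), memory_size);
  return 0;
}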
......@@ -24,6 +24,8 @@ namespace fpga {
void format_image(framework::Tensor* image_tensor);
void format_ofm(framework::Tensor* ofm_tensor);
void format_int8_ofm(framework::Tensor* ofm_tensor);
void format_int8_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
void format_fp16_ofm(framework::Tensor* ofm_tensor, framework::DDim dims);
void format_fp32_ofm(framework::Tensor* ofm_tensor);
......
......@@ -55,7 +55,7 @@ void convert_to_chw(float **data_in, int channel, int height, int width,
*data_in = data_tmp;
}
void concat_images(int16_t **images_in, float **scales_in, void *image_out,
void concat_images(int8_t **images_in, float **scales_in, void *image_out,
float *scale_out, int image_num, uint32_t *channel_num,
int height, int width) {
int i = 0;
......@@ -66,17 +66,29 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
int align_each_in_area_cw = 0;
int align_each_out_area_cw_differ = 0;
int tmp_channel = 0;
scale_out[0] = 0.0;
scale_out[1] = 0.0;
float Ck = 0.0f;
float So = scale_out[0];
auto images_in_tmp =
(int8_t **)fpga::fpga_malloc(image_num * sizeof(int8_t *)); // NOLINT
for (i = 0; i < image_num; i++) {
images_in_tmp[i] = reinterpret_cast<int8_t *>(fpga::fpga_malloc(
height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
sizeof(int8_t)));
}
for (i = 0; i < image_num; i++) {
each_out_line_channel += channel_num[i];
scale_out[0] = std::max(*scale_out, scales_in[i][0]);
float Si_k = scales_in[i][0];
Ck = Si_k / So;
fpga_invalidate(images_in[i],
height *
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT) *
sizeof(int16_t));
sizeof(int8_t));
for (j = 0;
j < height * align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
j++) {
images_in_tmp[i][j] = (int8_t)(images_in[i][j] * Ck + 0.5);
}
}
scale_out[1] = 1 / scale_out[0];
align_each_out_area_cw =
align_to_x(each_out_line_channel * width, IMAGE_ALIGNMENT);
align_each_out_area_cw_differ =
......@@ -87,31 +99,27 @@ void concat_images(int16_t **images_in, float **scales_in, void *image_out,
for (i = 0; i < image_num; i++) {
align_each_in_area_cw =
align_to_x(channel_num[i] * width, IMAGE_ALIGNMENT);
memcpy((int16_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int16_t));
memcpy(
(int16_t *)image_out + tmp_channel + // NOLINT
k * align_each_out_area_cw_differ,
images_in_tmp[i] + j * channel_num[i] + k * align_each_in_area_cw,
channel_num[i] * sizeof(int8_t));
tmp_channel += channel_num[i];
}
}
}
fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int16_t));
fpga_flush(image_out, height * align_each_out_area_cw * sizeof(int8_t));
}
void split_image(int16_t *image_in, const float *scale_in, void **images_out,
float **scales_out, int image_num,
void split_image(int8_t *image_in, void **images_out, int image_num,
const uint32_t *channel_nums, int height, int width) {
int total_channel = 0;
for (int i = 0; i < image_num; i++) {
scales_out[i][0] = scale_in[0];
scales_out[i][1] = scale_in[1];
total_channel += channel_nums[i];
}
int element_num = height * align_to_x(width * total_channel, IMAGE_ALIGNMENT);
fpga_invalidate(image_in, element_num * sizeof(int16_t));
fpga_invalidate(image_in, element_num * sizeof(int8_t));
int src_offset = 0, des_offset = 0;
for (int h = 0; h < height; h++) {
for (int w = 0; w < width; w++) {
......@@ -120,8 +128,8 @@ void split_image(int16_t *image_in, const float *scale_in, void **images_out,
for (int i = 0; i < image_num; i++) {
des_offset = h * align_to_x(channel_nums[i] * width, IMAGE_ALIGNMENT) +
w * channel_nums[i];
memcpy(reinterpret_cast<int16_t *>(images_out[i]) + des_offset,
image_in + src_offset, channel_nums[i] * sizeof(int16_t));
memcpy(reinterpret_cast<int8_t *>(images_out[i]) + des_offset,
image_in + src_offset, channel_nums[i] * sizeof(int8_t));
src_offset += channel_nums[i];
}
}
......@@ -129,7 +137,7 @@ void split_image(int16_t *image_in, const float *scale_in, void **images_out,
for (int i = 0; i < image_num; i++) {
element_num = height * align_to_x(width * channel_nums[i], IMAGE_ALIGNMENT);
fpga_flush(images_out[i], element_num * sizeof(int16_t));
fpga_flush(images_out[i], element_num * sizeof(int8_t));
}
}
......
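With static quantization the output scale of a concat is fixed up front, so the new concat_images above rescales every int8 input by Ck = Si_k / So before copying it into the shared output buffer. A small hedged sketch of that per-element rescale (illustrative values, not the project's API):

#include <cstdint>
#include <cstdio>
#include <vector>

// Hedged sketch of the rescaling concat_images performs for int8 inputs:
// input image k with scale Si_k is mapped into the shared output scale So
// by multiplying with Ck = Si_k / So and converting back to int8.
int main() {
  const float So = 0.5f;                    // fixed output scale (hypothetical)
  std::vector<int8_t> in = {10, -20, 127};  // one input feature map (hypothetical)
  const float Si_k = 0.25f;                 // that input's scale (hypothetical)
  const float Ck = Si_k / So;

  std::vector<int8_t> out(in.size());
  for (size_t j = 0; j < in.size(); j++) {
    // The patch rounds with "+ 0.5"; a symmetric rounding would use std::lround.
    out[j] = static_cast<int8_t>(in[j] * Ck + 0.5f);
  }
  for (size_t j = 0; j < in.size(); j++) {
    std::printf("%d -> %d\n", in[j], out[j]);
  }
  return 0;
}

This is presumably also why split_image above drops its scale parameters: each output's scale is already fixed by the static quantization tables.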
......@@ -63,13 +63,12 @@ void format_image(T** data_in, int channel, int height, int width) {
align_to_x(channel * width, IMAGE_ALIGNMENT) * height * sizeof(T));
}
// Concat featuremaps along channel direction
void concat_images(int16_t** images_in, float** scales_in, void* image_out,
void concat_images(int8_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num,
int height, int width);
// Split featuremap along channel direction
void split_image(int16_t* image_in, const float* scale_in, void** images_out,
float** scales_out, int image_num,
void split_image(int8_t* image_in, void** images_out, int image_num,
const uint32_t* channel_nums, int height, int width);
} // namespace image
} // namespace fpga
......
......@@ -907,9 +907,8 @@ int ComputeFPGASplit(const struct SplitArgs &args) {
<< " image_scale_address:" << args.scales_out[i];
}
#endif
image::split_image(args.image_in, args.scale_in, args.images_out,
args.scales_out, args.image_num, args.out_channel_nums,
args.height, args.width);
image::split_image(args.image_in, args.images_out, args.image_num,
args.out_channel_nums, args.height, args.width);
return 0;
} // ComputeFPGASplit
int ComputeDWConv(const struct DWconvArgs &args) {
......
......@@ -88,8 +88,6 @@ struct ImageOutputArgs {
activation; // To select activation and specify (Leaky)Relu parameter.
};
// #ifdef PADDLE_MOBILE_FPGA_V1
#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
struct ConvDriverParam {
uint64_t image_address_phy;
uint64_t filter_address_phy;
......@@ -141,10 +139,8 @@ struct DeconvTxParm {
uint32_t deconv_en;
uint32_t out_addr_offset;
};
#endif
struct ConvArgs {
// bool relu_enabled;
void* sb_address; // scale and bias
void* filter_address;
float* filter_scale_address;
......@@ -155,16 +151,17 @@ struct ConvArgs {
struct ImageInputArgs image; // input image;
struct ImageOutputArgs output;
// #ifdef PADDLE_MOBILE_FPGA_V1
#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
struct DeconvTxParm deconv_tx_param;
struct ConvDriverParam driver;
#endif
};
struct ConcatArgs {
uint32_t image_num;
#ifdef PADDLE_MOBILE_FPGA_V2
int8_t** images_in;
#else
int16_t** images_in;
#endif
float** scales_in;
void* image_out;
float* scale_out;
......@@ -189,7 +186,11 @@ struct SplitConvArgs {
struct SplitArgs {
uint32_t image_num;
#ifdef PADDLE_MOBILE_FPGA_V2
int8_t* image_in;
#else
int16_t* image_in;
#endif
float* scale_in;
void** images_out;
float** scales_out;
......@@ -214,12 +215,7 @@ struct EWAddArgs {
struct ImageInputArgs image0;
struct ImageInputArgs image1;
struct ImageOutputArgs output;
std::vector<float> image_in_quantVal;
std::vector<float> image_out_quantVal;
// #ifdef PADDLE_MOBILE_FPGA_V1
#if ((defined PADDLE_MOBILE_FPGA_V1) || (defined PADDLE_MOBILE_FPGA_V2))
struct EWAddDriverParam driver;
#endif
};
struct BypassArgs {
......@@ -243,7 +239,6 @@ struct DeconvArgs {
};
struct DWconvArgs {
uint32_t sub_conv_num;
// bool relu_enabled;
void* bias_address;
void* filter_address;
struct KernelArgs kernel;
......@@ -264,8 +259,6 @@ struct DWDeconvArgs {
std::vector<std::shared_ptr<char>> vector_dw_conv_space;
};
// static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x;
// }
static inline uint32_t align_to_x(int64_t num, int64_t x) {
return ((uint32_t)(num + x) - 1) / (uint32_t)x * (uint32_t)x;
}
......
......@@ -94,7 +94,6 @@ Executor<Device, T>::Executor(const Program<Device> &program,
} else {
InitMemory();
}
int count = 0;
for (auto &op_handler : ops_of_block0_) {
DLOG << "Initialize op[" << count++ << "]: " << op_handler->Type();
......@@ -319,7 +318,11 @@ bool Executor<Device, T>::varInputMemory(
const std::shared_ptr<VarDesc> &var_desc, Variable *var) const {
#ifdef PADDLE_MOBILE_FPGA
framework::LoDTensor *tensor = var->template GetMutable<LoDTensor>();
#ifdef PADDLE_MOBILE_FPGA_V2
tensor->init(type_id<int8_t>().hash_code());
#else
tensor->init(type_id<float>().hash_code());
#endif
return true;
#endif
......@@ -677,8 +680,8 @@ void Executor<Device, T>::InitQuantMemory() {
for (int i = 0; i < count; i++) {
auto tensor = GetTensorByName(inputs_vars[i]);
tensor->scale[0] = quantValList[inputs_vars[i]];
std::cout << "input variance name : " << inputs_vars[i]
<< ", scale value : " << tensor->scale[0] << std::endl;
DLOG << "input variance name : " << inputs_vars[i]
<< ", scale value : " << tensor->scale[0];
}
}
auto output_keys = op->GetOutKeys();
......@@ -689,8 +692,8 @@ void Executor<Device, T>::InitQuantMemory() {
for (int i = 0; i < count; i++) {
auto tensor = GetTensorByName(outputs_vars[i]);
tensor->scale[0] = quantValList[outputs_vars[i]];
std::cout << "output variance name : " << outputs_vars[i]
<< ", scale value : " << tensor->scale[0] << std::endl;
DLOG << "output variance name : " << outputs_vars[i]
<< ", scale value : " << tensor->scale[0];
}
}
}
......
......@@ -25,7 +25,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
auto out = param->Out();
auto image_num = inputs.size();
auto images_in =
(half **)fpga::fpga_malloc(image_num * sizeof(int *)); // NOLINT
(int8_t **)fpga::fpga_malloc(image_num * sizeof(int8_t *)); // NOLINT
auto scales_in =
(float **)fpga::fpga_malloc(image_num * sizeof(float *)); // NOLINT
auto channel_num =
......@@ -38,7 +38,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(
input->dims()[2] == height && input->dims()[3] == width,
"Image height & width should be unified");
images_in[i] = input->data<half>();
images_in[i] = input->data<int8_t>();
channel_num[i] = (uint32_t)inputs[i]->dims()[1]; // NOLINT
scales_in[i] = input->scale;
}
......@@ -48,7 +48,7 @@ bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
concatArgs.image_num = image_num;
concatArgs.images_in = images_in;
concatArgs.scales_in = scales_in;
concatArgs.image_out = out->data<half>();
concatArgs.image_out = out->data<int8_t>();
concatArgs.scale_out = out->scale;
concatArgs.channel_num = channel_num;
concatArgs.height = height;
......
......@@ -22,7 +22,6 @@ namespace operators {
template <>
bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
......@@ -35,7 +34,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
......@@ -59,8 +58,6 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] =
bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
// bs_ptr[i + channel] = new_scale_ptr[i];
// bs_ptr[i] = new_bias_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
}
......
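ConvAddBNKernel above first folds the batch-norm statistics into one per-channel scale/bias pair and only then applies the Si / So * Sf / 127 quantization factors shown earlier. A hedged sketch of the batch-norm folding step alone, with invented statistics:

#include <cmath>
#include <cstdio>
#include <vector>

// Hedged sketch: fold batch-norm (scale, bias, mean, var) plus the elementwise
// bias into a single per-channel scale/bias pair, as the kernel does before
// multiplying in the quantization factors.
int main() {
  const float epsilon = 1e-5f;                   // hypothetical epsilon
  std::vector<float> bn_scale = {1.0f, 0.8f};
  std::vector<float> bn_bias  = {0.1f, -0.1f};
  std::vector<float> bn_mean  = {0.0f, 0.2f};
  std::vector<float> bn_var   = {1.0f, 0.5f};
  std::vector<float> bias     = {0.05f, 0.02f};  // conv bias (hypothetical)

  const int channel = static_cast<int>(bn_scale.size());
  for (int i = 0; i < channel; i++) {
    const float new_scale = bn_scale[i] / std::sqrt(bn_var[i] + epsilon);
    const float new_bias = bn_bias[i] + (bias[i] - bn_mean[i]) * new_scale;
    std::printf("channel %d: new_scale %g, new_bias %g\n", i, new_scale, new_bias);
  }
  return 0;
}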
......@@ -23,7 +23,6 @@ namespace operators {
template <>
bool ConvAddBNReluKernel<FPGA, float>::Init(
FusionConvAddBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
......@@ -35,7 +34,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
const int groups = param->Groups();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
vector<int> paddings = param->Paddings();
vector<int> strides = param->Strides();
auto bn_mean_ptr = param->InputMean()->data<float>();
......@@ -60,8 +59,6 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] =
bn_bias_ptr[i] + (bias_ptr[i] - bn_mean_ptr[i]) * new_scale_ptr[i];
// bs_ptr[i + channel] = new_scale_ptr[i];
// bs_ptr[i] = new_bias_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
if (groups == channel) {
......
......@@ -21,7 +21,6 @@ namespace operators {
template <>
bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
......@@ -32,7 +31,7 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
......@@ -40,8 +39,6 @@ bool ConvAddKernel<FPGA, float>::Init(FusionConvAddParam<FPGA> *param) {
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
// bs_ptr[i + channel] = 1;
// bs_ptr[i] = bias_ptr[i];
bs_ptr[i + channel] = Si / So * Sf / 127.0;
bs_ptr[i] = bias_ptr[i] * 127.0 / So;
}
......
......@@ -21,7 +21,6 @@ namespace operators {
template <>
bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
......@@ -32,7 +31,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
......@@ -40,8 +39,6 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
// bs_ptr[i + channel] = 1;
// bs_ptr[i] = bias_ptr[i];
bs_ptr[i + channel] = Si / So * Sf / 127.0;
bs_ptr[i] = bias_ptr[i] * 127.0 / So;
}
......
......@@ -22,7 +22,6 @@ namespace operators {
template <>
bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
......@@ -31,7 +30,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -51,8 +50,6 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
// bs_ptr[i + channel] = new_scale_ptr[i];
// bs_ptr[i] = new_bias_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
}
......
......@@ -29,7 +29,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -48,8 +48,6 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
new_scale_ptr[i] = bn_scale_ptr[i] /
static_cast<float>(pow((bn_var_ptr[i] + epsilon), 0.5));
new_bias_ptr[i] = bn_bias_ptr[i] + (0 - bn_mean_ptr[i]) * new_scale_ptr[i];
// bs_ptr[i + channel] = new_scale_ptr[i];
// bs_ptr[i] = new_bias_ptr[i];
bs_ptr[i + channel] = new_scale_ptr[i] * Si / So * Sf / 127.0;
bs_ptr[i] = new_bias_ptr[i] * 127.0 / So;
if (groups == channel) {
......
......@@ -29,13 +29,11 @@ bool ConvKernel<FPGA, float>::Init(ConvParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
int channel = out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
// bs_ptr[i + channel] = 1;
// bs_ptr[i] = 0;
bs_ptr[i + channel] = Si / So * Sf / 127.0;
bs_ptr[i] = 0;
}
......
......@@ -23,21 +23,16 @@ namespace operators {
template <>
bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto input = const_cast<LoDTensor *>(param->Input());
// const Tensor *bias = param->Bias();
// auto bias_ptr = bias->data<float>();
auto filter = const_cast<LoDTensor *>(param->Filter());
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
// PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
// "Output channel should be equal to bias number");
int channel = out->dims()[1];
int sub_conv_n = param->Strides()[0];
......@@ -46,7 +41,7 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = 0; // bias_ptr[i % (channel)];
bs_ptr[i] = 0;
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
......@@ -58,7 +53,7 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
if (param->Groups() == channel) {
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = Si / So;
bs_ptr[i] = 0; // bias_ptr[i % (channel)];
bs_ptr[i] = 0;
}
fpga::format_DWDeconv_data(filter, out, &bs_ptr, param->Groups(),
sub_conv_n);
......@@ -71,7 +66,7 @@ bool ConvTransposeKernel<FPGA, float>::Init(ConvTransposeParam<FPGA> *param) {
} else {
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = Si / So * Sf / 127.0f;
bs_ptr[i] = 0; // bias_ptr[i % (channel)];
bs_ptr[i] = 0;
}
fpga::format_deconv_data(filter, out, &bs_ptr, param->Groups(), sub_conv_n);
fpga::DeconvArgs deconv_arg = {0};
......
......@@ -23,7 +23,6 @@ namespace operators {
template <>
bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
......@@ -34,7 +33,7 @@ bool DeconvAddBNKernel<FPGA, float>::Init(FusionDeconvAddBNParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
......
......@@ -24,7 +24,6 @@ namespace operators {
template <>
bool DeconvAddBNReluKernel<FPGA, float>::Init(
FusionDeconvAddBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
......@@ -35,7 +34,7 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
......@@ -87,7 +86,6 @@ bool DeconvAddBNReluKernel<FPGA, float>::Init(
template <>
void DeconvAddBNReluKernel<FPGA, float>::Compute(
const FusionDeconvAddBNReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
......
......@@ -23,7 +23,6 @@ namespace operators {
template <>
bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
......@@ -34,7 +33,7 @@ bool DeconvAddKernel<FPGA, float>::Init(FusionDeconvAddParam<FPGA> *param) {
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
......
......@@ -24,7 +24,6 @@ namespace operators {
template <>
bool DeconvAddReluKernel<FPGA, float>::Init(
FusionDeconvAddReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
......@@ -35,7 +34,7 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
......@@ -44,11 +43,6 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = 1;
bs_ptr[i] = bias_ptr[i % (channel)];
}
PADDLE_MOBILE_ENFORCE(param->Strides()[1] == param->Strides()[0],
"stride_width should be equal to stride_height ");
PADDLE_MOBILE_ENFORCE(filter->dims()[2] == filter->dims()[3],
......@@ -87,7 +81,6 @@ bool DeconvAddReluKernel<FPGA, float>::Init(
template <>
void DeconvAddReluKernel<FPGA, float>::Compute(
const FusionDeconvAddReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
......
......@@ -25,7 +25,6 @@ namespace operators {
template <>
bool DeconvBNReluKernel<FPGA, float>::Init(
FusionDeconvBNReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
......@@ -36,7 +35,7 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
auto out = param->Output();
float Si = input->scale[0];
float So = out->scale[0];
float Sf = fpga::filter_find_max(filter);
float Sf = fpga::filter_find_max(filter) / 127;
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -59,10 +58,6 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
int sub_conv_n = param->Strides()[0];
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sub_conv_n * // NOLINT
sizeof(float)); // NOLINT
// for (int i = 0; i < channel * sub_conv_n; i++) {
// bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel];
// bs_ptr[i] = new_bias_ptr[i % (channel)];
// }
if (param->Groups() == channel) {
for (int i = 0; i < channel * sub_conv_n; i++) {
bs_ptr[i + sub_conv_n * channel] = new_scale_ptr[i % channel] * Si / So;
......@@ -107,7 +102,6 @@ bool DeconvBNReluKernel<FPGA, float>::Init(
template <>
void DeconvBNReluKernel<FPGA, float>::Compute(
const FusionDeconvBNReluParam<FPGA> &param) {
// fpga::ComputeFpgaDeconv(param.FpgaArgs());
if (param.Groups() == param.Output()->dims()[1]) {
fpga::ComputeDWDeconv(param.FpgaDWDconvArgs());
} else {
......
......@@ -25,170 +25,50 @@ template <>
bool ElementwiseAddKernel<FPGA, float>::Init(ElementwiseAddParam<FPGA> *param) {
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
if (input_y->type() != type_id<float>()) {
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<half>();
float Si_1 = input_x->scale[0];
float Si_2 = input_y->scale[0];
float So = out->scale[0];
float C1 = Si_1 / So;
float C2 = Si_2 / So;
fpga::EWAddArgs ewaddArgs = {0};
// ewaddArgs.relu_enabled = relu_enabled;
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = 0x3c00; // =1
ewaddArgs.const1 = 0x3c00; // =1
ewaddArgs.image0.address = input_x_ptr;
ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
ewaddArgs.image0.scale_address = input_x->scale;
ewaddArgs.image0.height = (uint32_t)input_x->dims()[2];
ewaddArgs.image0.width = (uint32_t)input_x->dims()[3];
ewaddArgs.image0.pad_height = 0;
ewaddArgs.image0.pad_width = 0;
ewaddArgs.image1.address = input_y_ptr;
ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1];
ewaddArgs.image1.scale_address = input_y->scale;
ewaddArgs.image1.height = (uint32_t)input_y->dims()[2];
ewaddArgs.image1.width = (uint32_t)input_y->dims()[3];
ewaddArgs.image1.pad_height = 0;
ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr;
fpga::expand_EW_arg(&ewaddArgs);
param->SetFpgaArgs(ewaddArgs);
} else {
param->float_input_x.Resize(param->InputX()->dims());
param->float_input_x.init(type_id<float>().hash_code());
fpga::format_fp32_ofm(&(param->float_input_x));
param->float_out.Resize(param->InputX()->dims());
param->float_out.mutable_data<float>(param->InputX()->dims());
fpga::format_fp32_ofm(&(param->float_out));
fpga::format_fp16_ofm(out);
}
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_ofm(out);
auto out_ptr = out->mutable_data<half>();
float Si_1 = input_x->scale[0];
float Si_2 = input_y->scale[0];
float So = out->scale[0];
float C1 = Si_1 / So;
float C2 = Si_2 / So;
fpga::EWAddArgs ewaddArgs = {0};
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = fpga::fp32_2_fp16(C1);
ewaddArgs.const1 = fpga::fp32_2_fp16(C2);
ewaddArgs.image0.address = input_x_ptr;
ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
ewaddArgs.image0.scale_address = input_x->scale;
ewaddArgs.image0.height = (uint32_t)input_x->dims()[2];
ewaddArgs.image0.width = (uint32_t)input_x->dims()[3];
ewaddArgs.image0.pad_height = 0;
ewaddArgs.image0.pad_width = 0;
ewaddArgs.image1.address = input_y_ptr;
ewaddArgs.image1.channels = (uint32_t)input_y->dims()[1];
ewaddArgs.image1.scale_address = input_y->scale;
ewaddArgs.image1.height = (uint32_t)input_y->dims()[2];
ewaddArgs.image1.width = (uint32_t)input_y->dims()[3];
ewaddArgs.image1.pad_height = 0;
ewaddArgs.image1.pad_width = 0;
ewaddArgs.output.scale_address = out->scale;
ewaddArgs.output.address = out_ptr;
fpga::expand_EW_arg(&ewaddArgs);
param->SetFpgaArgs(ewaddArgs);
return true;
}
inline void ElementwiseAddCompute(const ElementwiseAddParam<FPGA> &param) {
auto input_x = param.float_input_x;
auto input_y = param.InputY();
auto Out = param.float_out;
int axis = param.Axis();
const auto &x_dims = input_x.dims();
const auto &y_dims = input_y->dims();
/// axis = -1 represent the last dimensions.
axis = (axis == -1 ? x_dims.size() - y_dims.size() : axis);
size_t batch = 1;
size_t channels = 1;
size_t elementwise_num = 1;
for (int i = 0; i < axis; ++i) {
batch *= x_dims[i];
}
for (int i = 0; i < y_dims.size(); ++i) {
channels *= y_dims[i];
}
for (int i = y_dims.size() + axis; i < x_dims.size(); ++i) {
elementwise_num *= x_dims[i];
}
const float *bias_data = input_y->data<float>();
const float *input_data = input_x.data<float>();
float *output_data = Out.mutable_data<float>();
for (int i = 0; i < batch; ++i) {
for (int j = 0; j < channels; ++j) {
size_t offset = (i * channels + j) * elementwise_num;
const float *input = input_data + offset;
const float bias = bias_data[j];
float *output = output_data + offset;
// DLOG << "output address: "<< output;
for (int k = 0; k < elementwise_num; ++k) {
output[k] = input[k] + bias;
// DLOG << "output[" << k << "]= " << output[k] ;
}
}
}
}
template <>
void ElementwiseAddKernel<FPGA, float>::Compute(
const ElementwiseAddParam<FPGA> &param) {
auto input_y = const_cast<LoDTensor *>(param.InputY());
if (input_y->type() != type_id<float>()) {
fpga::ComputeFpgaEWAdd(param.FpgaArgs());
} else {
auto input_x = const_cast<LoDTensor *>(param.InputX());
auto intput_x_float = const_cast<Tensor *>(&(param.float_input_x));
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = input_x->data<half>();
args.image.channels = (uint32_t)(input_x->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = intput_x_float->data<float>();
args.output.scale_address = intput_x_float->scale;
// fpga::fpga_flush(input_x->data<half>(),input_x->fpga_data_num *
// sizeof(half));
fpga::PerformBypass(args);
fpga::fpga_invalidate(args.output.address,
input_x->fpga_data_num * sizeof(float));
// just for test
/* {
static int cnt = 0;
if(cnt == 0){
std::string str= "first_bypass_data";
float rslt = 0.0f;
fpga::savefile(str, args.output.address, input_x->fpga_data_num,
rslt); cnt++;
}
}*/
ElementwiseAddCompute(param);
auto out_float = const_cast<Tensor *>(&(param.float_out));
DLOG << "out float: " << out_float->data<float>();
fpga::fpga_flush(out_float->data<float>(),
input_x->fpga_data_num * sizeof(float));
// just for test
/*{
static int cnt = 0;
if(cnt == 0){
std::string str= "ew_output_data";
float rslt = 0.0f;
fpga::savefile(str, out_float->data<float>(), input_x->fpga_data_num,
rslt); cnt++;
}
}*/
auto Out = param.Out();
args.input_data_type = fpga::DATA_TYPE_FP32;
args.output_data_type = fpga::DATA_TYPE_FP16;
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = out_float->data<float>();
args.image.channels = (uint32_t)(input_x->fpga_data_num);
args.image.height = 1;
args.image.width = 1;
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = Out->data<half>();
args.output.scale_address = Out->scale;
fpga::PerformBypass(args);
}
fpga::ComputeFpgaEWAdd(param.FpgaArgs());
}
} // namespace operators
} // namespace paddle_mobile
......
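The rewritten ElementwiseAddKernel above stops hard-coding the hardware constants to 1.0 (0x3c00 in fp16) and instead derives them from the static scales, C1 = Si_1 / So and C2 = Si_2 / So, before packing them as fp16. A hedged sketch of that computation; to_fp16_bits below is a simplified stand-in, not the project's fpga::fp32_2_fp16:

#include <cstdint>
#include <cstdio>
#include <cstring>

// Simplified, truncating float -> half bit conversion for illustration only
// (no rounding, NaN or denormal handling); the kernel itself calls
// fpga::fp32_2_fp16, which is the authoritative routine.
static uint16_t to_fp16_bits(float value) {
  uint32_t f = 0;
  std::memcpy(&f, &value, sizeof(f));
  const uint16_t sign = static_cast<uint16_t>((f >> 16) & 0x8000u);
  const int32_t exp = static_cast<int32_t>((f >> 23) & 0xFFu) - 127 + 15;
  const uint16_t mant = static_cast<uint16_t>((f >> 13) & 0x03FFu);
  if (exp <= 0) return sign;                                    // flush to zero
  if (exp >= 31) return static_cast<uint16_t>(sign | 0x7C00u);  // clamp to infinity
  return static_cast<uint16_t>(sign | (exp << 10) | mant);
}

int main() {
  // Hypothetical static scales for the two inputs and the output.
  const float Si_1 = 0.10f, Si_2 = 0.20f, So = 0.25f;
  const float C1 = Si_1 / So;  // replaces the old hard-coded 0x3c00 (= 1.0) constant
  const float C2 = Si_2 / So;
  std::printf("C1 = %g (0x%04x), C2 = %g (0x%04x)\n",
              C1, static_cast<unsigned>(to_fp16_bits(C1)),
              C2, static_cast<unsigned>(to_fp16_bits(C2)));
  return 0;
}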
......@@ -21,29 +21,27 @@ namespace operators {
template <>
bool ElementwiseAddReluKernel<FPGA, float>::Init(
ElementwiseAddReluParam<FPGA> *param) {
// bool relu_enabled = true;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto *input_y = const_cast<LoDTensor *>(param->InputY());
auto *out = param->Out();
auto input_x_ptr = input_x->data<half>();
auto input_y_ptr = input_y->data<half>();
fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<half>();
auto input_x_ptr = input_x->data<int8_t>();
auto input_y_ptr = input_y->data<int8_t>();
fpga::format_ofm(out);
auto out_ptr = out->mutable_data<int8_t>();
float Si_1 = input_x->scale[0];
float Si_2 = input_y->scale[0];
float So = out->scale[0];
float C1 = Si_1 / So;
float C2 = Si_2 / So;
fpga::EWAddArgs ewaddArgs = {0};
// ewaddArgs.relu_enabled = relu_enabled;
ewaddArgs.output.activation.activation_type = activation_enable;
ewaddArgs.output.activation.leaky_relu_negative_slope =
leaky_relu_negative_slope;
ewaddArgs.const0 = 0x3c00; // =1
ewaddArgs.const1 = 0x3c00; // =1
ewaddArgs.const0 = fpga::fp32_2_fp16(C1);
ewaddArgs.const1 = fpga::fp32_2_fp16(C2);
ewaddArgs.image0.address = input_x_ptr;
ewaddArgs.image0.channels = (uint32_t)input_x->dims()[1];
ewaddArgs.image0.scale_address = input_x->scale;
......
......@@ -20,7 +20,6 @@ namespace operators {
template <>
bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::NONE;
int16_t leaky_relu_negative_slope = 0;
......@@ -30,17 +29,13 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
auto input_z_ptr = input_z->data<float>();
auto out = param->Out();
float Si = input_x->scale[0];
float Sf = filter->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float So = out->scale[0];
// PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
// "Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
// bs_ptr[i + channel] = 1;
// bs_ptr[i] = input_z_ptr[i];
bs_ptr[i + channel] = Si / So * Sf / 127.0f;
bs_ptr[i] = input_z_ptr[i] * 127.0f / So;
}
......@@ -60,7 +55,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_ofm(out);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
......
......@@ -20,7 +20,6 @@ namespace operators {
template <>
bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
// bool relu_enabled = false;
paddle_mobile::fpga::ActivationType activation_enable =
paddle_mobile::fpga::LEAKYRELU;
int16_t leaky_relu_negative_slope = 0;
......@@ -30,17 +29,13 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
auto input_z_ptr = input_z->data<float>();
auto out = param->Out();
float Si = input_x->scale[0];
float Sf = filter->scale[0];
float Sf = fpga::filter_find_max(filter) / 127;
float So = out->scale[0];
// PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
// "Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto bs_ptr =
(float *)fpga::fpga_malloc(2 * channel * sizeof(float)); // NOLINT
for (int i = 0; i < channel; i++) {
// bs_ptr[i + channel] = 1;
// bs_ptr[i] = input_z_ptr[i];
bs_ptr[i + channel] = Si / So * Sf / 127.0f;
bs_ptr[i] = input_z_ptr[i] * 127.0f / So;
}
......@@ -60,7 +55,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
fpga::format_fp16_ofm(out);
fpga::format_ofm(out);
fpga::SplitConvArgs conv_arg = {0};
fpga::fill_split_arg(&conv_arg, input_x, out, filter, activation_enable,
......
......@@ -41,9 +41,9 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
return true;
}
auto input_ptr = input->data<half>();
fpga::format_fp16_ofm(output);
auto output_ptr = output->mutable_data<half>();
auto input_ptr = input->data<int8_t>();
fpga::format_ofm(output);
auto output_ptr = output->mutable_data<int8_t>();
float Si = input->scale[0];
float So = output->scale[0];
......
......@@ -48,8 +48,8 @@ bool Reshape2Kernel<FPGA, float>::Init(Reshape2Param<FPGA> *param) {
void reshape(LoDTensor *input, LoDTensor *output) {
// Subscript r means after reshape
auto input_ptr = input->data<half>();
auto output_ptr = output->data<half>();
auto input_ptr = input->data<int8_t>();
auto output_ptr = output->data<int8_t>();
output->scale[0] = input->scale[0];
output->scale[1] = input->scale[1];
......@@ -67,7 +67,7 @@ void reshape(LoDTensor *input, LoDTensor *output) {
auto WCr_align = fpga::align_to_x(WCr, IMAGE_ALIGNMENT);
auto HWr = Hr * Wr;
fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(half));
fpga::fpga_invalidate(input_ptr, H * WC_align * sizeof(int8_t));
int offset_align = 0;
int offset_r = 0, offset_align_r = 0;
......@@ -89,7 +89,7 @@ void reshape(LoDTensor *input, LoDTensor *output) {
}
}
fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(half));
fpga::fpga_flush(output_ptr, Hr * WCr_align * sizeof(int8_t));
}
template <>
......
......@@ -22,7 +22,7 @@ namespace operators {
template <>
bool SliceKernel<FPGA, float>::Init(SliceParam<FPGA>* param) {
auto output = param->output_;
fpga::format_fp16_ofm(output);
fpga::format_ofm(output);
DLOG << "input: " << param->input_;
DLOG << "output: " << param->output_;
if (param->input_->type() != type_id<half>()) {
......@@ -40,8 +40,8 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
auto output = param.output_;
int HW = input->dims()[2] * input->dims()[3];
int channel = input->dims()[1];
auto input_ptr = input->data<half>();
auto output_ptr = output->data<half>();
auto input_ptr = input->data<int8_t>();
auto output_ptr = output->data<int8_t>();
output->scale[0] = input->scale[0];
output->scale[1] = input->scale[1];
......@@ -52,7 +52,7 @@ void SliceKernel<FPGA, float>::Compute(const SliceParam<FPGA>& param) {
start = start > channel ? channel : start;
end = end > channel ? channel : end;
int len = end - start;
size_t size = len * sizeof(half);
size_t size = len * sizeof(int8_t);
for (int i = 0; i < HW; i++) {
memcpy(output_ptr + len * i, input_ptr + i * channel + start, size);
......
......@@ -38,7 +38,7 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
for (int i = 0; i < image_num; i++) {
fpga::format_fp16_ofm(outs[i]);
DLOG << "output: " << outs[i];
images_out[i] = outs[i]->mutable_data<half>();
images_out[i] = outs[i]->mutable_data<int8_t>();
scales_out[i] = outs[i]->scale;
out_channels[i] = (uint32_t)sections[i];
}
......@@ -47,7 +47,7 @@ bool SplitKernel<FPGA, float>::Init(SplitParam<FPGA> *param) {
fpga::SplitArgs arg = {0};
arg.image_num = image_num;
arg.image_in = in->data<half>();
arg.image_in = in->data<int8_t>();
arg.scale_in = in->scale;
arg.images_out = images_out;
arg.scales_out = scales_out;
......