提交 5892754e 编写于 作者: qnqinan's avatar qnqinan 提交者: GitHub

Merge pull request #936 from zhangyang0701/develop

Implement concat op for the FPGA track (closes #935)
......@@ -89,8 +89,14 @@ DLOG << " kernel_height:" << args.kernel.height
DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address;*/
#endif
int split_num = args.split_num;
for (int i = 0; i < split_num; i++) {
do_ioctl(IOCTL_CONFIG_CONV, &args.conv_args[i]);
}
return do_ioctl(IOCTL_CONFIG_CONV, &args);
if (split_num > 1) {
ComputeFPGAConcat(args.concat_arg);
}
}
int ComputeFpgaPool(const struct PoolingArgs &args) {
......@@ -155,9 +161,16 @@ int PerformBypass(const struct BypassArgs &args) {
return do_ioctl(IOCTL_CONFIG_BYPASS, &args);
}
// Executes the concat described by `args` by handing every field of the
// argument struct to image::concat_images.
//
// Always returns 0: image::concat_images returns void, so there is no status
// to propagate.
int ComputeFPGAConcat(const struct ConcatArgs &args) {
  const struct ConcatArgs &concat = args;
  image::concat_images(concat.images_in, concat.scales_in,  // per-image inputs
                       concat.image_out, concat.scale_out,  // destination
                       concat.image_num, concat.channel_num,
                       concat.height, concat.width);
  return 0;
}
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
int channel = dims[1], height = dims[2], width = dims[3];
auto channel = dims[1], height = dims[2], width = dims[3];
auto data_ptr = image_tensor->mutable_data<float>();
size_t memory_size = channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
......@@ -168,7 +181,7 @@ void format_image(framework::Tensor *image_tensor) {
void format_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
int channel = dims[1], height = dims[2], width = dims[3];
auto channel = dims[1], height = dims[2], width = dims[3];
size_t memory_size =
height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(half);
ofm_tensor->reset_data_ptr(fpga_malloc(memory_size));
......@@ -181,16 +194,16 @@ float filter_find_max(framework::Tensor *filter_tensor) {
// Returns the split count that filter::calc_split_num reports for this filter
// tensor.
//
// filter_tensor: tensor whose dims are read as (num, channel, height, width),
//                the same order format_filter uses.
int get_plit_num(framework::Tensor *filter_tensor) {
  auto dims = filter_tensor->dims();
  // Element count of one filter: channel * height * width.
  auto chw = dims[1] * dims[2] * dims[3];
  auto num = dims[0];
  // `chw` and `num` are declared exactly once; a second declaration of the
  // same names in this scope is a redeclaration error.
  int div_capacity = filter::calc_division_capacity(chw);
  return filter::calc_split_num(num, div_capacity);
}
// Returns the per-division filter count that filter::calc_num_per_div reports
// for this filter tensor.
//
// filter_tensor: tensor whose dims are read as (num, channel, height, width),
//                the same order format_filter uses.
// group_num:     forwarded unchanged to filter::calc_num_per_div.
int get_element_num_per_div(framework::Tensor *filter_tensor, int group_num) {
  auto dims = filter_tensor->dims();
  // Element count of one filter: channel * height * width.
  auto chw = dims[1] * dims[2] * dims[3];
  auto num = dims[0];
  // `chw` and `num` are declared exactly once; a second declaration of the
  // same names in this scope is a redeclaration error.
  int div_capacity = filter::calc_division_capacity(chw);
  return filter::calc_num_per_div(num, group_num, div_capacity);
}
......@@ -206,25 +219,10 @@ int get_aligned_filter_num(int num) {
// Re-formats a filter tensor's data and installs the result as the tensor's
// data pointer.
//
// The float data is first copied into a buffer obtained from fpga_malloc.
// filter::format_filter receives the address of that buffer pointer, so it
// may rewrite the data in place or swap the pointer; whatever pointer it
// leaves behind is what the tensor ends up pointing at.
//
// filter_tensor: tensor whose dims are read as (num, channel, height, width).
// max_value:     forwarded unchanged to filter::format_filter.
// group_num:     forwarded unchanged to filter::format_filter.
void format_filter(framework::Tensor *filter_tensor, float max_value,
                   int group_num) {
  auto dims = filter_tensor->dims();
  // Declared once only; repeating these names in the same scope is a
  // redeclaration error.
  auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
  auto data_ptr = filter_tensor->mutable_data<float>();
  size_t memory_size = num * channel * height * width * sizeof(float);
  float *new_data = (float *)fpga_malloc(memory_size);
  fpga_copy(new_data, data_ptr, memory_size);
  filter::format_filter(&new_data, num, channel, height, width, group_num,
                        max_value);
  filter_tensor->reset_data_ptr(new_data);
}
void format_fc_matrix(framework::Tensor *filter_tensor, float max_value,
int group_num, int height, int width) {
auto dims = filter_tensor->dims();
PADDLE_MOBILE_ENFORCE(height == 1 && width == 1,
"IFM should be flattened for FC");
int num = dims[1], channel = dims[0] / height / width;
auto data_ptr = filter_tensor->mutable_data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
auto new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
filter::format_filter(&new_data, num, channel, height, width, group_num,
max_value);
......@@ -237,5 +235,19 @@ void format_bias_scale_array(float **bias_scale_array,
element_num_per_division, num);
}
// Sizes and allocates the output tensor of a concat.
//
// The output channel count is the sum of channel_num[0..image_num). A buffer
// of height * align_to_x(width * channels, IMAGE_ALIGNMENT) half elements is
// taken from fpga_malloc, the tensor is resized to
// {-1, channels, height, width}, and the buffer becomes its data pointer.
void format_concat_output(framework::Tensor *out, int height, int width,
                          int image_num, uint32_t *channel_num) {
  int total_channels = 0;
  int idx = 0;
  while (idx < image_num) {
    total_channels += channel_num[idx];
    ++idx;
  }

  // Row length in elements, padded up to the image alignment.
  int aligned_cw = align_to_x(width * total_channels, IMAGE_ALIGNMENT);
  auto buffer = fpga_malloc(height * aligned_cw * sizeof(half));

  out->Resize(framework::make_ddim({-1, total_channels, height, width}));
  out->reset_data_ptr(buffer);
}
} // namespace fpga
} // namespace paddle_mobile
......@@ -92,12 +92,24 @@ struct ConvArgs {
struct ImageOutputArgs output;
};
// Arguments for one concat operation. ComputeFPGAConcat forwards these fields
// unchanged to image::concat_images.
struct ConcatArgs {
// Number of input images; the concat kernel allocates images_in, scales_in
// and channel_num with this many entries.
uint32_t image_num;
// One data pointer per input image.
half** images_in;
// One scale pointer per input image (the kernel stores each input tensor's
// `scale` member here).
float** scales_in;
// Destination buffer for the concatenated image.
void* image_out;
// Scale pointer of the destination (the kernel stores the output tensor's
// `scale` member here).
float* scale_out;
// Channel count of each input image (the kernel fills it from dims()[1]).
uint32_t* channel_num;
// Height and width shared by the inputs (the concat kernel enforces that all
// inputs agree on dims()[2] and dims()[3]).
uint32_t height;
uint32_t width;
};
// Bundle of per-split convolution arguments plus the concat that joins the
// split outputs.
struct WrapperConvArgs {
// Number of ConvArgs entries; ComputeFpgaConv issues one
// IOCTL_CONFIG_CONV per entry of conv_args.
uint32_t split_num;
uint32_t group_num;
// The kernels set this from filter->dims()[0].
uint32_t filter_num;
// Address and scale address of the final output.
struct ImageOutputArgs output;
// NOTE(review): both `args` and `conv_args` are declared here, while
// ComputeFpgaConv reads `conv_args` — confirm whether `args` is still needed.
struct ConvArgs* args;
struct ConvArgs* conv_args;
// Passed to ComputeFPGAConcat by ComputeFpgaConv when split_num > 1.
struct ConcatArgs concat_arg;
};
struct PoolingArgs {
......@@ -176,6 +188,7 @@ int PerformBypass(const struct BypassArgs& args);
int ComputeFpgaConv(const struct WrapperConvArgs& args);
int ComputeFpgaPool(const struct PoolingArgs& args);
int ComputeFpgaEWAdd(const struct EWAddArgs& args);
int ComputeFPGAConcat(const struct ConcatArgs& args);
// Returns ((num + x - 1) / x) * x using integer division, i.e. `num` rounded
// up to the next multiple of `x` for non-negative `num` and positive `x`.
static inline int align_to_x(int num, int x) {
  const int chunk_count = (num + x - 1) / x;
  return chunk_count * x;
}
void format_image(framework::Tensor* image_tensor);
......@@ -188,10 +201,10 @@ int get_aligned_filter_num(int num);
void format_filter(framework::Tensor* filter_tensor, float max_value,
int group_num);
void format_fc_matrix(framework::Tensor* filter_tensor, float max_value,
int group_num, int height = 1, int width = 1);
void format_bias_scale_array(float** bias_scale_array,
int element_num_per_division, int num);
void format_concat_output(framework::Tensor* out, int height, int width,
int image_num, uint32_t* channel_num);
} // namespace fpga
} // namespace paddle_mobile
......@@ -62,6 +62,10 @@ void format_image(float **data_in, int channel, int height, int width) {
align_element_conv(data_in, height, channel * width);
}
// Placeholder for the feature-map concat: the body performs no work, so
// neither the inputs nor image_out / scale_out are read or written.
void concat_images(int16_t **images_in, float **scales_in, void *image_out,
                   float *scale_out, int image_num, uint32_t *channel_num,
                   int height, int width) {
  // Every parameter is deliberately unused until the concat is implemented.
  (void)images_in;
  (void)scales_in;
  (void)image_out;
  (void)scale_out;
  (void)image_num;
  (void)channel_num;
  (void)height;
  (void)width;
}
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
......@@ -13,6 +13,9 @@ See the License for the specific language governing permissions and
limitations under the License. */
#pragma once
#include <stdint.h>
#define IMAGE_ALIGNMENT 16 // Aligned to 16
namespace paddle_mobile {
namespace fpga {
......@@ -21,6 +24,10 @@ namespace image {
void convert_to_hwc(float** data_in, int channel, int height, int width);
void align_element_conv(float** data_in, int height, int cw);
void format_image(float** data_in, int channel, int height, int width);
void concat_images(int16_t** images_in, float** scales_in, void* image_out,
float* scale_out, int image_num, uint32_t* channel_num,
int height,
int width); // Concat featuremaps along channel direction
} // namespace image
} // namespace fpga
} // namespace paddle_mobile
......@@ -21,31 +21,44 @@ namespace operators {
// Prepares the FPGA concat: gathers the data pointer, scale pointer and
// channel count of every input, allocates the output through
// fpga::format_concat_output, and stores the resulting fpga::ConcatArgs on
// the param for Compute to use.
//
// Enforces that there is at least one input and that every input has the
// same height (dims()[2]) and width (dims()[3]) as the first one.
// Returns true once the arguments have been stored.
template <>
bool ConcatKernel<FPGA, float>::Init(ConcatParam<FPGA> *param) {
  auto inputs = param->Inputs();
  auto out = param->Out();
  auto image_num = inputs.size();
  // inputs[0] is read below for the reference height/width, so an empty
  // input list must be rejected first.
  PADDLE_MOBILE_ENFORCE(image_num > 0, "Concat needs at least one input");

  // Element size now matches the element type of the array (half *).
  auto images_in = (half **)fpga::fpga_malloc(image_num * sizeof(half *));
  auto scales_in = (float **)fpga::fpga_malloc(image_num * sizeof(float *));
  auto channel_num =
      (uint32_t *)fpga::fpga_malloc(image_num * sizeof(uint32_t));

  auto height = inputs[0]->dims()[2];
  auto width = inputs[0]->dims()[3];
  // Index type follows image_num to avoid a signed/unsigned comparison.
  for (decltype(image_num) i = 0; i < image_num; i++) {
    auto input = inputs[i];
    PADDLE_MOBILE_ENFORCE(
        input->dims()[2] == height && input->dims()[3] == width,
        "Image height & width should be unified");
    images_in[i] = (half *)input->data<float>();
    channel_num[i] = (uint32_t)input->dims()[1];
    scales_in[i] = input->scale;
  }
  fpga::format_concat_output(out, (int)height, (int)width, (int)image_num,
                             channel_num);

  fpga::ConcatArgs concatArgs;
  concatArgs.image_num = (uint32_t)image_num;
  concatArgs.images_in = images_in;
  concatArgs.scales_in = scales_in;
  concatArgs.image_out = (half *)out->mutable_data<float>();
  concatArgs.scale_out = out->scale;
  concatArgs.channel_num = channel_num;
  concatArgs.height = (uint32_t)height;
  concatArgs.width = (uint32_t)width;
  param->SetFpgaArgs(concatArgs);
  return true;
}
// Runs the concat with the arguments that Init stored on the param.
//
// The earlier software copy loop is removed. It copied the same single half
// element to the same destination on every iteration, advanced the output
// offset before its first use, and read the channel count from dims()[3]
// while Init reads it from dims()[1]. All data movement is now left to
// ComputeFPGAConcat.
template <>
void ConcatKernel<FPGA, float>::Compute(const ConcatParam<FPGA> &param) const {
  ComputeFPGAConcat(param.FpgaArgs());
}
template class ConcatKernel<FPGA, float>;
......
......@@ -22,13 +22,13 @@ namespace operators {
template <>
bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
bool relu_enabled = false;
auto *input = const_cast<Tensor *>(param->Input());
auto input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
const Tensor *bias = param->Bias();
auto bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto *filter = const_cast<Tensor *>(param->Filter());
auto filter = const_cast<Tensor *>(param->Filter());
Tensor *out = param->Output();
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
......@@ -40,10 +40,10 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
auto *bs_ptr =
auto bs_ptr =
reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
auto *new_scale = new Tensor();
auto *new_bias = new Tensor();
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
......@@ -75,35 +75,68 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num *
sizeof(fpga::ConvArgs));
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
convArgs.concat_arg.image_num = convArgs.split_num;
convArgs.concat_arg.image_out = out_ptr;
convArgs.concat_arg.scale_out = out->scale;
convArgs.concat_arg.height = (uint32_t)filter->dims()[2];
convArgs.concat_arg.width = (uint32_t)filter->dims()[3];
int n = convArgs.split_num;
convArgs.concat_arg.images_in = (half **)fpga::fpga_malloc(n * sizeof(int *));
convArgs.concat_arg.scales_in =
(float **)fpga::fpga_malloc(n * sizeof(float *));
convArgs.concat_arg.channel_num =
(uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
convArgs.concat_arg.image_out = out_ptr;
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.args[i].relu_enabled = relu_enabled;
convArgs.args[i].group_num = (uint32_t)param->Groups();
convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.args[i].image.address = input_ptr;
convArgs.args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.args[i].image.height = (uint32_t)input->dims()[2];
convArgs.args[i].image.width = (uint32_t)input->dims()[3];
convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.args[i].filter_num =
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
convArgs.conv_args[i].image.scale_address = input->scale;
convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.args[i].image.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
if (n > 1) {
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].output.address =
fpga::fpga_malloc(input->dims()[2] * input->dims()[3] *
convArgs.conv_args[i].filter_num * sizeof(half));
}
else {
convArgs.conv_args[i].output.scale_address = out->scale;
convArgs.conv_args[i].output.address = out_ptr;
}
convArgs.concat_arg.images_in[i] =
(half *)convArgs.conv_args[i].output.address;
convArgs.concat_arg.scales_in[i] =
(float *)convArgs.conv_args[i].sb_address;
convArgs.concat_arg.channel_num[i] = convArgs.conv_args[i].filter_num;
}
return true;
}
......
......@@ -23,12 +23,12 @@ template <>
bool ConvAddBNReluKernel<FPGA, float>::Init(
FusionConvAddBNReluParam<FPGA> *param) {
bool relu_enabled = true;
Tensor *input = const_cast<Tensor *>(param->Input());
auto input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
Tensor *filter = const_cast<Tensor *>(param->Filter());
Tensor *out = param->Output();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -39,9 +39,9 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
......@@ -73,8 +73,8 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num *
sizeof(fpga::ConvArgs));
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
......@@ -82,26 +82,28 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.args[i].relu_enabled = relu_enabled;
convArgs.args[i].group_num = (uint32_t)param->Groups();
convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.args[i].image.address = input_ptr;
convArgs.args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.args[i].image.height = (uint32_t)input->dims()[2];
convArgs.args[i].image.width = (uint32_t)input->dims()[3];
convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.args[i].filter_num =
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.args[i].image.scale_address =
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input->scale;
}
return true;
return true;
......
......@@ -22,17 +22,17 @@ namespace operators {
template <>
bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
bool relu_enabled = true;
Tensor *input = const_cast<Tensor *>(param->Input());
auto input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
const Tensor *bias = param->Bias();
auto bias_ptr = bias->data<float>();
auto *filter = const_cast<Tensor *>(param->Filter());
Tensor *out = param->Output();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == bias->dims()[0],
"Output channel should be equal to bias number");
int channel = out->dims()[1];
auto *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = bias_ptr[i];
......@@ -55,8 +55,8 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num *
sizeof(fpga::ConvArgs));
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
......@@ -64,26 +64,28 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.args[i].relu_enabled = relu_enabled;
convArgs.args[i].group_num = (uint32_t)param->Groups();
convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.args[i].image.address = input_ptr;
convArgs.args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.args[i].image.height = (uint32_t)input->dims()[2];
convArgs.args[i].image.width = (uint32_t)input->dims()[3];
convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.args[i].filter_num =
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.args[i].image.scale_address =
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input->scale;
}
return true;
}
......
......@@ -23,10 +23,10 @@ namespace operators {
template <>
bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
bool relu_enabled = false;
Tensor *input = const_cast<Tensor *>(param->Input());
auto input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
Tensor *filter = const_cast<Tensor *>(param->Filter());
Tensor *out = param->Output();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -36,10 +36,10 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
float *bs_ptr =
auto bs_ptr =
reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
......@@ -70,8 +70,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num *
sizeof(fpga::ConvArgs));
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
......@@ -79,26 +79,28 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.args[i].relu_enabled = relu_enabled;
convArgs.args[i].group_num = (uint32_t)param->Groups();
convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.args[i].image.address = input_ptr;
convArgs.args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.args[i].image.height = (uint32_t)input->dims()[2];
convArgs.args[i].image.width = (uint32_t)input->dims()[3];
convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.args[i].filter_num =
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.args[i].image.scale_address =
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input->scale;
}
return true;
}
......
......@@ -22,10 +22,10 @@ namespace operators {
template <>
bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
bool relu_enabled = true;
Tensor *input = const_cast<Tensor *>(param->Input());
auto input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
Tensor *filter = const_cast<Tensor *>(param->Filter());
Tensor *out = param->Output();
auto filter = const_cast<Tensor *>(param->Filter());
auto out = param->Output();
auto bn_mean_ptr = param->InputMean()->data<float>();
auto bn_var_ptr = param->InputVariance()->data<float>();
auto bn_scale_ptr = param->InputScale()->data<float>();
......@@ -34,9 +34,9 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
float *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
Tensor *new_scale = new Tensor();
Tensor *new_bias = new Tensor();
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
auto new_bias_ptr = new_bias->mutable_data<float>({channel});
......@@ -67,8 +67,8 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num *
sizeof(fpga::ConvArgs));
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
......@@ -76,26 +76,28 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.args[i].relu_enabled = relu_enabled;
convArgs.args[i].group_num = (uint32_t)param->Groups();
convArgs.args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.args[i].image.address = input_ptr;
convArgs.args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.args[i].image.height = (uint32_t)input->dims()[2];
convArgs.args[i].image.width = (uint32_t)input->dims()[3];
convArgs.args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.args[i].filter_num =
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = (uint32_t)param->Groups();
convArgs.conv_args[i].kernel.stride_h = (uint32_t)param->Strides()[0];
convArgs.conv_args[i].kernel.stride_w = (uint32_t)param->Strides()[1];
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input->dims()[3];
convArgs.conv_args[i].image.pad_height = (uint32_t)param->Paddings()[0];
convArgs.conv_args[i].image.pad_width = (uint32_t)param->Paddings()[1];
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.args[i].image.scale_address =
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input->scale;
}
return true;
}
......
......@@ -20,16 +20,16 @@ namespace operators {
template <>
bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
bool relu_enabled = true;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<float>();
auto *filter = const_cast<Tensor *>(param->InputY());
const Tensor *input_z = param->InputZ();
auto filter = const_cast<Tensor *>(param->InputY());
auto input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>();
Tensor *out = param->Out();
auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = input_z_ptr[i];
......@@ -60,8 +60,8 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num *
sizeof(fpga::ConvArgs));
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
......@@ -69,26 +69,28 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.args[i].relu_enabled = relu_enabled;
convArgs.args[i].group_num = 1;
convArgs.args[i].kernel.stride_h = 1;
convArgs.args[i].kernel.stride_w = 1;
convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.args[i].image.address = input_x_ptr;
convArgs.args[i].image.channels = (uint32_t)input_x->dims()[1];
convArgs.args[i].image.height = (uint32_t)input_x->dims()[2];
convArgs.args[i].image.width = (uint32_t)input_x->dims()[3];
convArgs.args[i].image.pad_height = 0;
convArgs.args[i].image.pad_width = 0;
convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.args[i].filter_num =
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = 1;
convArgs.conv_args[i].kernel.stride_h = 1;
convArgs.conv_args[i].kernel.stride_w = 1;
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_x_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3];
convArgs.conv_args[i].image.pad_height = 0;
convArgs.conv_args[i].image.pad_width = 0;
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.args[i].image.scale_address =
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input_x->scale;
}
return true;
}
......
......@@ -21,17 +21,17 @@ namespace operators {
template <>
bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
bool relu_enabled = false;
auto *input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x = const_cast<LoDTensor *>(param->InputX());
auto input_x_ptr = input_x->data<float>();
auto *filter = const_cast<Tensor *>(param->InputY());
auto filter = const_cast<Tensor *>(param->InputY());
const Tensor *input_z = param->InputZ();
auto input_z_ptr = input_z->data<float>();
Tensor *out = param->Out();
auto out = param->Out();
PADDLE_MOBILE_ENFORCE(input_x->dims()[1] == filter->dims()[0],
"Image channel should be equal to weight number");
int channel = (uint32_t)out->dims()[1];
auto *bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
for (int i = 0; i < channel; i++) {
bs_ptr[i + channel] = 1;
bs_ptr[i] = input_z_ptr[i];
......@@ -61,8 +61,8 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
convArgs.filter_num = (uint32_t)filter->dims()[0];
convArgs.output.address = out_ptr;
convArgs.output.scale_address = out->scale;
convArgs.args = (fpga::ConvArgs *)fpga::fpga_malloc(convArgs.split_num *
sizeof(fpga::ConvArgs));
convArgs.conv_args = (fpga::ConvArgs *)fpga::fpga_malloc(
convArgs.split_num * sizeof(fpga::ConvArgs));
param->SetFpgaArgs(convArgs);
int element_num = fpga::get_aligned_filter_element_num(
......@@ -70,26 +70,28 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
int n = convArgs.split_num;
for (int i = 0; i < n; i++) {
convArgs.args[i].relu_enabled = relu_enabled;
convArgs.args[i].group_num = 1;
convArgs.args[i].kernel.stride_h = 1;
convArgs.args[i].kernel.stride_w = 1;
convArgs.args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.args[i].image.address = input_x_ptr;
convArgs.args[i].image.channels = (uint32_t)input_x->dims()[1];
convArgs.args[i].image.height = (uint32_t)input_x->dims()[2];
convArgs.args[i].image.width = (uint32_t)input_x->dims()[3];
convArgs.args[i].image.pad_height = 0;
convArgs.args[i].image.pad_width = 0;
convArgs.args[i].filter_address = &((int8_t *)filter_ptr)[i * element_num];
convArgs.args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.args[i].filter_num =
convArgs.conv_args[i].relu_enabled = relu_enabled;
convArgs.conv_args[i].group_num = 1;
convArgs.conv_args[i].kernel.stride_h = 1;
convArgs.conv_args[i].kernel.stride_w = 1;
convArgs.conv_args[i].kernel.height = (uint32_t)filter->dims()[2];
convArgs.conv_args[i].kernel.width = (uint32_t)filter->dims()[3];
convArgs.conv_args[i].image.address = input_x_ptr;
convArgs.conv_args[i].image.channels = (uint32_t)input_x->dims()[1];
convArgs.conv_args[i].image.height = (uint32_t)input_x->dims()[2];
convArgs.conv_args[i].image.width = (uint32_t)input_x->dims()[3];
convArgs.conv_args[i].image.pad_height = 0;
convArgs.conv_args[i].image.pad_width = 0;
convArgs.conv_args[i].filter_address =
&((int8_t *)filter_ptr)[i * element_num];
convArgs.conv_args[i].sb_address = &((int8_t *)bs_ptr)[i * element_num];
convArgs.conv_args[i].filter_num =
(uint32_t)(i == n - 1 ? fpga::get_aligned_filter_num(
channel - (n - 1) * element_num_per_div)
: element_num_per_div);
convArgs.args[i].image.scale_address =
convArgs.conv_args[i].output.scale_address =
(float *)fpga::fpga_malloc(2 * sizeof(float));
convArgs.conv_args[i].image.scale_address = input_x->scale;
}
return true;
}
......
......@@ -734,7 +734,7 @@ void InnerKernelWithBnAdd(int mc, int nc, float alpha, const float *a,
#endif
}
}
WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias);
// WriteWithBnAddRelu(mc, nc, c, C, ldc, new_scale, new_bias, bias);
}
void InnerKernelWithPRelu(int mc, int nc, const float *a, const float *b,
......
......@@ -483,6 +483,15 @@ class ConcatParam : public OpParam {
vector<GType *> inputs_;
GType *out_;
int axis_;
#ifdef PADDLE_MOBILE_FPGA
private:
fpga::ConcatArgs fpga_concat_args;
public:
// Read-only access to the FPGA concat arguments held by this param object.
// The returned reference aliases the fpga_concat_args member.
const fpga::ConcatArgs &FpgaArgs() const {
  return this->fpga_concat_args;
}
// Replaces the stored FPGA concat arguments by assigning `args` to the
// fpga_concat_args member.
void SetFpgaArgs(const fpga::ConcatArgs &args) {
  this->fpga_concat_args = args;
}
#endif
};
#endif
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册