Commit 09a678af authored by qnqinan, committed by GitHub

Merge pull request #967 from zhangyang0701/develop

Modify Softmax for the FPGA track; closes #966
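For reviewers, a condensed sketch of the Init-side change, assembled only from the softmax_kernel.cpp hunk below (all fpga:: names are the ones this commit introduces or touches): the fp16 softmax input coming off the FPGA is now bypassed into an aligned fp32 buffer allocated by the new `format_fp32_ofm` helper, and the 2-D input is described to the bypass as a single 1x1 pixel carrying all C channels.

```cpp
// Init-side setup (condensed from the softmax_kernel.cpp hunk in this commit).
auto input = const_cast<Tensor *>(param->InputX());
auto float_input = new Tensor(*input);   // fp32 staging tensor for the CPU softmax
fpga::format_fp32_ofm(float_input);      // new helper: aligned fp32 output buffer

fpga::BypassArgs args;
args.input_data_type = fpga::DATA_TYPE_FP16;     // data coming off the FPGA is half precision
args.output_data_type = fpga::DATA_TYPE_FP32;    // the CPU softmax consumes float
args.image.address = input->data<float>();
args.image.height = 1;                               // the 2-D input is treated as a
args.image.width = 1;                                // single 1 x 1 pixel ...
args.image.channels = (uint32_t)input->dims()[1];    // ... carrying all C channels
args.output.address = float_input->mutable_data<float>();
param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
```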
......@@ -181,10 +181,12 @@ int ComputeFPGAConcat(const struct ConcatArgs &args) {
return 0;
}
int get_align_image_cw(int cw) { return align_to_x(cw, IMAGE_ALIGNMENT); }
void format_image(framework::Tensor *image_tensor) {
auto dims = image_tensor->dims();
auto channel = dims[1], height = dims[2], width = dims[3];
- auto data_ptr = image_tensor->mutable_data<float>();
+ auto data_ptr = image_tensor->data<float>();
size_t memory_size = channel * height * width * sizeof(float);
float *new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
......@@ -192,7 +194,7 @@ void format_image(framework::Tensor *image_tensor) {
image_tensor->reset_data_ptr(new_data);
}
- void format_ofm(framework::Tensor *ofm_tensor) {
+ void format_fp16_ofm(framework::Tensor *ofm_tensor) {
auto dims = ofm_tensor->dims();
size_t memory_size = 0;
if (dims.size() == 4) {
......@@ -209,6 +211,23 @@ void format_ofm(framework::Tensor *ofm_tensor) {
ofm_tensor->reset_data_ptr(p);
}
+ void format_fp32_ofm(framework::Tensor *ofm_tensor) {
+ auto dims = ofm_tensor->dims();
+ size_t memory_size = 0;
+ if (dims.size() == 4) {
+ auto channel = dims[1], height = dims[2], width = dims[3];
+ memory_size =
+ height * align_to_x(channel * width, IMAGE_ALIGNMENT) * sizeof(float);
+ } else if (dims.size() == 2) {
+ memory_size = align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float);
+ } else {
+ DLOG << "Wrong ofm dimension";
+ }
+ auto p = fpga_malloc(memory_size);
+ memset(p, 0, memory_size);
+ ofm_tensor->reset_data_ptr(p);
+ }
float filter_find_max(framework::Tensor *filter_tensor) {
auto filter_ptr = filter_tensor->data<float>();
return filter::find_max(filter_ptr, filter_tensor->numel());
......@@ -242,7 +261,7 @@ void format_filter(framework::Tensor *filter_tensor, float max_value,
int group_num) {
auto dims = filter_tensor->dims();
auto num = dims[0], channel = dims[1], height = dims[2], width = dims[3];
- auto data_ptr = filter_tensor->mutable_data<float>();
+ auto data_ptr = filter_tensor->data<float>();
size_t memory_size = num * channel * height * width * sizeof(float);
auto new_data = (float *)fpga_malloc(memory_size);
fpga_copy(new_data, data_ptr, memory_size);
......@@ -277,7 +296,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
int padding_h, int padding_w, float *bs_ptr) {
auto input_ptr = input->data<float>();
auto filter_ptr = filter->data<float>();
- auto out_ptr = out->mutable_data<float>();
+ auto out_ptr = out->data<float>();
arg->group_num = (uint32_t)group_num;
arg->split_num = (uint32_t)fpga::get_plit_num(filter);
......@@ -300,7 +319,7 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
(uint32_t *)fpga::fpga_malloc(n * sizeof(uint32_t));
arg->concat_arg.image_out = out_ptr;
- const int channel = (int)out->dims()[1];
+ auto channel = (int)out->dims()[1];
int filter_num_per_div = fpga::get_filter_num_per_div(filter, group_num);
int element_num = fpga::get_aligned_filter_element_num(
filter->dims()[1] * filter->dims()[2] * filter->dims()[3]);
......
......@@ -206,8 +206,10 @@ int ComputeFPGAConcat(const struct ConcatArgs& args);
static inline int align_to_x(int num, int x) { return (num + x - 1) / x * x; }
+ int get_align_image_cw(int cw);
void format_image(framework::Tensor* image_tensor);
- void format_ofm(framework::Tensor* ofm_tensor); // only allocate memory
+ void format_fp16_ofm(framework::Tensor* ofm_tensor); // only allocate memory
+ void format_fp32_ofm(framework::Tensor* ofm_tensor);
float filter_find_max(framework::Tensor* filter_tensor);
int get_filter_num_per_div(framework::Tensor* filter_tensor, int group_num);
......
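As a side note on the sizing arithmetic behind the ofm formatters above: `align_to_x` rounds a count up to the next multiple of `x`, and the new `format_fp32_ofm` allocates 4-byte float elements over the aligned count (`format_fp16_ofm` presumably does the same with 2-byte halves). A standalone sketch follows; the IMAGE_ALIGNMENT value of 16 used in it is an assumption for illustration only, not something stated in this diff.

```cpp
#include <cstddef>
#include <cstdio>

// Same rounding rule as align_to_x() in fpga/api.h.
static int align_to_x(int num, int x) { return (num + x - 1) / x * x; }

int main() {
  const int kImageAlignment = 16;  // assumed value; the real constant is IMAGE_ALIGNMENT
  const int channel = 1000;        // e.g. a [1, 1000] classifier output fed to Softmax

  // format_fp32_ofm, dims.size() == 2 branch:
  // align_to_x(dims[1], IMAGE_ALIGNMENT) * sizeof(float)
  const std::size_t fp32_bytes = align_to_x(channel, kImageAlignment) * sizeof(float);
  std::printf("aligned channels = %d, fp32 ofm buffer = %zu bytes\n",
              align_to_x(channel, kImageAlignment), fp32_bytes);  // 1008 channels, 4032 bytes
  return 0;
}
```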
......@@ -45,7 +45,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
void Init() {
Tensor *output = param_.Out();
- fpga::format_ofm(output);
+ fpga::format_fp16_ofm(output);
}
void RunImpl() const {
......@@ -53,7 +53,7 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
auto input_ptr = input->data<float>();
fpga::format_image(input);
Tensor *output = param_.Out();
- auto output_ptr = output->mutable_data<half>();
+ auto output_ptr = output->data<half>();
fpga::BypassArgs args;
......@@ -62,9 +62,9 @@ class FeedOp : public framework::OperatorBase<DeviceType> {
args.input_layout_type = fpga::LAYOUT_CHW;
args.output_layout_type = fpga::LAYOUT_HWC;
args.image.address = (void *)input_ptr;
- args.image.channels = input->dims()[1];
- args.image.height = input->dims()[2];
- args.image.width = input->dims()[3];
+ args.image.channels = (uint32_t)input->dims()[1];
+ args.image.height = (uint32_t)input->dims()[2];
+ args.image.width = (uint32_t)input->dims()[3];
args.image.pad_height = 0;
args.image.pad_width = 0;
args.output.address = output_ptr;
......
......@@ -64,7 +64,7 @@ bool ConvAddBNKernel<FPGA, float>::Init(FusionConvAddBNParam<FPGA> *param) {
int element_num_per_div =
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
- fpga::format_ofm(out);
+ fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg;
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
......
......@@ -62,7 +62,7 @@ bool ConvAddBNReluKernel<FPGA, float>::Init(
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
- fpga::format_ofm(out);
+ fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg;
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
......
......@@ -44,7 +44,7 @@ bool ConvAddReluKernel<FPGA, float>::Init(FusionConvAddReluParam<FPGA> *param) {
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
- fpga::format_ofm(out);
+ fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg;
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
......
......@@ -15,7 +15,6 @@ limitations under the License. */
#ifdef FUSION_CONVBN_OP
#include "operators/kernel/conv_bn_kernel.h"
#include "fpga/api.h"
namespace paddle_mobile {
namespace operators {
......@@ -33,10 +32,8 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
const float epsilon = param->Epsilon();
PADDLE_MOBILE_ENFORCE(out->dims()[1] == param->InputBias()->dims()[0],
"Output channel should be equal to bias number");
const int channel = out->dims()[1];
- auto bs_ptr =
- reinterpret_cast<float *>(fpga::fpga_malloc(2 * channel * sizeof(float)));
+ auto bs_ptr = (float *)fpga::fpga_malloc(2 * channel * sizeof(float));
auto new_scale = new Tensor();
auto new_bias = new Tensor();
auto new_scale_ptr = new_scale->mutable_data<float>({channel});
......@@ -59,7 +56,7 @@ bool ConvBNKernel<FPGA, float>::Init(FusionConvBNParam<FPGA> *param) {
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
- fpga::format_ofm(out);
+ fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg;
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
......
......@@ -56,7 +56,7 @@ bool ConvBNReluKernel<FPGA, float>::Init(FusionConvBNReluParam<FPGA> *param) {
fpga::get_filter_num_per_div(filter, param->Groups());
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
- fpga::format_ofm(out);
+ fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg;
fpga::fill_conv_arg(&conv_arg, input, out, filter, relu_enabled,
......
......@@ -27,7 +27,7 @@ bool ElementwiseAddReluKernel<FPGA, float>::Init(
auto *out = param->Out();
auto input_x_ptr = input_x->data<float>();
auto input_y_ptr = input_y->data<float>();
- fpga::format_ofm(out);
+ fpga::format_fp16_ofm(out);
auto out_ptr = out->mutable_data<float>();
fpga::EWAddArgs ewaddArgs;
......
......@@ -49,7 +49,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
- fpga::format_ofm(out);
+ fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg;
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
......
......@@ -50,7 +50,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
int element_num_per_div = fpga::get_filter_num_per_div(filter, 1);
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, channel);
- fpga::format_ofm(out);
+ fpga::format_fp16_ofm(out);
fpga::WrapperConvArgs conv_arg;
fpga::fill_conv_arg(&conv_arg, input_x, out, filter, relu_enabled, 1, 1, 1, 0,
......
......@@ -24,7 +24,7 @@ bool PoolKernel<FPGA, float>::Init(PoolParam<FPGA> *param) {
auto *input = const_cast<Tensor *>(param->Input());
auto input_ptr = input->data<float>();
Tensor *output = param->Output();
- fpga::format_ofm(output);
+ fpga::format_fp16_ofm(output);
auto output_ptr = output->mutable_data<float>();
vector<int> ksize = param->Ksize();
vector<int> strides = param->Strides();
......
......@@ -24,22 +24,23 @@ namespace operators {
template <>
bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
- const Tensor *input = param->InputX();
+ auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>();
auto output_ptr = param->Out();
- Tensor *floatInput = new Tensor(*input);
+ auto float_input = new Tensor(*input);
+ fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args;
args.input_layout_type = fpga::LAYOUT_HWC;
args.output_layout_type = fpga::LAYOUT_CHW;
args.input_data_type = fpga::DATA_TYPE_FP16;
args.output_data_type = fpga::DATA_TYPE_FP32;
- args.image.address = (void *)(input_ptr);
- args.image.height = (uint32_t)input->dims()[0];
- args.image.width = (uint32_t)input->dims()[1];
- args.image.channels = 1;
- args.output.address = (void *)floatInput->mutable_data<float>();
+ args.image.address = input_ptr;
+ args.image.height = 1;
+ args.image.width = 1;
+ args.image.channels = (uint32_t)input->dims()[1];
+ args.output.address = float_input->mutable_data<float>();
- param->SetFloatInput(floatInput);
+ param->SetFloatInput(float_input);
param->SetFpgaArgs(args);
return true;
}
......@@ -47,17 +48,16 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
template <>
void SoftmaxKernel<FPGA, float>::Compute(
const SoftmaxParam<FPGA> &param) const {
DLOG << "======================================= FPGA SoftMAX "
"===============================================";
const Tensor *in_x = param.FloatInput();
Tensor *in_x = param.FloatInput();
Tensor *out = param.Out();
+ fpga::fpga_flush((void *)in_x->data<float>(), in_x->memory_size());
fpga::PerformBypass(param.FpgaArgs());
- fpga::fpga_invalidate(out->data<float>(), out->memory_size());
+ fpga::fpga_invalidate(
+ (void *)in_x->data<float>(),
+ (size_t)fpga::get_align_image_cw((int)in_x->dims()[1]) * sizeof(float));
auto x_dims = in_x->dims();
out->Resize(x_dims);
math::SoftmaxFuntor<CPU, float>()(in_x, out);
+ fpga::fpga_flush(out->data<float>(), out->memory_size());
}
} // namespace operators
......
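On the Compute side, the kernel now wraps the bypass in explicit cache maintenance because the CPU and FPGA share these buffers. The following is a condensed sketch using only the calls visible in the hunk above, with a plausible reading of why each step is there; it is not authoritative documentation of the FPGA runtime.

```cpp
// Condensed from SoftmaxKernel<FPGA, float>::Compute above.
Tensor *in_x = param.FloatInput();   // fp32 staging tensor filled by the bypass
Tensor *out = param.Out();

fpga::fpga_flush((void *)in_x->data<float>(), in_x->memory_size());  // clean the staging buffer before the FPGA writes it
fpga::PerformBypass(param.FpgaArgs());                               // fp16 -> fp32 conversion on the FPGA
fpga::fpga_invalidate(                                               // drop stale CPU cache lines over the converted row
    (void *)in_x->data<float>(),
    (size_t)fpga::get_align_image_cw((int)in_x->dims()[1]) * sizeof(float));

out->Resize(in_x->dims());
math::SoftmaxFuntor<CPU, float>()(in_x, out);                        // softmax itself runs on the CPU
fpga::fpga_flush(out->data<float>(), out->memory_size());            // write the result back for downstream FPGA ops
```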
......@@ -71,7 +71,7 @@ void test_fill_conv_arg() {
fpga::format_bias_scale_array(&bs_ptr, element_num_per_div, 1001);
DLOG << "format ofm";
fpga::format_ofm(&out);
fpga::format_fp16_ofm(&out);
DLOG << "Build arg";
fpga::WrapperConvArgs arg;
......