未验证 提交 c4bc8f09 编写于 作者: R Ray Liu 提交者: GitHub

Merge branch 'develop' into develop

...@@ -22,7 +22,7 @@ limitations under the License. */ ...@@ -22,7 +22,7 @@ limitations under the License. */
#include "fpga/filter.h" #include "fpga/filter.h"
#include "fpga/image.h" #include "fpga/image.h"
#define FPGA_TEST_MODE #define FPGA_TEST_MODE
#define PADDLE_MOBILE_OS_LINUX //#define PADDLE_MOBILE_OS_LINUX
namespace paddle_mobile { namespace paddle_mobile {
namespace fpga { namespace fpga {
...@@ -125,6 +125,7 @@ float fp16_2_fp32(half fp16_num) { ...@@ -125,6 +125,7 @@ float fp16_2_fp32(half fp16_num) {
} }
int ComputeBasicConv(const struct ConvArgs &args) { int ComputeBasicConv(const struct ConvArgs &args) {
#ifdef FPGA_TEST_MODE
DLOG << "======Compute Basic Conv======"; DLOG << "======Compute Basic Conv======";
DLOG << " relu_enabled:" << args.relu_enabled DLOG << " relu_enabled:" << args.relu_enabled
<< " sb_address:" << args.sb_address << " sb_address:" << args.sb_address
...@@ -144,7 +145,7 @@ int ComputeBasicConv(const struct ConvArgs &args) { ...@@ -144,7 +145,7 @@ int ComputeBasicConv(const struct ConvArgs &args) {
<< " stride_w:" << args.kernel.stride_w; << " stride_w:" << args.kernel.stride_w;
DLOG << " out_address:" << args.output.address DLOG << " out_address:" << args.output.address
<< " out_scale_address:" << args.output.scale_address; << " out_scale_address:" << args.output.scale_address;
#endif
return do_ioctl(IOCTL_CONFIG_CONV, &args); return do_ioctl(IOCTL_CONFIG_CONV, &args);
} }
...@@ -192,8 +193,9 @@ int ComputeFpgaPool(const struct PoolingArgs &args) { ...@@ -192,8 +193,9 @@ int ComputeFpgaPool(const struct PoolingArgs &args) {
int ComputeFpgaEWAdd(const struct EWAddArgs &args) { int ComputeFpgaEWAdd(const struct EWAddArgs &args) {
#ifdef FPGA_TEST_MODE #ifdef FPGA_TEST_MODE
DLOG << "=============ComputeFpgaEWAdd==========="; DLOG << "=============ComputeFpgaEWAdd===========";
DLOG << " relu_enabled:" << args.relu_enabled << " const0:" << args.const0 DLOG << " relu_enabled:" << args.relu_enabled
<< " const1:" << args.const1; << " const0:" << fp16_2_fp32(short(args.const0))
<< " const1:" << fp16_2_fp32(short(args.const1));
DLOG << " image0_address:" << args.image0.address DLOG << " image0_address:" << args.image0.address
<< " image0_scale_address:" << args.image0.scale_address << " image0_scale_address:" << args.image0.scale_address
<< " image0_channels:" << args.image0.channels << " image0_channels:" << args.image0.channels
...@@ -401,8 +403,8 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, ...@@ -401,8 +403,8 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
arg->concat_arg.image_num = arg->split_num; arg->concat_arg.image_num = arg->split_num;
arg->concat_arg.image_out = out_ptr; arg->concat_arg.image_out = out_ptr;
arg->concat_arg.scale_out = out->scale; arg->concat_arg.scale_out = out->scale;
arg->concat_arg.height = (uint32_t)filter->dims()[2]; arg->concat_arg.height = (uint32_t)out->dims()[2];
arg->concat_arg.width = (uint32_t)filter->dims()[3]; arg->concat_arg.width = (uint32_t)out->dims()[3];
int n = arg->split_num; int n = arg->split_num;
arg->concat_arg.images_in = arg->concat_arg.images_in =
...@@ -411,7 +413,6 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input, ...@@ -411,7 +413,6 @@ void fill_conv_arg(struct WrapperConvArgs *arg, framework::Tensor *input,
(float **)fpga_malloc(n * sizeof(float *)); // NOLINT (float **)fpga_malloc(n * sizeof(float *)); // NOLINT
arg->concat_arg.channel_num = arg->concat_arg.channel_num =
(uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT (uint32_t *)fpga_malloc(n * sizeof(uint32_t)); // NOLINT
arg->concat_arg.image_out = out_ptr;
auto channel = (int)out->dims()[1]; // NOLINT auto channel = (int)out->dims()[1]; // NOLINT
int filter_num_per_div = get_filter_num_per_div(filter, group_num); int filter_num_per_div = get_filter_num_per_div(filter, group_num);
......
...@@ -27,6 +27,9 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) { ...@@ -27,6 +27,9 @@ void align_element(float **data_in, int num_per_div_before_alignment, int num) {
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_per_div_after_alignment = int num_per_div_after_alignment =
align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT); align_to_x(num_per_div_before_alignment, BS_NUM_ALIGNMENT);
if (num_per_div_before_alignment == num_per_div_after_alignment) {
return;
}
int num_element = int num_element =
2 * div_num * num_per_div_after_alignment; // including bias & scale 2 * div_num * num_per_div_after_alignment; // including bias & scale
float *ptr_aligned = float *ptr_aligned =
......
...@@ -210,12 +210,12 @@ void format_filter(float **data_in, int num, int channel, int height, int width, ...@@ -210,12 +210,12 @@ void format_filter(float **data_in, int num, int channel, int height, int width,
align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT); align_to_x(num_per_div_before_alignment, FILTER_NUM_ALIGNMENT);
int div_num = int div_num =
(num + num_per_div_before_alignment - 1) / num_per_div_before_alignment; (num + num_per_div_before_alignment - 1) / num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment * div_num; int residual = num % num_per_div_before_alignment;
int num_after_alignment = num_per_div_after_alignment *
((residual == 0) ? div_num : (div_num - 1)) +
align_to_x(residual, FILTER_NUM_ALIGNMENT);
quantize(data_in, data_size, max); quantize(data_in, data_size, max);
char **quantize_data = (char **)data_in; // NOLINT char **quantize_data = (char **)data_in; // NOLINT
convert_to_hwc(quantize_data, num, channel, height, width); convert_to_hwc(quantize_data, num, channel, height, width);
align_element(quantize_data, num, chw); align_element(quantize_data, num, chw);
align_num(quantize_data, num_per_div_before_alignment, num, chw); align_num(quantize_data, num_per_div_before_alignment, num, chw);
......
...@@ -44,6 +44,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) { ...@@ -44,6 +44,7 @@ bool FusionFcReluKernel<FPGA, float>::Init(FusionFcReluParam<FPGA> *param) {
int width = (uint32_t)input_x->dims()[3]; int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width; int filter_channel = chw / height / width;
out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width})); filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter); float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value); fpga::format_fc_filter(filter, max_value);
......
...@@ -45,6 +45,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) { ...@@ -45,6 +45,7 @@ bool FusionFcKernel<FPGA, float>::Init(FusionFcParam<FPGA> *param) {
int width = (uint32_t)input_x->dims()[3]; int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width; int filter_channel = chw / height / width;
out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width})); filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter); float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value); fpga::format_fc_filter(filter, max_value);
......
...@@ -44,6 +44,7 @@ bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) { ...@@ -44,6 +44,7 @@ bool MulKernel<FPGA, float>::Init(MulParam<FPGA> *param) {
int width = (uint32_t)input_x->dims()[3]; int width = (uint32_t)input_x->dims()[3];
int filter_channel = chw / height / width; int filter_channel = chw / height / width;
out->Resize(framework::make_ddim({1, channel, 1, 1}));
filter->Resize(framework::make_ddim({num, filter_channel, height, width})); filter->Resize(framework::make_ddim({num, filter_channel, height, width}));
float max_value = fpga::filter_find_max(filter); float max_value = fpga::filter_find_max(filter);
fpga::format_fc_filter(filter, max_value); fpga::format_fc_filter(filter, max_value);
......
...@@ -27,7 +27,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) { ...@@ -27,7 +27,7 @@ bool SoftmaxKernel<FPGA, float>::Init(SoftmaxParam<FPGA> *param) {
auto input = const_cast<Tensor *>(param->InputX()); auto input = const_cast<Tensor *>(param->InputX());
auto input_ptr = input->data<float>(); auto input_ptr = input->data<float>();
auto float_input = new Tensor; auto float_input = new Tensor;
float_input->mutable_data<float>(input->dims()); float_input->mutable_data<float>({1, input->dims()[1]});
fpga::format_fp32_ofm(float_input); fpga::format_fp32_ofm(float_input);
fpga::BypassArgs args = {fpga::DATA_TYPE_FP16}; fpga::BypassArgs args = {fpga::DATA_TYPE_FP16};
...@@ -56,7 +56,6 @@ void SoftmaxKernel<FPGA, float>::Compute( ...@@ -56,7 +56,6 @@ void SoftmaxKernel<FPGA, float>::Compute(
fpga::fpga_invalidate( fpga::fpga_invalidate(
(void *)in_x->data<float>(), // NOLINT (void *)in_x->data<float>(), // NOLINT
fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float)); fpga::get_align_image_cw(in_x->dims()[1]) * sizeof(float));
math::SoftmaxFuntor<CPU, float>()(in_x, out); math::SoftmaxFuntor<CPU, float>()(in_x, out);
fpga::fpga_flush(out->data<float>(), out->memory_size()); fpga::fpga_flush(out->data<float>(), out->memory_size());
} }
......
Markdown is supported
0% .
You are about to add 0 people to the discussion. Proceed with caution.
先完成此消息的编辑!
想要评论请 注册